Esempio n. 1
0
    def _embeds_for_init_strategy(self, strategy):
        """Build a joint-model data handler configured with the given
        embedding init strategy and return its pretrained embedding weights.

        Extracted to remove the duplicated 20-line setup that previously
        appeared once per strategy in the test below.
        """
        feature_config = FeatureConfig(
            word_feat=WordFeatConfig(
                embedding_init_strategy=strategy,
                embed_dim=5,
                pretrained_embeddings_path=tests_module.TEST_BASE_DIR,
            )
        )
        data_handler = JointModelDataHandler.from_config(
            JointModelDataHandler.Config(),
            feature_config,
            [DocLabelConfig(), WordLabelConfig()],
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(), feature_config
            ),
        )
        data_handler.init_metadata_from_path(TRAIN_FILE, EVAL_FILE, TEST_FILE)
        return data_handler.metadata.features[
            DatasetFieldName.TEXT_FIELD
        ].pretrained_embeds_weight

    def test_intializing_embeds_from_config(self):
        """Embedding rows for tokens absent from the pretrained file should
        follow the configured init strategy: RANDOM produces non-zero values,
        ZERO produces all zeros.
        """
        pretrained_embeds = self._embeds_for_init_strategy(EmbedInitStrategy.RANDOM)
        # test random initialization (values should be non-0)
        np.testing.assert_array_less(
            [0, 0, 0, 0, 0], np.absolute(pretrained_embeds[11].numpy())
        )

        pretrained_embeds = self._embeds_for_init_strategy(EmbedInitStrategy.ZERO)
        # test zero initialization (values should all be 0)
        np.testing.assert_array_equal([0, 0, 0, 0, 0], pretrained_embeds[11].numpy())
Esempio n. 2
0
    def _create_dummy_data_handler(self):
        """Create a doc-classification data handler over a one-row dummy
        dataset (single "<pad>" token) with feature metadata initialized.
        """
        word_feat_config = WordFeatConfig(
            vocab_size=4,
            vocab_from_all_data=True,
            vocab_from_train_data=True,
            vocab_from_pretrained_embeddings=False,
            pretrained_embeddings_path=None,
        )
        data_handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=word_feat_config),
            TargetConfig(),
            featurizer=create_featurizer(
                SimpleFeaturizer.Config(), FeatureConfig(word_feat=word_feat_config)
            ),
        )
        # Identical one-row dataset for train / eval / test.
        datasets = [
            data_handler.gen_dataset([{"text": "<pad>"}], include_label_fields=False)
            for _ in range(3)
        ]
        data_handler.init_feature_metadata(*datasets)

        return data_handler
    def test_split_with_regex(self):
        """Tokenization with a custom split regex should treat punctuation
        runs as single separators and drop them from the token stream.
        """
        featurizer = SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(split_regex=r"[\s,;!.?\"\(\)\-]+"),
            FeatureConfig(),
        )

        def check(raw, expected_tokens):
            actual = featurizer.featurize(InputRecord(raw_text=raw)).tokens
            self.assertListEqual(expected_tokens, actual)

        sentence = """
            Your bones don't break, mine do. That's clear. Your cells react to
            bacteria and viruses differently than mine. You don't get sick,
            I do. That's also clear. But for some reason, you and I react the
            exact same way to water. We swallow it too fast, we choke. We get
            some in our lungs, we drown. However unreal it may seem, we are
            connected, you and I. We're on the same curve, just on opposite
            ends.
        """
        expected = """
            your bones don't break mine do that's clear your cells react to
            bacteria and viruses differently than mine you don't get sick
            i do that's also clear but for some reason you and i react the
            exact same way to water we swallow it too fast we choke we get
            some in our lungs we drown however unreal it may seem we are
            connected you and i we're on the same curve just on opposite ends
        """.split()
        check(sentence, expected)

        check(
            '"Please, buy me a coffee?" He implored-in vain.',
            "please buy me a coffee he implored in vain".split(),
        )
Esempio n. 4
0
    def test_read_partially_from_csv(self):
        """Reading a TSV with an explicit column map should yield rows keyed
        by exactly the mapped DFColumn names.
        """
        file_name = tests_module.test_file("train_data_tiny.tsv")
        columns = {DFColumn.DOC_LABEL: 0, DFColumn.UTTERANCE: 2}

        word_feat = WordFeatConfig(
            vocab_from_all_data=True,
            vocab_from_train_data=False,
            vocab_from_pretrained_embeddings=False,
        )
        data_handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=word_feat),
            TargetConfig(),
            featurizer=create_featurizer(
                SimpleFeaturizer.Config(), FeatureConfig(word_feat=word_feat)
            ),
        )
        rows = list(data_handler.read_from_file(file_name, columns))
        first_row = rows[0]
        for col in columns:
            self.assertTrue(col in first_row, "{} must in the data".format(col))
        self.assertEqual("alarm/modify_alarm", first_row[DFColumn.DOC_LABEL])
        self.assertEqual(
            "change my alarm tomorrow to wake me up 30 minutes earlier",
            first_row[DFColumn.UTTERANCE],
        )
Esempio n. 5
0
    def test_tokenize(self):
        """Default featurization lowercases text and splits on whitespace,
        keeping intra-word apostrophes intact."""
        featurizer = SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), FeatureConfig()
        )
        result = featurizer.featurize(InputRecord(raw_text="At eight o'clock"))
        self.assertEqual(["at", "eight", "o'clock"], result.tokens)
Esempio n. 6
0
 def test_convert_to_bytes(self):
     """With convert_to_bytes=True every character of the (non-lowercased)
     sentence — spaces included — becomes its own token."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(convert_to_bytes=True, lowercase_tokens=False),
         FeatureConfig(),
     )
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     # One token per character; equivalent to the spelled-out literal list.
     self.assertListEqual(result.tokens, list("Order me a coffee"))
Esempio n. 7
0
 def test_uppercase_tokens(self):
     """
     Test that the text is not lower-cased when lowercase_tokens is False.
     """
     custom_dh = CompositionalDataHandler.from_config(
         CompositionalDataHandler.Config(),
         FeatureConfig(
             word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
         ),
         featurizer=SimpleFeaturizer.from_config(
             SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
         ),
     )
     custom_dh.init_metadata_from_raw_data(
         self.train_data, self.eval_data, self.test_data
     )
     # Mixed-case entries ("What", "EVENTS", "Are", ...) prove no lowercasing.
     expected_vocab = {
         "<unk>",
         "What",
         "EVENTS",
         "can",
         "I",
         "go",
         "today",
         "Are",
         "there",
         "any",
         "adult",
         "events",
         "this",
         "weekend",
     }
     self.assertSetEqual(
         set(custom_dh.features["word_feat"].vocab.stoi), expected_vocab
     )
 def setUp(self):
     """Create a joint-model data handler with all-default configs."""
     default_featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(), FeatureConfig()
     )
     self.data_handler = JointModelDataHandler.from_config(
         JointModelDataHandler.Config(),
         FeatureConfig(),
         [DocLabelConfig(), WordLabelConfig()],
         featurizer=default_featurizer,
     )
 def test_tokenize_add_sentence_markers(self):
     """Configured sentence markers should bracket the token stream."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(sentence_markers=("<s>", "</s>")),
         FeatureConfig(),
     )
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     self.assertListEqual(
         result.tokens, ["<s>", "order", "me", "a", "coffee", "</s>"]
     )
Esempio n. 10
0
 class Config(ConfigBase):
     """Configuration schema for this task: features, featurization, data
     handling, training, optimization, scheduling, and optional export.

     NOTE(review): data_handler has no default, so it must always be
     supplied explicitly when this config is instantiated.
     """
     # Input feature configuration (defaults to FeatureConfig defaults).
     features: FeatureConfig = FeatureConfig()
     # Raw-text featurizer; defaults to the simple whitespace tokenizer.
     featurizer: Featurizer.Config = SimpleFeaturizer.Config()
     # Required — see class NOTE above.
     data_handler: DataHandler.Config
     # Training-loop settings.
     trainer: Trainer.Config = Trainer.Config()
     # Optimizer defaults to Adam.
     optimizer: Optimizer.Config = Adam.Config()
     # LR scheduler is enabled by default but may be set to None.
     scheduler: Optional[Scheduler.Config] = Scheduler.Config()
     # Model exporter; None disables export.
     exporter: Optional[ModelExporter.Config] = None
Esempio n. 11
0
 def test_tokenize_dont_lowercase(self):
     """With lowercase_tokens=False, both the tokens and their per-character
     decompositions keep the original casing."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
     )
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     wanted_tokens = ["Order", "me", "a", "coffee"]
     self.assertListEqual(result.tokens, wanted_tokens)
     self.assertListEqual(result.characters, [list(t) for t in wanted_tokens])
 def setUp(self):
     """Build a doc-classification data handler whose reader also expects
     the dense-feature column.
     """
     handler_config = DocClassificationDataHandler.Config()
     handler_config.columns_to_read.append(ModelInput.DENSE_FEAT)
     self.data_handler = DocClassificationDataHandler.from_config(
         # BUG FIX: the customized handler_config was previously built and
         # then discarded — a fresh Config() was passed instead, so the
         # DENSE_FEAT column was never actually read.
         handler_config,
         ModelInputConfig(),
         [],
         featurizer=SimpleFeaturizer.from_config(
             SimpleFeaturizer.Config(), FeatureConfig()
         ),
     )
Esempio n. 13
0
 def _init_data_handler(self):
     """Build an unshuffled language-model data handler with metadata
     initialized from the shared test file."""
     handler = LanguageModelDataHandler.from_config(
         LanguageModelDataHandler.Config(),
         FeatureConfig(),
         WordLabelConfig(),
         featurizer=create_featurizer(SimpleFeaturizer.Config(), FeatureConfig()),
         shuffle=False,
     )
     # Same file doubles as train / eval / test in this tiny fixture.
     handler.init_metadata_from_path(FILE_NAME, FILE_NAME, FILE_NAME)
     return handler
Esempio n. 14
0
 def test_convert_to_bytes(self):
     """Byte-mode featurization: each character (spaces included) becomes a
     token, and each token's character list is that one character."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(convert_to_bytes=True, lowercase_tokens=False),
         FeatureConfig(),
     )
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     wanted = list("Order me a coffee")
     self.assertListEqual(result.tokens, wanted)
     self.assertListEqual(result.characters, [list(c) for c in wanted])
    def setUp(self):
        """Read the tiny contextual intent/slot fixture through a freshly
        built data handler; rows are exposed via self.data."""
        file_name = tests_module.test_file("contextual_intent_slot_train_tiny.tsv")
        # NOTE(review): SimpleFeaturizer is constructed directly here rather
        # than via from_config, unlike most sibling tests — confirm intended.
        self.dh = ContextualIntentSlotModelDataHandler.from_config(
            ContextualIntentSlotModelDataHandler.Config(),
            ModelInputConfig(),
            [DocLabelConfig(), WordLabelConfig()],
            featurizer=SimpleFeaturizer(
                SimpleFeaturizer.Config(), ModelInputConfig()
            ),
        )

        self.data = self.dh.read_from_file(file_name, self.dh.raw_columns)
    def setUp(self):
        """Pairwise-ranking data handler configured for byte-level
        featurization (empty split regex + convert_to_bytes)."""
        # Equivalent to building a default Config and mutating its fields.
        featurizer_config = SimpleFeaturizer.Config(
            split_regex=r"", convert_to_bytes=True
        )

        self.data_handler = QueryDocumentPairwiseRankingDataHandler.from_config(
            QueryDocumentPairwiseRankingDataHandler.Config(),
            ModelInputConfig(),
            [],
            featurizer=SimpleFeaturizer.from_config(
                featurizer_config, FeatureConfig()
            ),
        )
Esempio n. 17
0
    def create_language_model_data_handler(cls) -> LanguageModelDataHandler:
        """Build a minimal LM data handler directly, bypassing from_config.

        TODO: Refactor this after Shicong refactors PyText config and removes
        Thrift. After that directly use Data Handler's from config method
        with synthetic configs.
        """
        text_field = TextFeatureField(
            eos_token=VocabMeta.EOS_TOKEN, init_token=VocabMeta.INIT_TOKEN
        )

        return LanguageModelDataHandler(
            raw_columns=[DFColumn.UTTERANCE],
            features={DatasetFieldName.TEXT_FIELD: text_field},
            labels={},
            featurizer=create_featurizer(SimpleFeaturizer.Config(), FeatureConfig()),
        )
Esempio n. 18
0
 def setUp(self):
     """Knowledge-distillation data handler reading soft-target columns
     from the tiny KD test file; rows are exposed via self.data."""
     file_name = tests_module.test_file("knowledge_distillation_test_tiny.tsv")
     # Same settings the original passed via **dict splats, spelled inline.
     handler_config = KDDocClassificationDataHandler.Config(
         columns_to_read=["text", "target_probs", "target_labels", "doc_label"]
     )
     self.data_handler = KDDocClassificationDataHandler.from_config(
         handler_config,
         ModelInputConfig(),
         TargetConfig(target_prob=True),
         featurizer=SimpleFeaturizer.from_config(
             SimpleFeaturizer.Config(), FeatureConfig()
         ),
     )
     self.data = self.data_handler.read_from_file(
         file_name, self.data_handler.raw_columns
     )
Esempio n. 19
0
    def test_read_file_with_dense_features(self):
        """When the DENSE column is appended to columns_to_read, the raw
        dense-feature string should come through unparsed."""
        config = ContextualIntentSlotModelDataHandler.Config()
        config.columns_to_read.append(ModelInput.DENSE)
        dense_file_name = tests_module.test_file(
            "contextual_intent_slot_train_tiny_dense.tsv"
        )
        data_handler = ContextualIntentSlotModelDataHandler.from_config(
            config,
            ModelInputConfig(),
            [DocLabelConfig(), WordLabelConfig()],
            featurizer=SimpleFeaturizer(
                SimpleFeaturizer.Config(), ModelInputConfig()
            ),
        )

        rows = list(
            data_handler.read_from_file(dense_file_name, data_handler.raw_columns)
        )
        self.assertEqual(rows[0][ModelInput.DENSE], "[0,1,2,3,4]")
Esempio n. 20
0
    def setup_data(self):
        """Initialize the pairwise-ranking handler (byte-level featurization,
        shuffling disabled) and its metadata from the tiny ranking file."""
        # Equivalent to building a default Config and mutating its fields.
        featurizer_config = SimpleFeaturizer.Config(
            split_regex=r"", convert_to_bytes=True
        )

        self.data_handler = QueryDocumentPairwiseRankingDataHandler.from_config(
            QueryDocumentPairwiseRankingDataHandler.Config(),
            ModelInputConfig(),
            [],
            featurizer=SimpleFeaturizer.from_config(
                featurizer_config, FeatureConfig()
            ),
        )
        self.file_name = tests_module.test_file(
            "query_document_pairwise_ranking_tiny.tsv"
        )
        self.data_handler.shuffle = False
        self.data_handler.init_metadata_from_path(
            self.file_name, self.file_name, self.file_name
        )
 def test_min_freq(self):
     """
     Test that UNKification is triggered when min_freq is 2.
     """
     custom_dh = CompositionalDataHandler.from_config(
         CompositionalDataHandler.Config(),
         FeatureConfig(
             word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=2)
         ),
         featurizer=SimpleFeaturizer.from_config(
             SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
         ),
     )
     custom_dh.init_metadata_from_raw_data(
         self.train_data, self.eval_data, self.test_data
     )
     # Tokens below min_freq collapse to <unk> (<unk>-NUM for numerics).
     # FIX: the expected set literal previously listed "<unk>" three times;
     # duplicate elements in a set literal are redundant and misleading —
     # the deduplicated set is identical at runtime.
     self.assertSetEqual(
         set(custom_dh.features["word_feat"].vocab.stoi),
         {"<unk>", "<unk>-NUM", "events"},
     )
Esempio n. 22
0
    def test_data_handler(self):
        """BPTT LM batching with bptt_len=4: verify batch count, the input
        sequences, their lengths, and the shifted target sequences."""
        data_handler = BPTTLanguageModelDataHandler.from_config(
            BPTTLanguageModelDataHandler.Config(bptt_len=4),
            FeatureConfig(),
            WordLabelConfig(),
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(), FeatureConfig()
            ),
        )
        data_handler.init_metadata_from_path(FILE_NAME, FILE_NAME, FILE_NAME)

        train_iter = data_handler.get_train_iter_from_path(FILE_NAME, BATCH_SIZE)
        batches = list(train_iter)
        # There are two batches in the tiny dataset
        self.assertEqual(len(batches), 2)

        # Each batch is tuple(input, target, context); input is
        # tuple(input_sequences, sequence_length) where input_sequences has
        # dim (bsize, max_seq_length) and sequence_length has dim (bsize).
        first_input, first_target = batches[0][0], batches[0][1]
        np.testing.assert_array_equal(
            first_input[0],
            [[15, 19, 12, 16], [3, 13, 21, 8], [20, 7, 23, 4], [6, 5, 7, 22]],
        )
        np.testing.assert_array_equal(first_input[1], [4, 4, 4, 4])
        # target matches input_sequences' dim (bsize, max_seq_length)
        np.testing.assert_array_equal(
            first_target[0],
            [[19, 12, 16, 14], [13, 21, 8, 3], [7, 23, 4, 3], [5, 7, 22, 10]],
        )

        second_input, second_target = batches[1][0], batches[1][1]
        np.testing.assert_array_equal(
            second_input[0], [[14, 17, 11], [3, 5, 18], [3, 8, 4], [10, 4, 9]]
        )
        np.testing.assert_array_equal(second_input[1], [3, 3, 3, 3])
        np.testing.assert_array_equal(
            second_target[0], [[17, 11, 4], [5, 18, 6], [8, 4, 3], [4, 9, 1]]
        )
Esempio n. 23
0
    def setUp(self):
        """Tiny seq-model fixture: each row has a doc label plus a JSON list
        of utterances; a default-config data handler is built over it."""
        self.train_data = [
            {
                DFColumn.DOC_LABEL: "cu:discuss_where",
                DFColumn.UTTERANCE: '["where do you wanna meet?", "MPK"]',
            }
        ]

        self.eval_data = [
            {
                DFColumn.DOC_LABEL: "cu:discuss_where",
                DFColumn.UTTERANCE: '["how about SF?", "sounds good"]',
            },
            {
                DFColumn.DOC_LABEL: "cu:other",
                DFColumn.UTTERANCE: '["lol"]',
            },
        ]

        self.test_data = [
            {
                DFColumn.DOC_LABEL: "cu:discuss_where",
                DFColumn.UTTERANCE: '["MPK sounds good to me"]',
            },
            {
                DFColumn.DOC_LABEL: "cu:other",
                DFColumn.UTTERANCE: '["great", "awesome"]',
            },
        ]

        self.dh = SeqModelDataHandler.from_config(
            SeqModelDataHandler.Config(),
            FeatureConfig(),
            DocLabelConfig(),
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(), FeatureConfig()
            ),
        )
Esempio n. 24
0
    def test_init_feature_metadata(self):
        """Table-driven check of word-feature vocab construction.

        Covers three vocab sources: (1) all datasets, (2) train data plus
        pretrained embeddings, (3) pretrained embeddings only with a capped
        vocab_size. For each case, verifies the resulting token set and the
        row count of the pretrained embedding weight matrix (None when no
        pretrained embeddings are used).
        """
        # Specify data
        feat_name = ModelInput.WORD_FEAT
        train_text = "Hi there you"
        eval_text = ""
        test_text = "Go away"
        pretrained_embedding_file = tests_module.test_file("pretrained_embed_raw")
        # Tokens present in the pretrained-embedding fixture file.
        pretrained_tokens = {
            "</s>",
            "the",
            "to",
            "and",
            "a",
            "I",
            "you",
            "is",
            "aloha",
            "for",
        }

        # Specify test cases
        test_cases = (
            # Vocab from train / eval / test data
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=True,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=False,
                ),
                "expected_tokens": {
                    "hi",
                    "there",
                    "you",
                    "go",
                    "away",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                "expected_num_pretrained_tokens": 0,
            },
            # Vocab from train data or pretrained embeddings
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=True,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                ),
                "expected_tokens": pretrained_tokens.union(
                    {"hi", "there", VocabMeta.UNK_TOKEN, VocabMeta.PAD_TOKEN}
                ),
                # NOTE(review): +4 presumably covers special/extra token rows
                # in the weight matrix — confirm against the embedding loader.
                "expected_num_pretrained_tokens": len(pretrained_tokens) + 4,
            },
            # Vocab from limited number of pretrained embeddings
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                    vocab_size=2,
                ),
                "expected_tokens": {
                    "</s>",
                    "the",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                # special tokens excluded from vocab_size = 2
                "expected_num_pretrained_tokens": 4,
            },
        )

        for case in test_cases:
            # Setup data handler
            featurizer = create_featurizer(
                SimpleFeaturizer.Config(), FeatureConfig(word_feat=case["feat"])
            )
            data_handler = DocClassificationDataHandler.from_config(
                DocClassificationDataHandler.Config(),
                ModelInputConfig(word_feat=case["feat"]),
                TargetConfig(),
                featurizer=featurizer,
            )
            train_data = data_handler.gen_dataset(
                [{"text": train_text}], include_label_fields=False
            )
            eval_data = data_handler.gen_dataset(
                [{"text": eval_text}], include_label_fields=False
            )
            test_data = data_handler.gen_dataset(
                [{"text": test_text}], include_label_fields=False
            )
            data_handler.init_feature_metadata(train_data, eval_data, test_data)

            # Check created vocab
            meta = data_handler.metadata.features[feat_name]
            self.assertEqual(set(meta.vocab.stoi.keys()), case["expected_tokens"])
            if case["expected_num_pretrained_tokens"] == 0:
                self.assertIsNone(meta.pretrained_embeds_weight)
            else:
                self.assertEqual(
                    meta.pretrained_embeds_weight.size(0),
                    case["expected_num_pretrained_tokens"],
                )
Esempio n. 25
0
    def setUp(self):
        """Compositional (semantic-parse) fixture data and data handler.

        Each row carries a doc label (intent), word labels (slots with
        character spans into the utterance), the utterance itself, dict
        features (empty here), and the seqlogical parse string. The test
        row additionally nests a subframe inside a slot.
        """
        # Train: single utterance with one date-time slot.
        self.train_data = [{
            DFColumn.DOC_LABEL:
            "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [{
                "id": "SL:DATE_TIME",
                "span": {
                    "start": 21,
                    "end": 26
                },
                "text": "today",
            }],
            DFColumn.UTTERANCE:
            "What EVENTS can I go today",
            DFColumn.DICT_FEAT:
            "",
            DFColumn.SEQLOGICAL:
            "[IN:GET_EVENT What EVENTS can I go [SL:DATE_TIME today ] ]",
        }]

        # Eval: single utterance with two slots.
        self.eval_data = [{
            DFColumn.DOC_LABEL:
            "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ATTRIBUTE_EVENT",
                    "span": {
                        "start": 14,
                        "end": 19
                    },
                    "text": "adult",
                },
                {
                    "id": "SL:DATE_TIME",
                    "span": {
                        "start": 27,
                        "end": 39
                    },
                    "text": "this weekend",
                },
            ],
            DFColumn.UTTERANCE:
            "Are there any adult events this weekend",
            DFColumn.DICT_FEAT:
            "",
            DFColumn.SEQLOGICAL:
            "[IN:GET_EVENT Are there any [SL:ATTRIBUTE_EVENT adult ] events [SL:DATE_TIME this weekend ] ]",
        }]

        # Test: compositional example — the destination slot nests a whole
        # subframe (its own intent + slot), exercising recursive parses.
        self.test_data = [{
            DFColumn.DOC_LABEL:
            "IN:GET_INFO_ROAD_CONDITION",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ROAD_CONDITION",
                    "span": {
                        "start": 9,
                        "end": 21
                    },
                    "text": "any flooding",
                },
                {
                    "id": "SL:DESTINATION",
                    "span": {
                        "start": 36,
                        "end": 41
                    },
                    "text": "Karen",
                    "subframe": {
                        "utterance":
                        "Karen",
                        "domain":
                        "",
                        "intent":
                        "IN:GET_LOCATION_HOME",
                        "slots": [{
                            "id": "SL:CONTACT",
                            "span": {
                                "start": 0,
                                "end": 5
                            },
                            "text": "Karen",
                        }],
                        "span": {
                            "start": 0,
                            "end": 5
                        },
                    },
                },
            ],
            DFColumn.UTTERANCE:
            "Is there any flooding on the way to Karen's?",
            DFColumn.DICT_FEAT:
            "",
            DFColumn.SEQLOGICAL:
            "[IN:GET_INFO_ROAD_CONDITION Is there [SL:ROAD_CONDITION any flooding ] on the way to [SL:DESTINATION [IN:GET_LOCATION_HOME [SL:CONTACT Karen 's ? ] ] ] ]",
        }]

        # Lowercasing featurizer; vocab built from all datasets, min_freq=1.
        self.dh = CompositionalDataHandler.from_config(
            CompositionalDataHandler.Config(),
            FeatureConfig(word_feat=WordFeatConfig(vocab_from_all_data=True,
                                                   min_freq=1)),
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(lowercase_tokens=True),
                FeatureConfig()),
        )
 def test_tokenize_dont_lowercase(self):
     """Tokens keep their original casing when lowercase_tokens is False."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
     )
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     self.assertListEqual(result.tokens, ["Order", "me", "a", "coffee"])