Example #1
0
    def _create_dummy_data_handler(self):
        """Build a DocClassificationDataHandler with initialized feature metadata.

        The handler is configured with a single word feature and a
        SimpleFeaturizer. Its metadata is initialized from three placeholder
        datasets (train, eval, test), each holding one row whose text is
        "<pad>" and generated without label fields.

        Returns:
            The data handler, after ``init_feature_metadata`` has been called.
        """
        word_feat_config = WordFeatConfig(
            vocab_size=4,
            vocab_from_all_data=True,
            vocab_from_train_data=True,
            vocab_from_pretrained_embeddings=False,
            pretrained_embeddings_path=None,
        )
        simple_featurizer = create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=word_feat_config)
        )
        handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=word_feat_config),
            TargetConfig(),
            featurizer=simple_featurizer,
        )

        # One placeholder dataset per split, generated in train/eval/test order.
        train_set, eval_set, test_set = [
            handler.gen_dataset([{"text": "<pad>"}], include_label_fields=False)
            for _ in range(3)
        ]
        handler.init_feature_metadata(train_set, eval_set, test_set)

        return handler
Example #2
0
    def test_read_partially_from_csv(self):
        """Read a subset of columns from a TSV file and verify the first row.

        Only the DOC_LABEL and UTTERANCE columns are requested; the test checks
        that both keys are present in the first row returned by
        ``read_from_file`` and that their values match the expected label and
        utterance.
        """
        file_name = tests_module.test_file("train_data_tiny.tsv")
        # NOTE(review): the values are presumably column indices in the file
        # (index 1 is deliberately not requested) -- confirm against
        # read_from_file.
        columns = {DFColumn.DOC_LABEL: 0, DFColumn.UTTERANCE: 2}

        feat = WordFeatConfig(
            vocab_from_all_data=True,
            vocab_from_train_data=False,
            vocab_from_pretrained_embeddings=False,
        )
        featurizer = create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=feat)
        )
        data_handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=feat),
            TargetConfig(),
            featurizer=featurizer,
        )
        data = list(data_handler.read_from_file(file_name, columns))

        # Fail with an assertion (not an IndexError) if nothing was read.
        self.assertTrue(data, "no rows were read from {}".format(file_name))
        first_row = data[0]

        for col in columns:
            # assertIn reports both the missing key and the row on failure.
            self.assertIn(col, first_row, "{} must in the data".format(col))
        self.assertEqual("alarm/modify_alarm", first_row[DFColumn.DOC_LABEL])
        self.assertEqual(
            "change my alarm tomorrow to wake me up 30 minutes earlier",
            first_row[DFColumn.UTTERANCE],
        )
Example #3
0
    def test_init_feature_metadata(self):
        """Check the vocab and pretrained weights built by init_feature_metadata.

        For several WordFeatConfig settings, a data handler is created, its
        feature metadata is initialized from small train/eval/test datasets,
        and the resulting word-feature vocabulary and the number of rows in the
        pretrained embedding weight are compared with the expected values.

        Each configuration runs in its own ``subTest`` so that a failure names
        the offending case and the remaining cases still execute.
        """
        # Specify data
        feat_name = ModelInput.WORD_FEAT
        train_text = "Hi there you"
        eval_text = ""
        test_text = "Go away"
        pretrained_embedding_file = tests_module.test_file("pretrained_embed_raw")
        # Tokens the test expects to come from the pretrained embedding file.
        pretrained_tokens = {
            "</s>",
            "the",
            "to",
            "and",
            "a",
            "I",
            "you",
            "is",
            "aloha",
            "for",
        }

        # Specify test cases
        test_cases = (
            # Vocab from train / eval / test data
            {
                "name": "vocab from all data",
                "feat": WordFeatConfig(
                    vocab_from_all_data=True,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=False,
                ),
                "expected_tokens": {
                    "hi",
                    "there",
                    "you",
                    "go",
                    "away",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                "expected_num_pretrained_tokens": 0,
            },
            # Vocab from train data or pretrained embeddings
            {
                "name": "vocab from train data and pretrained embeddings",
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=True,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                ),
                "expected_tokens": pretrained_tokens.union(
                    {"hi", "there", VocabMeta.UNK_TOKEN, VocabMeta.PAD_TOKEN}
                ),
                "expected_num_pretrained_tokens": len(pretrained_tokens) + 4,
            },
            # Vocab from limited number of pretrained embeddings
            {
                "name": "vocab from size-limited pretrained embeddings",
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                    vocab_size=2,
                ),
                "expected_tokens": {
                    "</s>",
                    "the",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                # special tokens excluded from vocab_size = 2
                "expected_num_pretrained_tokens": 4,
            },
        )

        for case in test_cases:
            # Isolate each configuration so one failure does not hide the others.
            with self.subTest(case=case["name"]):
                # Setup data handler
                featurizer = create_featurizer(
                    SimpleFeaturizer.Config(), FeatureConfig(word_feat=case["feat"])
                )
                data_handler = DocClassificationDataHandler.from_config(
                    DocClassificationDataHandler.Config(),
                    ModelInputConfig(word_feat=case["feat"]),
                    TargetConfig(),
                    featurizer=featurizer,
                )
                train_data = data_handler.gen_dataset(
                    [{"text": train_text}], include_label_fields=False
                )
                eval_data = data_handler.gen_dataset(
                    [{"text": eval_text}], include_label_fields=False
                )
                test_data = data_handler.gen_dataset(
                    [{"text": test_text}], include_label_fields=False
                )
                data_handler.init_feature_metadata(train_data, eval_data, test_data)

                # Check created vocab
                meta = data_handler.metadata.features[feat_name]
                self.assertEqual(
                    set(meta.vocab.stoi.keys()), case["expected_tokens"]
                )
                # No pretrained embeddings requested -> no weight is expected.
                if case["expected_num_pretrained_tokens"] == 0:
                    self.assertIsNone(meta.pretrained_embeds_weight)
                else:
                    self.assertEqual(
                        meta.pretrained_embeds_weight.size(0),
                        case["expected_num_pretrained_tokens"],
                    )