def test_intializing_embeds_from_config(self):
    """Embedding init strategy in the config controls pretrained-weight init.

    RANDOM must yield non-zero embedding rows; ZERO must yield all-zero rows.
    (NOTE(review): method name has a typo, "intializing" — kept to avoid
    breaking any external references to the test id.)
    """
    # RANDOM init: every component of row 11 should be strictly non-zero.
    pretrained_embeds = self._pretrained_embeds_for(EmbedInitStrategy.RANDOM)
    np.testing.assert_array_less(
        [0, 0, 0, 0, 0], np.absolute(pretrained_embeds[11].numpy())
    )
    # ZERO init: the same row should be exactly zero.
    pretrained_embeds = self._pretrained_embeds_for(EmbedInitStrategy.ZERO)
    np.testing.assert_array_equal([0, 0, 0, 0, 0], pretrained_embeds[11].numpy())

def _pretrained_embeds_for(self, init_strategy):
    """Build a JointModelDataHandler configured with `init_strategy` and
    return the pretrained embedding weight tensor for the text field.

    Extracted helper: the original test duplicated this whole pipeline
    twice, differing only in the embedding init strategy.
    """
    feature_config = FeatureConfig(
        word_feat=WordFeatConfig(
            embedding_init_strategy=init_strategy,
            embed_dim=5,
            pretrained_embeddings_path=tests_module.TEST_BASE_DIR,
        )
    )
    data_handler = JointModelDataHandler.from_config(
        JointModelDataHandler.Config(),
        feature_config,
        [DocLabelConfig(), WordLabelConfig()],
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), feature_config
        ),
    )
    data_handler.init_metadata_from_path(TRAIN_FILE, EVAL_FILE, TEST_FILE)
    return data_handler.metadata.features[
        DatasetFieldName.TEXT_FIELD
    ].pretrained_embeds_weight
def _create_dummy_data_handler(self):
    """Build a minimal DocClassificationDataHandler whose vocab metadata is
    initialized from a single '<pad>'-only example used as all three splits."""
    word_feat = WordFeatConfig(
        vocab_size=4,
        vocab_from_all_data=True,
        vocab_from_train_data=True,
        vocab_from_pretrained_embeddings=False,
        pretrained_embeddings_path=None,
    )
    handler = DocClassificationDataHandler.from_config(
        DocClassificationDataHandler.Config(),
        ModelInputConfig(word_feat=word_feat),
        TargetConfig(),
        featurizer=create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=word_feat)
        ),
    )
    # The same one-row dataset serves as the train, eval and test splits.
    train_split, eval_split, test_split = (
        handler.gen_dataset([{"text": "<pad>"}], include_label_fields=False)
        for _ in range(3)
    )
    handler.init_feature_metadata(train_split, eval_split, test_split)
    return handler
def test_uppercase_tokens(self):
    """
    Test that the text is not lower-cased when lowercase_tokens is False.
    """
    handler = CompositionalDataHandler.from_config(
        CompositionalDataHandler.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
        ),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
        ),
    )
    handler.init_metadata_from_raw_data(
        self.train_data, self.eval_data, self.test_data
    )
    # Original casing ("EVENTS", "What", "Are", "I") must survive in the vocab.
    expected_vocab = {
        "<unk>",
        "What",
        "EVENTS",
        "can",
        "I",
        "go",
        "today",
        "Are",
        "there",
        "any",
        "adult",
        "events",
        "this",
        "weekend",
    }
    self.assertSetEqual(
        set(handler.features["word_feat"].vocab.stoi), expected_vocab
    )
def test_read_partially_from_csv(self):
    """Reading only selected columns from a TSV yields exactly those columns.

    Fix: the assertion message read "{} must in the data" — corrected to
    "{} must be in the data"; also use assertIn for a clearer failure report
    than assertTrue(col in ...).
    """
    file_name = tests_module.test_file("train_data_tiny.tsv")
    # Request only the doc label (column 0) and the utterance (column 2).
    columns = {DFColumn.DOC_LABEL: 0, DFColumn.UTTERANCE: 2}
    feat = WordFeatConfig(
        vocab_from_all_data=True,
        vocab_from_train_data=False,
        vocab_from_pretrained_embeddings=False,
    )
    featurizer = create_featurizer(
        SimpleFeaturizer.Config(), FeatureConfig(word_feat=feat)
    )
    data_handler = DocClassificationDataHandler.from_config(
        DocClassificationDataHandler.Config(),
        ModelInputConfig(word_feat=feat),
        TargetConfig(),
        featurizer=featurizer,
    )
    data = list(data_handler.read_from_file(file_name, columns))
    for col in columns:
        self.assertIn(col, data[0], "{} must be in the data".format(col))
    self.assertEqual("alarm/modify_alarm", data[0][DFColumn.DOC_LABEL])
    self.assertEqual(
        "change my alarm tomorrow to wake me up 30 minutes earlier",
        data[0][DFColumn.UTTERANCE],
    )
class Config(NewModel.Config, DocModel.Config):
    """Model config combining NewModel and DocModel config hierarchies."""

    # Word embedding configuration for the model.
    embedding: WordFeatConfig = WordFeatConfig()

    class ModelInput(NewModel.Config.ModelInput):
        # Tokenized text input.
        tokens: WordTensorizer.Config = WordTensorizer.Config()
        # allow_unknown=True: labels not seen at vocab-build time are accepted.
        labels: LabelTensorizer.Config = LabelTensorizer.Config(allow_unknown=True)

    inputs: ModelInput = ModelInput()
def create_model(self, shared_rep):
    """Build a deprecated query/document pairwise-ranking model.

    shared_rep: whether the query and response share the representation layer.
    """
    metadata = self.data_handler.metadata

    # Assemble the model config from locally built sub-configs.
    representation = QueryDocumentPairwiseRankingRep.Config()
    representation.shared_representations = shared_rep
    decoder = MLPDecoderQueryResponse.Config()
    decoder.hidden_dims = [64]

    model_config = QueryDocumentPairwiseRankingModel_Deprecated.Config()
    model_config.representation = representation
    model_config.decoder = decoder
    model_config.output_layer = PairwiseRankingOutputLayer.Config()

    # Feature config: positive response carries an explicit embed_dim.
    pos_response = WordFeatConfig()
    pos_response.embed_dim = 64
    feat_config = ModelInputConfig()
    feat_config.pos_response = pos_response
    feat_config.neg_response = WordFeatConfig()
    feat_config.query = WordFeatConfig()

    return QueryDocumentPairwiseRankingModel_Deprecated.from_config(
        model_config, feat_config, metadata
    )
class Config(DocModel.Config):
    """DocModel-based config with raw text passed through for reporting."""

    class ModelInput(Model.Config.ModelInput):
        # Tokenized document text.
        tokens: WordTensorizer.Config = WordTensorizer.Config()
        # allow_unknown=True: labels not seen at vocab-build time are accepted.
        labels: LabelTensorizer.Config = LabelTensorizer.Config(
            allow_unknown=True
        )
        # Raw utterance from the "text" column, kept for the metric reporter.
        raw_text: MetaInput.Config = MetaInput.Config(column="text")

    inputs: ModelInput = ModelInput()
    embedding: WordFeatConfig = WordFeatConfig()
def DISABLED_test_freeze_word_embedding(self):
    """Frozen word embeddings must not require grad; other modules must."""
    model = create_model(
        DocModel.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(freeze=True, mlp_layer_dims=[4]),
            dict_feat=DictFeatConfig(),
        ),
        metadata=mock_metadata(),
    )
    word_embed = model.embedding[0]
    # (module, expected requires_grad): frozen word embedding vs trainable
    # MLP-on-top and dict-feature embedding.
    expectations = (
        (word_embed.word_embedding, False),
        (word_embed.mlp, True),
        (model.embedding[1], True),
    )
    for module, expected_grad in expectations:
        for param in module.parameters():
            self.assertEqual(expected_grad, param.requires_grad)
def test_min_freq(self):
    """
    Test that UNKification is triggered when min_freq is 2.

    Fix: the expected set literal previously spelled "<unk>" three times;
    duplicate elements in a set literal collapse silently, which misled the
    reader about the expected vocab size. Each expected token now appears once.
    """
    custom_dh = CompositionalDataHandler.from_config(
        CompositionalDataHandler.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=2)
        ),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
        ),
    )
    custom_dh.init_metadata_from_raw_data(
        self.train_data, self.eval_data, self.test_data
    )
    # <unk>-NUM = <unk> for numeric tokens; only "events" survives min_freq=2.
    self.assertSetEqual(
        set(custom_dh.features["word_feat"].vocab.stoi),
        {"<unk>", "<unk>-NUM", "events"},
    )
def setUp(self):
    """Build one-example train/eval/test frames (doc label, word-level slot
    labels, utterance, seqlogical form) and a default CompositionalDataHandler.
    """
    # Train: single example with one slot (SL:DATE_TIME over "today").
    self.train_data = [{
        DFColumn.DOC_LABEL: "IN:GET_EVENT",
        DFColumn.WORD_LABEL: [{
            "id": "SL:DATE_TIME",
            "span": {"start": 21, "end": 26},
            "text": "today",
        }],
        DFColumn.UTTERANCE: "What EVENTS can I go today",
        DFColumn.DICT_FEAT: "",
        DFColumn.SEQLOGICAL: "[IN:GET_EVENT What EVENTS can I go [SL:DATE_TIME today ] ]",
    }]
    # Eval: single example with two non-overlapping slots.
    self.eval_data = [{
        DFColumn.DOC_LABEL: "IN:GET_EVENT",
        DFColumn.WORD_LABEL: [
            {
                "id": "SL:ATTRIBUTE_EVENT",
                "span": {"start": 14, "end": 19},
                "text": "adult",
            },
            {
                "id": "SL:DATE_TIME",
                "span": {"start": 27, "end": 39},
                "text": "this weekend",
            },
        ],
        DFColumn.UTTERANCE: "Are there any adult events this weekend",
        DFColumn.DICT_FEAT: "",
        DFColumn.SEQLOGICAL: "[IN:GET_EVENT Are there any [SL:ATTRIBUTE_EVENT adult ] events [SL:DATE_TIME this weekend ] ]",
    }]
    # Test: compositional example — SL:DESTINATION carries a nested subframe
    # with its own intent (IN:GET_LOCATION_HOME) and slot (SL:CONTACT).
    self.test_data = [{
        DFColumn.DOC_LABEL: "IN:GET_INFO_ROAD_CONDITION",
        DFColumn.WORD_LABEL: [
            {
                "id": "SL:ROAD_CONDITION",
                "span": {"start": 9, "end": 21},
                "text": "any flooding",
            },
            {
                "id": "SL:DESTINATION",
                "span": {"start": 36, "end": 41},
                "text": "Karen",
                # Nested frame; its spans are relative to the subframe text.
                "subframe": {
                    "utterance": "Karen",
                    "domain": "",
                    "intent": "IN:GET_LOCATION_HOME",
                    "slots": [{
                        "id": "SL:CONTACT",
                        "span": {"start": 0, "end": 5},
                        "text": "Karen",
                    }],
                    "span": {"start": 0, "end": 5},
                },
            },
        ],
        DFColumn.UTTERANCE: "Is there any flooding on the way to Karen's?",
        DFColumn.DICT_FEAT: "",
        DFColumn.SEQLOGICAL: "[IN:GET_INFO_ROAD_CONDITION Is there [SL:ROAD_CONDITION any flooding ] on the way to [SL:DESTINATION [IN:GET_LOCATION_HOME [SL:CONTACT Karen 's ? ] ] ] ]",
    }]
    # Default handler: lower-cased tokens, vocab drawn from all splits.
    self.dh = CompositionalDataHandler.from_config(
        CompositionalDataHandler.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
        ),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
        ),
    )
def test_init_feature_metadata(self):
    """Table-driven check of vocab construction from the three possible
    sources (all data / train data / pretrained embeddings), including the
    vocab_size cap and the pretrained-embedding weight matrix size."""
    # Specify data
    feat_name = ModelInput.WORD_FEAT
    train_text = "Hi there you"
    eval_text = ""
    test_text = "Go away"
    pretrained_embedding_file = tests_module.test_file("pretrained_embed_raw")
    # Tokens present in the raw pretrained-embedding fixture file.
    pretrained_tokens = {
        "</s>",
        "the",
        "to",
        "and",
        "a",
        "I",
        "you",
        "is",
        "aloha",
        "for",
    }
    # Specify test cases
    test_cases = (
        # Vocab from train / eval / test data
        {
            "feat": WordFeatConfig(
                vocab_from_all_data=True,
                vocab_from_train_data=False,
                vocab_from_pretrained_embeddings=False,
            ),
            # All tokens from every split (lower-cased) plus specials.
            "expected_tokens": {
                "hi",
                "there",
                "you",
                "go",
                "away",
                VocabMeta.UNK_TOKEN,
                VocabMeta.PAD_TOKEN,
            },
            # No embedding file read -> no pretrained weight matrix.
            "expected_num_pretrained_tokens": 0,
        },
        # Vocab from train data or pretrained embeddings
        {
            "feat": WordFeatConfig(
                vocab_from_all_data=False,
                vocab_from_train_data=True,
                vocab_from_pretrained_embeddings=True,
                pretrained_embeddings_path=pretrained_embedding_file,
                embed_dim=5,
            ),
            # Union of train tokens and the embedding file's tokens.
            "expected_tokens": pretrained_tokens.union(
                {"hi", "there", VocabMeta.UNK_TOKEN, VocabMeta.PAD_TOKEN}
            ),
            # Weight rows for pretrained tokens plus the 4 extra vocab entries.
            "expected_num_pretrained_tokens": len(pretrained_tokens) + 4,
        },
        # Vocab from limited number of pretrained embeddings
        {
            "feat": WordFeatConfig(
                vocab_from_all_data=False,
                vocab_from_train_data=False,
                vocab_from_pretrained_embeddings=True,
                pretrained_embeddings_path=pretrained_embedding_file,
                embed_dim=5,
                vocab_size=2,
            ),
            "expected_tokens": {
                "</s>",
                "the",
                VocabMeta.UNK_TOKEN,
                VocabMeta.PAD_TOKEN,
            },  # special tokens excluded from vocab_size = 2
            "expected_num_pretrained_tokens": 4,
        },
    )
    for case in test_cases:
        # Setup data handler
        featurizer = create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=case["feat"])
        )
        data_handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=case["feat"]),
            TargetConfig(),
            featurizer=featurizer,
        )
        train_data = data_handler.gen_dataset(
            [{"text": train_text}], include_label_fields=False
        )
        eval_data = data_handler.gen_dataset(
            [{"text": eval_text}], include_label_fields=False
        )
        test_data = data_handler.gen_dataset(
            [{"text": test_text}], include_label_fields=False
        )
        data_handler.init_feature_metadata(train_data, eval_data, test_data)
        # Check created vocab
        meta = data_handler.metadata.features[feat_name]
        self.assertEqual(set(meta.vocab.stoi.keys()), case["expected_tokens"])
        # Check pretrained weight matrix: absent when no embeddings were read,
        # otherwise sized to the expected number of rows.
        if case["expected_num_pretrained_tokens"] == 0:
            self.assertIsNone(meta.pretrained_embeds_weight)
        else:
            self.assertEqual(
                meta.pretrained_embeds_weight.size(0),
                case["expected_num_pretrained_tokens"],
            )
class ModelInputConfig(ModuleConfig):
    """Model input config exposing a single sequence word feature."""

    # min_freq=1: keep every token observed in the data in the vocab.
    seq_word_feat: WordFeatConfig = WordFeatConfig(min_freq=1, )
class Config(Model.Config, doc_model.DocModel.Config):
    """Model config with tensorizer inputs declared as a name-keyed dict."""

    # Tensorizer configs keyed by input name ("tokens" text, "labels" targets).
    inputs: Dict[str, Tensorizer.Config] = {
        "tokens": WordTensorizer.Config(),
        "labels": LabelTensorizer.Config(),
    }
    # Word embedding configuration for the model.
    embedding: WordFeatConfig = WordFeatConfig()