def test_should_parse_with_proper_tokenization(self):
    """Entity ranges must be computed on the raw input text, skipping
    leading whitespace and the '?' separator between the two values."""
    # Given
    parser = CustomEntityParser.build(
        DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())
    # Two leading spaces: the expected ranges below (start=2 for
    # "dummy_1", start=10 for "dummy_2") require exactly two characters
    # before the first entity value.
    text = "  dummy_1?dummy_2"

    # When
    result = parser.parse(text)
    # Sort by start offset so the comparison does not depend on the
    # parser's output order.
    result = sorted(result, key=lambda e: e["range"]["start"])

    # Then
    expected_entities = [{
        "value": "dummy_1",
        "resolved_value": "dummy_entity_1",
        "range": {
            "start": 2,
            "end": 9
        },
        "entity_kind": "dummy_entity_1"
    }, {
        "value": "dummy_2",
        "resolved_value": "dummy_entity_2",
        "range": {
            "start": 10,
            "end": 17
        },
        "entity_kind": "dummy_entity_2"
    }]
    self.assertListEqual(expected_entities, result)
def test_should_be_serializable(self):
    """The parser must survive a persist()/from_path() round trip and
    still parse with a restricted scope."""
    # Given
    parser = CustomEntityParser.build(
        DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())
    self.tmp_file_path.mkdir()
    parser_path = self.tmp_file_path / "custom_entity_parser"
    parser.persist(parser_path)
    loaded_parser = CustomEntityParser.from_path(parser_path)

    # When
    scope = ["dummy_entity_1"]
    text = "dummy_entity_1 dummy_1"
    result = loaded_parser.parse(text, scope=scope)

    # Then
    expected_entities = [{
        "value": "dummy_entity_1",
        "resolved_value": "dummy_entity_1",
        "range": {
            "start": 0,
            "end": 14
        },
        "entity_kind": "dummy_entity_1"
    }, {
        "value": "dummy_1",
        "resolved_value": "dummy_entity_1",
        "range": {
            "start": 15,
            "end": 22
        },
        "entity_kind": "dummy_entity_1"
    }]
    self.assertListEqual(expected_entities, result)
def test_should_parse_with_stems(self):
    """With WITH_STEMS usage, only the stemmed form of a value is
    expected to match the (already stemmed) input text."""
    # Given
    resources = {
        STEMS: {
            "dummy_entity_1": "dummy_entity_",
            "dummy_1": "dummy_"
        }
    }
    parser = CustomEntityParser.build(
        DATASET, CustomEntityParserUsage.WITH_STEMS, resources)
    # "dummy_1" is NOT stemmed in the input, so it must not be matched.
    text = "dummy_entity_ dummy_1"
    scope = ["dummy_entity_1"]

    # When
    result = parser.parse(text, scope=scope)

    # Then
    expected_entities = [{
        "value": "dummy_entity_",
        "resolved_value": "dummy_entity_1",
        "range": {
            "start": 0,
            "end": 13
        },
        "entity_kind": "dummy_entity_1"
    }]
    self.assertListEqual(expected_entities, result)
def test_should_parse_with_and_without_stems(self, mocked_stem):
    """With WITH_AND_WITHOUT_STEMS usage, both the stemmed and the raw
    form of an entity value must be matched."""
    # Given
    mocked_stem.side_effect = _stem
    parser = CustomEntityParser.build(
        DATASET, CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS)
    scope = ["dummy_entity_1"]
    text = "dummy_entity_ dummy_1"

    # When
    result = parser.parse(text, scope=scope)

    # Then
    expected_entities = [
        {
            "value": "dummy_entity_",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 0,
                "end": 13
            },
            "entity_kind": "dummy_entity_1"
        },
        {
            "value": "dummy_1",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 14,
                "end": 21
            },
            "entity_kind": "dummy_entity_1"
        }
    ]
    self.assertListEqual(expected_entities, result)
def test_should_not_build_custom_parser_when_provided(self):
    """When a fitted custom entity parser is injected into the engine,
    fitting the engine must not rebuild one."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    resources = load_resources("en")
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS, resources)

    # When
    with patch("snips_nlu.entity_parser.custom_entity_parser"
               ".CustomEntityParser.build") as mocked_build_parser:
        engine = SnipsNLUEngine(
            custom_entity_parser=custom_entity_parser)
        engine.fit(dataset)

    # Then
    mocked_build_parser.assert_not_called()
def test_should_respect_scope(self):
    """Entities whose kind is outside the requested scope must not be
    returned by parse()."""
    # Given
    entity_parser = CustomEntityParser.build(
        DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())

    # When: the text matches dummy_entity_2, but only dummy_entity_1
    # is in scope.
    matches = entity_parser.parse("dummy_entity_2",
                                  scope=["dummy_entity_1"])

    # Then
    self.assertListEqual([], matches)
def test_entity_match_factory(self):
    """entity_match must produce one feature per custom entity, tagging
    matched token spans with BILOU prefixes and non-matches with None."""
    # Given
    config = {
        "factory_name": "entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
            "use_stemming": True
        },
        "offsets": [0]
    }
    tokens = tokenize("2 dummy a had dummy_c", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    # Deep copy so validation does not mutate the shared sample dataset.
    dataset = deepcopy(SAMPLE_DATASET)
    dataset = validate_and_format_dataset(dataset)
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS)
    factory.fit(dataset, "dummy_intent_1")

    # When
    features = factory.build_features(
        custom_entity_parser=custom_entity_parser)
    features = sorted(features, key=lambda f: f.base_name)
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)
    res3 = features[0].compute(3, cache)
    res4 = features[0].compute(4, cache)
    res5 = features[1].compute(0, cache)
    res6 = features[1].compute(1, cache)
    res7 = features[1].compute(2, cache)
    res8 = features[1].compute(3, cache)
    res9 = features[1].compute(4, cache)

    # Then
    self.assertIsInstance(factory, CustomEntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
    self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
    # Tokens 0-2 form a dummy_entity_1 match: Begin / Inside / Last.
    self.assertEqual(res0, BEGINNING_PREFIX)
    self.assertEqual(res1, INSIDE_PREFIX)
    self.assertEqual(res2, LAST_PREFIX)
    self.assertEqual(res3, None)
    self.assertEqual(res4, None)
    self.assertEqual(res5, None)
    self.assertEqual(res6, None)
    self.assertEqual(res7, None)
    self.assertEqual(res8, None)
    # Token 4 is a single-token dummy_entity_2 match: Unit.
    self.assertEqual(res9, UNIT_PREFIX)
def test_should_use_cache(self, mocked_parse):
    """Parsing the same text twice must hit the underlying parser only
    once; the second call is served from the cache."""
    # Given
    mocked_parse.return_value = []
    entity_parser = CustomEntityParser.build(
        DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())
    empty_text = ""

    # When: two identical parse requests
    for _ in range(2):
        entity_parser.parse(empty_text)

    # Then
    self.assertEqual(1, mocked_parse.call_count)
def test_should_parse_without_stems(self):
    """With WITHOUT_STEMS usage, the raw entity values must be matched
    directly on the input text."""
    # Given
    parser = CustomEntityParser.build(
        DATASET, CustomEntityParserUsage.WITHOUT_STEMS)
    text = "dummy_entity_1 dummy_1 dummy_entity_2 dummy_2"

    # When
    result = parser.parse(text)
    # Sort by start offset to make the comparison order-independent.
    result = sorted(result, key=lambda e: e["range"]["start"])

    # Then
    expected_entities = [
        {
            "value": "dummy_entity_1",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 0,
                "end": 14
            },
            "entity_kind": "dummy_entity_1"
        },
        {
            "value": "dummy_1",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 15,
                "end": 22
            },
            "entity_kind": "dummy_entity_1"
        },
        {
            "value": "dummy_entity_2",
            "resolved_value": "dummy_entity_2",
            "range": {
                "start": 23,
                "end": 37
            },
            "entity_kind": "dummy_entity_2"
        },
        {
            "value": "dummy_2",
            "resolved_value": "dummy_entity_2",
            "range": {
                "start": 38,
                "end": 45
            },
            "entity_kind": "dummy_entity_2"
        }
    ]
    self.assertListEqual(expected_entities, result)
def get_shared_data(cls, dataset, parser_usage=None):
    """Build the resources and entity parsers shared across fitting units.

    When *parser_usage* is not given, WITH_AND_WITHOUT_STEMS is used so
    every consumer's needs are covered.
    """
    # Imported lazily, matching the surrounding module's convention.
    from snips_nlu.entity_parser import (BuiltinEntityParser,
                                         CustomEntityParser,
                                         CustomEntityParserUsage)

    if parser_usage is None:
        parser_usage = CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS

    shared_resources = cls.get_resources(dataset["language"])
    return {
        "resources": shared_resources,
        "builtin_entity_parser": BuiltinEntityParser.build(dataset),
        "custom_entity_parser": CustomEntityParser.build(
            dataset, parser_usage, shared_resources),
    }
def test_should_be_serializable_into_bytearray(self):
    """An engine restored from its byte-array form must still parse."""
    # Given
    dataset = BEVERAGE_DATASET
    engine = SnipsNLUEngine().fit(dataset)

    # When
    engine_bytes = engine.to_byte_array()
    # The parsers are not part of the byte array and must be provided
    # explicitly at load time.
    builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)
    custom_entity_parser = CustomEntityParser.build(
        dataset, parser_usage=CustomEntityParserUsage.WITHOUT_STEMS)
    loaded_engine = SnipsNLUEngine.from_byte_array(
        engine_bytes,
        builtin_entity_parser=builtin_entity_parser,
        custom_entity_parser=custom_entity_parser)
    result = loaded_engine.parse("Make me two cups of coffee")

    # Then
    self.assertEqual(result[RES_INTENT][RES_INTENT_NAME], "MakeCoffee")
def fit_custom_entity_parser_if_needed(self, dataset):
    """Build the custom entity parser unless a fitted one was injected.

    We only fit a custom entity parser when the unit has already been
    fitted or if the parser is none. In the other cases the parser is
    provided fitted by another unit.
    """
    required_resources = self.config.get_required_resources()
    parser_usage = (required_resources or {}).get(
        CUSTOM_ENTITY_PARSER_USAGE)
    if not parser_usage:
        # We only need a custom entity parser for the final slot
        # resolution step, which must be done without stemming.
        parser_usage = CustomEntityParserUsage.WITHOUT_STEMS

    needs_build = self.custom_entity_parser is None or self.fitted
    if needs_build:
        self.custom_entity_parser = CustomEntityParser.build(
            dataset, parser_usage)
    return self
def test_should_get_intent_after_deserialization(self):
    """A classifier reloaded from disk must still classify intents."""
    # Given
    dataset = validate_and_format_dataset(BEVERAGE_DATASET)
    classifier = LogRegIntentClassifier().fit(dataset)
    classifier.persist(self.tmp_file_path)

    # When
    # The parsers are not persisted with the classifier and must be
    # supplied at load time.
    builtin_entity_parser = BuiltinEntityParser.build(language="en")
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITHOUT_STEMS)
    loaded_classifier = LogRegIntentClassifier.from_path(
        self.tmp_file_path,
        builtin_entity_parser=builtin_entity_parser,
        custom_entity_parser=custom_entity_parser)
    result = loaded_classifier.get_intent("Make me two cups of tea")

    # Then
    expected_intent = "MakeTea"
    self.assertEqual(expected_intent, result[RES_INTENT_NAME])
def test_should_be_serializable_into_bytearray(self):
    """A classifier restored from its byte-array form must still
    classify intents."""
    # Given
    dataset = validate_and_format_dataset(BEVERAGE_DATASET)
    intent_classifier = LogRegIntentClassifier().fit(dataset)

    # When
    intent_classifier_bytes = intent_classifier.to_byte_array()
    # The parsers are not part of the byte array and must be supplied
    # at load time.
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITHOUT_STEMS)
    builtin_entity_parser = BuiltinEntityParser.build(language="en")
    loaded_classifier = LogRegIntentClassifier.from_byte_array(
        intent_classifier_bytes,
        builtin_entity_parser=builtin_entity_parser,
        custom_entity_parser=custom_entity_parser)
    result = loaded_classifier.get_intent("make me two cups of tea")

    # Then
    expected_intent = "MakeTea"
    self.assertEqual(expected_intent, result[RES_INTENT_NAME])
def test_should_be_serializable(self):
    """The parser must survive a persist()/from_path() round trip and
    the persisted artifact must include the LICENSE file."""
    # Given
    parser = CustomEntityParser.build(
        DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())
    self.tmp_file_path.mkdir()
    parser_path = self.tmp_file_path / "custom_entity_parser"
    parser.persist(parser_path)
    loaded_parser = CustomEntityParser.from_path(parser_path)

    # When
    scope = ["dummy_entity_1"]
    text = "dummy_entity_1 dummy_1"
    result = loaded_parser.parse(text, scope=scope)

    # Then
    expected_entities = [{
        "value": "dummy_entity_1",
        "resolved_value": "dummy_entity_1",
        "range": {
            "start": 0,
            "end": 14
        },
        "entity_kind": "dummy_entity_1"
    }, {
        "value": "dummy_1",
        "resolved_value": "dummy_entity_1",
        "range": {
            "start": 15,
            "end": 22
        },
        "entity_kind": "dummy_entity_1"
    }]
    self.assertListEqual(expected_entities, result)

    # The license must be copied verbatim into the persisted parser.
    license_path = parser_path / "parser" / "parser_1" / "LICENSE"
    self.assertTrue(license_path.exists())
    with license_path.open(encoding="utf8") as f:
        license_content = f.read()
    self.assertEqual("some license content here", license_content)
def test_preprocess(self):
    """_preprocess must stem/normalize utterances, parse builtin and
    custom entities, and collect word clusters per utterance."""
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
- dummy utterance

---
type: entity
name: entity_1
values:
- [entity 1, alternative entity 1]
- [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
values:
- entity 1
- [Éntity 2, Éntity_2, Alternative entity 2]""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
    utterances = [
        text_to_utterance("hÉllo wOrld Éntity_2"),
        text_to_utterance("beauTiful World entity 1"),
        text_to_utterance("Bird bïrdy"),
        text_to_utterance("Bird birdy"),
    ]

    config = TfidfVectorizerConfig(
        use_stemming=True, word_clusters_name="my_word_clusters")
    vectorizer = TfidfVectorizer(
        config=config,
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    # Private state is set directly instead of fitting the vectorizer.
    vectorizer._language = language
    vectorizer.builtin_entity_scope = {"snips/number"}

    # When
    processed_data = vectorizer._preprocess(utterances)
    # Regroup per-utterance: (utterance, builtin ents, custom ents,
    # word clusters).
    processed_data = list(zip(*processed_data))

    # Then
    u_0 = {"data": [{"text": "hello world entity_2"}]}
    u_1 = {"data": [{"text": "beauty world ent 1"}]}
    u_2 = {"data": [{"text": "bird bird"}]}
    u_3 = {"data": [{"text": "bird bird"}]}

    ent_0 = {
        "entity_kind": "entity_2",
        "value": "entity_2",
        "resolved_value": "Éntity 2",
        "range": {
            "start": 12,
            "end": 20
        }
    }
    num_0 = {
        "entity_kind": "snips/number",
        "value": "2",
        "resolved_value": {
            "value": 2.0,
            "kind": "Number"
        },
        "range": {
            "start": 19,
            "end": 20
        }
    }
    # Custom-entity ranges are on the stemmed text; builtin ranges on
    # the raw text (num_1 "1" at 23 in "beauTiful World entity 1").
    ent_11 = {
        "entity_kind": "entity_1",
        "value": "ent 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 13,
            "end": 18
        }
    }
    ent_12 = {
        "entity_kind": "entity_2",
        "value": "ent 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 13,
            "end": 18
        }
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "1",
        "range": {
            "start": 23,
            "end": 24
        },
        "resolved_value": {
            "value": 1.0,
            "kind": "Number"
        },
    }

    expected_data = [(u_0, [num_0], [ent_0], []),
                     (u_1, [num_1], [ent_11, ent_12],
                      ["cluster_1", "cluster_3"]),
                     (u_2, [], [], []),
                     (u_3, [], [], ["cluster_2"])]

    self.assertSequenceEqual(expected_data, processed_data)
def test_preprocess(self):
    """CooccurrenceVectorizer._preprocess must normalize utterances and
    parse builtin and custom entities on the raw text."""
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    # NOTE: the key was previously misspelled "use_synononyms", which
    # would be silently ignored; "use_synonyms" is the actual dataset
    # key (entity_2 needs synonyms enabled so that "Éntity_2" resolves
    # to "Éntity 2" below).
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
- dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
- [entity 1, alternative entity 1]
- [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
- entity 1
- [Éntity 2, Éntity_2, Alternative entity 2]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)

    u_0 = text_to_utterance("hÉllo wOrld Éntity_2")
    u_1 = text_to_utterance("beauTiful World entity 1")
    u_2 = text_to_utterance("Bird bïrdy")
    u_3 = text_to_utterance("Bird birdy")
    utterances = [u_0, u_1, u_2, u_3]

    vectorizer = CooccurrenceVectorizer(
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    # Private state is set directly instead of fitting the vectorizer.
    vectorizer._language = language

    # When
    processed_data = vectorizer._preprocess(utterances)
    # Regroup per-utterance: (utterance, builtin ents, custom ents).
    processed_data = list(zip(*processed_data))

    # Then
    ent_0 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "resolved_value": "Éntity 2",
        "range": {
            "start": 12,
            "end": 20
        }
    }
    num_0 = {
        "entity_kind": "snips/number",
        "value": "2",
        "resolved_value": {
            "value": 2.0,
            "kind": "Number"
        },
        "range": {
            "start": 19,
            "end": 20
        }
    }
    ent_11 = {
        "entity_kind": "entity_1",
        "value": "entity 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 16,
            "end": 24
        }
    }
    ent_12 = {
        "entity_kind": "entity_2",
        "value": "entity 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 16,
            "end": 24
        }
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "1",
        "range": {
            "start": 23,
            "end": 24
        },
        "resolved_value": {
            "value": 1.0,
            "kind": "Number"
        }
    }

    expected_data = [(u_0, [num_0], [ent_0]),
                     (u_1, [num_1], [ent_11, ent_12]),
                     (u_2, [], []),
                     (u_3, [], [])]

    self.assertSequenceEqual(expected_data, processed_data)
def test_preprocess_for_training(self):
    """In training mode, _preprocess must keep the utterance chunk
    structure (entity-labeled chunks preserved) while stemming text,
    parsing entities and collecting word clusters."""
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    # NOTE: the key was previously misspelled "use_synononyms", which
    # would be silently ignored; "use_synonyms" is the actual dataset
    # key (entity_2 needs synonyms enabled so that "Éntity_2" is
    # recognized below).
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
- dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
- [entity 1, alternative entity 1]
- [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
- entity 1
- [Éntity 2, Éntity_2, Alternative entity 2]""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)

    utterances = [{
        "data": [{
            "text": "hÉllo wOrld "
        }, {
            "text": " yo "
        }, {
            "text": " yo "
        }, {
            "text": "yo "
        }, {
            "text": "Éntity_2",
            "entity": "entity_2"
        }, {
            "text": " "
        }, {
            "text": "Éntity_2",
            "entity": "entity_2"
        }]
    }, {
        "data": [{
            "text": "beauTiful World "
        }, {
            "text": "entity 1",
            "entity": "entity_1"
        }, {
            "text": " "
        }, {
            "text": "2",
            "entity": "snips/number"
        }]
    }, {
        "data": [{
            "text": "Bird bïrdy"
        }]
    }, {
        "data": [{
            "text": "Bird birdy"
        }]
    }]

    config = TfidfVectorizerConfig(
        use_stemming=True, word_clusters_name="my_word_clusters")
    vectorizer = TfidfVectorizer(
        config=config,
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    # Private state is set directly instead of fitting the vectorizer.
    vectorizer._language = language

    # When
    processed_data = vectorizer._preprocess(utterances, training=True)
    # Regroup per-utterance: (utterance, builtin ents, custom ents,
    # word clusters).
    processed_data = list(zip(*processed_data))

    # Then
    u_0 = {
        "data": [{
            "text": "hello world"
        }, {
            "text": "yo"
        }, {
            "text": "yo"
        }, {
            "text": "yo"
        }, {
            "text": "entity_2",
            "entity": "entity_2"
        }, {
            "text": ""
        }, {
            "text": "entity_2",
            "entity": "entity_2"
        }]
    }
    u_1 = {
        "data": [{
            "text": "beauty world"
        }, {
            "text": "ent 1",
            "entity": "entity_1"
        }, {
            "text": ""
        }, {
            "text": "2",
            "entity": "snips/number"
        }]
    }
    u_2 = {"data": [{"text": "bird bird"}]}

    # In training mode entities come from the chunk labels; ranges are
    # computed on the raw (pre-stemming) concatenated text.
    ent_00 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "range": {
            "start": 23,
            "end": 31
        }
    }
    ent_01 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "range": {
            "start": 32,
            "end": 40
        }
    }
    ent_1 = {
        "entity_kind": "entity_1",
        "value": "entity 1",
        "range": {
            "start": 16,
            "end": 24
        }
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "2",
        "range": {
            "start": 25,
            "end": 26
        }
    }

    expected_data = [(u_0, [], [ent_00, ent_01], []),
                     (u_1, [num_1], [ent_1],
                      ["cluster_1", "cluster_3"]),
                     (u_2, [], [], []),
                     (u_2, [], [], ["cluster_2"])]

    self.assertSequenceEqual(expected_data, processed_data)
def test_entity_match_factory(self):
    """entity_match built via CRFFeatureFactory.from_config must produce
    one feature per entity, tagging matches with BILOU prefixes."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)
- this is [entity2](second_entity)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    config = {
        "factory_name": "entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
            "use_stemming": True
        },
        "offsets": [0]
    }
    tokens = tokenize("my first entity and second_entity", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    # Empty stems: stemming is a no-op here.
    resources = {STEMS: dict()}
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    factory = CRFFeatureFactory.from_config(
        config,
        custom_entity_parser=custom_entity_parser,
        resources=resources)
    factory.fit(dataset, "my_intent")

    # When
    features = factory.build_features()
    features = sorted(features, key=lambda f: f.base_name)
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)
    res3 = features[0].compute(3, cache)
    res4 = features[0].compute(4, cache)
    res5 = features[1].compute(0, cache)
    res6 = features[1].compute(1, cache)
    res7 = features[1].compute(2, cache)
    res8 = features[1].compute(3, cache)
    res9 = features[1].compute(4, cache)

    # Then
    self.assertIsInstance(factory, CustomEntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name, "entity_match_entity1")
    self.assertEqual(features[1].base_name, "entity_match_entity2")
    # Tokens 0-2 ("my first entity") match entity1: Begin/Inside/Last.
    self.assertEqual(res0, BEGINNING_PREFIX)
    self.assertEqual(res1, INSIDE_PREFIX)
    self.assertEqual(res2, LAST_PREFIX)
    self.assertEqual(res3, None)
    self.assertEqual(res4, None)
    self.assertEqual(res5, None)
    self.assertEqual(res6, None)
    self.assertEqual(res7, None)
    self.assertEqual(res8, None)
    # Token 4 ("second_entity") is a single-token entity2 match: Unit.
    self.assertEqual(res9, UNIT_PREFIX)
def test_preprocess_utterances(self, mocked_parser_stem,
                               mocked_featurizer_stem,
                               mocked_word_cluster):
    """preprocess_utterances must stem text, append builtin/custom
    entity placeholder features and word-cluster tokens."""
    # Given
    language = LANGUAGE_EN

    def _stem(t):
        # Minimal stemmer used by both mocked stem entry points.
        t = normalize(t)
        if t == "beautiful":
            s = "beauty"
        elif t == "birdy":
            s = "bird"
        elif t == "entity":
            s = "ent"
        else:
            s = t
        return s

    def stem_function(text, language):
        return get_default_sep(language).join(
            [_stem(t) for t in tokenize_light(text, language)])

    mocked_word_cluster.return_value = {
        "beautiful": "cluster_1",
        "birdy": "cluster_2",
        "entity": "cluster_3"
    }
    mocked_parser_stem.side_effect = stem_function
    mocked_featurizer_stem.side_effect = stem_function

    dataset = {
        "intents": {
            "intent1": {
                "utterances": []
            }
        },
        "entities": {
            "entity_1": {
                "data": [{
                    "value": "entity 1",
                    "synonyms": ["alternative entity 1"]
                }, {
                    "value": "éntity 1",
                    "synonyms": ["alternative entity 1"]
                }],
                "use_synonyms": False,
                "automatically_extensible": False,
                "matching_strictness": 1.0
            },
            "entity_2": {
                "data": [{
                    "value": "entity 1",
                    "synonyms": []
                }, {
                    "value": "Éntity 2",
                    "synonyms": ["Éntity_2", "Alternative entity 2"]
                }],
                "use_synonyms": True,
                "automatically_extensible": False,
                "matching_strictness": 1.0
            },
            "snips/number": {}
        },
        "language": "en",
    }
    dataset = validate_and_format_dataset(dataset)

    utterances = [
        text_to_utterance("hÉllo wOrld Éntity_2"),
        text_to_utterance("beauTiful World entity 1"),
        text_to_utterance("Bird bïrdy"),
    ]
    # One utterance with an explicitly labeled snips/number chunk.
    labeled_utterance = {
        DATA: [{
            TEXT: "beauTiful éntity "
        }, {
            TEXT: "1",
            ENTITY: "snips/number",
            SLOT_NAME: "number"
        }, {
            TEXT: " bIrd Éntity_2"
        }]
    }
    utterances.append(labeled_utterance)
    labels = np.array([0, 0, 1, 1])

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS)

    featurizer = Featurizer(
        language,
        None,
        custom_entity_parser=custom_entity_parser,
        config=FeaturizerConfig(word_clusters_name="brown_clusters",
                                use_stemming=True)).fit(
        dataset, utterances, labels)

    # When
    utterances = featurizer.preprocess_utterances(utterances)

    # Then
    expected_utterances = [
        "hello world entity_2 builtinentityfeaturesnipsnumber "
        "entityfeatureentity_2",
        "beauty world ent 1 builtinentityfeaturesnipsnumber "
        "entityfeatureentity_1 entityfeatureentity_2 "
        "cluster_1 cluster_3",
        "bird bird",
        "beauty ent bird entity_2 builtinentityfeaturesnipsnumber "
        "builtinentityfeaturesnipsnumber entityfeatureentity_1 "
        "entityfeatureentity_2 entityfeatureentity_2 cluster_1"
    ]
    self.assertListEqual(utterances, expected_utterances)