def test_filter_overlapping_builtins(self):
    # Given
    language = LANGUAGE_EN
    text = "Find me a flight before 10pm and after 8pm"
    tokens = tokenize(text, language)
    tags = ['O' for _ in range(5)] + ['B-flight'] + ['O' for _ in range(3)]
    tagging_scheme = TaggingScheme.BIO
    builtin_entities = [
        {
            RES_MATCH_RANGE: {START: 17, END: 28},
            VALUE: "before 10pm",
            ENTITY_KIND: SNIPS_DATETIME
        },
        {
            RES_MATCH_RANGE: {START: 33, END: 42},
            VALUE: "after 8pm",
            ENTITY_KIND: SNIPS_DATETIME
        }
    ]

    # When
    entities = _filter_overlapping_builtins(
        builtin_entities, tokens, tags, tagging_scheme)

    # Then
    expected_entities = [
        {
            RES_MATCH_RANGE: {START: 33, END: 42},
            VALUE: "after 8pm",
            ENTITY_KIND: SNIPS_DATETIME
        }
    ]
    self.assertEqual(entities, expected_entities)
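# A minimal sketch of the overlap filtering the test above exercises: a
# builtin entity is dropped when its character range overlaps a token that
# already carries a slot tag ("before 10pm" overlaps the B-flight token
# "10pm", so only "after 8pm" survives). This is an illustration, not the
# library's actual implementation: the real function also receives the
# tagging scheme, and OUTSIDE is assumed to be the 'O' tag constant used
# in the other tests.
def _filter_overlapping_builtins_sketch(builtin_entities, tokens, tags):
    kept = []
    for entity in builtin_entities:
        rng = entity[RES_MATCH_RANGE]
        # A token overlaps the entity span iff the two ranges intersect
        overlaps_tagged_token = any(
            tag != OUTSIDE
            and token.start < rng[END] and token.end > rng[START]
            for token, tag in zip(tokens, tags))
        if not overlaps_tagged_token:
            kept.append(entity)
    return kept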
def test_feature_should_work_with_cache(self):
    # Given
    def fn(tokens, token_index):
        value = tokens[token_index].value
        return "%s_%s" % (value, len(value))

    mocked_fn = MagicMock(side_effect=fn)
    cache = [{TOKEN_NAME: token} for token in
             tokenize("hello beautiful world", LANGUAGE_EN)]
    feature = Feature("test_feature", mocked_fn, offset=0)
    feature.compute(2, cache)
    feature1 = Feature("test_feature", mocked_fn, offset=1)
    feature2 = Feature("test_feature", mocked_fn, offset=2)

    # When
    res1 = feature1.compute(1, cache)
    res1_bis = feature1.compute(0, cache)
    res2 = feature2.compute(0, cache)

    # Then
    self.assertEqual(res1, "world_5")
    self.assertEqual(res1_bis, "beautiful_9")
    self.assertEqual(res2, "world_5")
    self.assertEqual(mocked_fn.call_count, 2)
def test_should_compute_features(self):
    # Given
    features_factories = [
        {
            "factory_name": NgramFactory.name,
            "args": {
                "n": 1,
                "use_stemming": False,
                "common_words_gazetteer_name": None
            },
            "offsets": [0],
            "drop_out": 0.3
        },
    ]
    slot_filler_config = CRFSlotFillerConfig(
        feature_factory_configs=features_factories, random_seed=40)
    slot_filler = CRFSlotFiller(slot_filler_config)
    tokens = tokenize("foo hello world bar", LANGUAGE_EN)
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    slot_filler.fit(dataset, intent="dummy_intent_1")

    # When
    features_with_drop_out = slot_filler.compute_features(tokens, True)

    # Then
    expected_features = [
        {"ngram_1": "foo"},
        {},
        {"ngram_1": "world"},
        {},
    ]
    self.assertListEqual(expected_features, features_with_drop_out)
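# Reading the expectation above: passing True to compute_features applies
# the factory's drop_out at feature-computation time, and with drop_out=0.3
# and random_seed=40 the unigram feature happens to be dropped for tokens 1
# and 3, leaving empty feature dicts for "hello" and "bar".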
def test_ngram_factory_with_gazetteer(self, mock_get_gazetteer):
    # Given
    config = {
        "factory_name": "ngram",
        "args": {
            "n": 2,
            "use_stemming": False,
            "common_words_gazetteer_name": "mocked_gazetteer"
        },
        "offsets": [0]
    }
    mock_get_gazetteer.return_value = {"hello", "beautiful", "world"}
    tokens = tokenize("hello beautiful foobar world", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    mocked_dataset = {"language": "en"}
    factory.fit(mocked_dataset, None)
    features = factory.build_features()

    # When
    res = features[0].compute(1, cache)

    # Then
    self.assertIsInstance(factory, NgramFactory)
    self.assertEqual(features[0].base_name, "ngram_2")
    self.assertEqual(res, "beautiful rare_word")
def test_single_feature_factory(self):
    # Given
    class TestSingleFeatureFactory(SingleFeatureFactory):
        def compute_feature(self, tokens, token_index):
            value = tokens[token_index].value
            return "%s_%s" % (value, len(value))

    config = {
        "factory_name": "test_factory",
        "args": {},
        "offsets": [0, 1]
    }
    factory = TestSingleFeatureFactory(config)
    factory.fit(None, None)
    features = factory.build_features()
    cache = [{TOKEN_NAME: token} for token in
             tokenize("hello beautiful world", LANGUAGE_EN)]

    # When
    res_0 = features[0].compute(0, cache)
    res_1 = features[1].compute(0, cache)

    # Then
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].name, "test_factory")
    self.assertEqual(features[1].name, "test_factory[+1]")
    self.assertEqual(res_0, "hello_5")
    self.assertEqual(res_1, "beautiful_9")
def get_slots(self, text):
    """Extracts slots from the provided text

    Returns:
        list of dict: The list of extracted slots

    Raises:
        NotTrained: When the slot filler is not fitted
    """
    if not self.fitted:
        raise NotTrained("CRFSlotFiller must be fitted")
    tokens = tokenize(text, self.language)
    if not tokens:
        return []
    features = self.compute_features(tokens)
    tags = [_decode_tag(tag) for tag in
            self.crf_model.predict_single(features)]
    slots = tags_to_slots(text, tokens, tags, self.config.tagging_scheme,
                          self.slot_name_mapping)

    builtin_slots_names = set(
        slot_name for (slot_name, entity) in
        iteritems(self.slot_name_mapping)
        if is_builtin_entity(entity))
    if not builtin_slots_names:
        return slots

    # Replace tags corresponding to builtin entities by outside tags
    tags = _replace_builtin_tags(tags, builtin_slots_names)
    return self._augment_slots(text, tokens, tags, builtin_slots_names)
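# Usage sketch for get_slots, assuming a slot filler fitted as in the tests
# above (SAMPLE_DATASET and "dummy_intent_1" come from the test fixtures):
#
#     slot_filler = CRFSlotFiller(CRFSlotFillerConfig(random_seed=42))
#     dataset = validate_and_format_dataset(SAMPLE_DATASET)
#     slot_filler.fit(dataset, intent="dummy_intent_1")
#     slots = slot_filler.get_slots("find me a flight")
#     # -> list of slot dicts, each carrying a match range, value, entity
#     #    and slot name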
def test_space_should_be_ignored(self):
    # Given
    text = " "

    for language in get_all_languages():
        # When
        tokens = tokenize(text, language)

        # Then
        self.assertEqual(len(tokens), 0)
def _deduplicate_overlapping_slots(slots, language):
    deduplicated_slots = []
    for slot in slots:
        is_overlapping = False
        for slot_index, dedup_slot in enumerate(deduplicated_slots):
            if ranges_overlap(slot[RES_MATCH_RANGE],
                              dedup_slot[RES_MATCH_RANGE]):
                is_overlapping = True
                tokens = tokenize(slot[RES_VALUE], language)
                dedup_tokens = tokenize(dedup_slot[RES_VALUE], language)
                if len(tokens) > len(dedup_tokens):
                    deduplicated_slots[slot_index] = slot
                elif len(tokens) == len(dedup_tokens) \
                        and len(slot[RES_VALUE]) > len(dedup_slot[RES_VALUE]):
                    deduplicated_slots[slot_index] = slot
        if not is_overlapping:
            deduplicated_slots.append(slot)
    return deduplicated_slots
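# Worked example of the precedence rules above: overlapping slots are
# resolved in favor of the slot with more tokens, then the longer string.
# The dicts below are minimal stand-ins carrying only the keys the function
# reads.
#
#     slot_a = {RES_MATCH_RANGE: {START: 0, END: 9}, RES_VALUE: "blue bird"}
#     slot_b = {RES_MATCH_RANGE: {START: 5, END: 9}, RES_VALUE: "bird"}
#     _deduplicate_overlapping_slots([slot_b, slot_a], LANGUAGE_EN)
#     # -> [slot_a]: "blue bird" spans two tokens, "bird" only one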
def test_builtin_entity_match_factory(self, mock_supported_entities):
    # Given
    def mocked_supported_entities(language):
        if language == LANGUAGE_EN:
            return {SNIPS_NUMBER, SNIPS_DATETIME}
        return set()

    mock_supported_entities.side_effect = mocked_supported_entities
    config = {
        "factory_name": "builtin_entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
        },
        "offsets": [0]
    }
    tokens = tokenize("one tea tomorrow at 2pm", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    mocked_dataset = {"language": "en"}
    factory.fit(mocked_dataset, None)

    # When
    features = factory.build_features()
    features = sorted(features, key=lambda f: f.base_name)
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)
    res3 = features[0].compute(3, cache)
    res4 = features[0].compute(4, cache)
    res5 = features[1].compute(0, cache)
    res6 = features[1].compute(1, cache)
    res7 = features[1].compute(2, cache)
    res8 = features[1].compute(3, cache)
    res9 = features[1].compute(4, cache)

    # Then
    self.assertIsInstance(factory, BuiltinEntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name,
                     "builtin_entity_match_snips/datetime")
    self.assertEqual(features[1].base_name,
                     "builtin_entity_match_snips/number")
    self.assertEqual(res0, UNIT_PREFIX)
    self.assertEqual(res1, None)
    self.assertEqual(res2, BEGINNING_PREFIX)
    self.assertEqual(res3, INSIDE_PREFIX)
    self.assertEqual(res4, LAST_PREFIX)
    self.assertEqual(res5, UNIT_PREFIX)
    self.assertEqual(res6, None)
    self.assertEqual(res7, None)
    self.assertEqual(res8, None)
    self.assertEqual(res9, None)
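# Note on the expectations above: "one" is matched by both builtin parsers,
# as a standalone number and as a datetime, hence the UNIT_PREFIX at token 0
# for both features, while "tomorrow at 2pm" yields a B/I/L span for the
# datetime feature only.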
def test_should_tokenize_empty_string(self):
    # Given
    language = LANGUAGE_EN
    text = ""

    # When
    tokens = tokenize(text, language)

    # Then
    self.assertListEqual(tokens, [])
def test_should_tokenize_only_white_spaces(self):
    # Given
    text = " "
    language = LANGUAGE_EN

    # When
    tokens = tokenize(text, language)

    # Then
    self.assertListEqual(tokens, [])
def utterance_to_sample(query_data, tagging_scheme, language):
    tokens, tags = [], []
    current_length = 0
    for chunk in query_data:
        chunk_tokens = tokenize(chunk[TEXT], language)
        tokens += [Token(t.value, current_length + t.start,
                         current_length + t.end) for t in chunk_tokens]
        current_length += len(chunk[TEXT])
        if SLOT_NAME not in chunk:
            tags += negative_tagging(len(chunk_tokens))
        else:
            tags += positive_tagging(tagging_scheme, chunk[SLOT_NAME],
                                     len(chunk_tokens))
    return {TOKENS: tokens, TAGS: tags}
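# Shape sketch for utterance_to_sample; the chunk values below are
# illustrative, and the 'B-item' tag assumes the BIO scheme with the same
# "B-" beginning prefix seen in the tests above:
#
#     query_data = [{TEXT: "find me a "},
#                   {TEXT: "flight", SLOT_NAME: "item"}]
#     utterance_to_sample(query_data, TaggingScheme.BIO, LANGUAGE_EN)
#     # -> {TOKENS: tokens of "find me a flight" with offsets computed
#     #     over the concatenated text,
#     #     TAGS: ['O', 'O', 'O', 'B-item']}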
def test_entity_match_factory(self):
    # Given
    config = {
        "factory_name": "entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
            "use_stemming": False
        },
        "offsets": [0]
    }
    tokens = tokenize("2 dummy a and dummy_c", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    dataset = deepcopy(SAMPLE_DATASET)
    dataset = validate_and_format_dataset(dataset)
    factory.fit(dataset, "dummy_intent_1")

    # When
    features = factory.build_features()
    features = sorted(features, key=lambda f: f.base_name)
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)
    res3 = features[0].compute(3, cache)
    res4 = features[0].compute(4, cache)
    res5 = features[1].compute(0, cache)
    res6 = features[1].compute(1, cache)
    res7 = features[1].compute(2, cache)
    res8 = features[1].compute(3, cache)
    res9 = features[1].compute(4, cache)

    # Then
    self.assertIsInstance(factory, EntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
    self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
    self.assertEqual(res0, BEGINNING_PREFIX)
    self.assertEqual(res1, INSIDE_PREFIX)
    self.assertEqual(res2, LAST_PREFIX)
    self.assertEqual(res3, None)
    self.assertEqual(res4, None)
    self.assertEqual(res5, None)
    self.assertEqual(res6, None)
    self.assertEqual(res7, None)
    self.assertEqual(res8, None)
    self.assertEqual(res9, UNIT_PREFIX)
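# Note on the expectations above: "2 dummy a" matches a value of
# dummy_entity_1 in the sample dataset, giving a B/I/L span on the first
# feature, while "dummy_c" is a single-token match of dummy_entity_2,
# giving a U tag on the second feature.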
def test_should_tokenize_symbols(self):
    # Given
    language = LANGUAGE_EN
    text = "$$ % !!"

    # When
    tokens = tokenize(text, language)

    # Then
    expected_tokens = [
        Token(value='$$', start=0, end=2, stem=None),
        Token(value='%', start=3, end=4, stem=None),
        Token(value='!!', start=5, end=7, stem=None)
    ]
    self.assertListEqual(tokens, expected_tokens)
def test_should_tokenize_literals(self):
    # Given
    language = LANGUAGE_EN
    text = "Hello Beautiful World"

    # When
    tokens = tokenize(text, language)

    # Then
    expected_tokens = [
        Token(value='Hello', start=0, end=5, stem=None),
        Token(value='Beautiful', start=6, end=15, stem=None),
        Token(value='World', start=16, end=21, stem=None)
    ]
    self.assertListEqual(tokens, expected_tokens)
def test_feature_should_work_with_offset(self):
    # Given
    def fn(tokens, token_index):
        value = tokens[token_index].value
        return "%s_%s" % (value, len(value))

    cache = [{TOKEN_NAME: token} for token in
             tokenize("hello beautiful world", LANGUAGE_EN)]
    feature = Feature("test_feature", fn, offset=1)

    # When
    res = feature.compute(1, cache)

    # Then
    self.assertEqual(res, "world_5")
def test_length_factory(self):
    # Given
    config = {
        "factory_name": "length",
        "args": {},
        "offsets": [0]
    }
    tokens = tokenize("hello beautiful world", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    factory.fit(None, None)
    features = factory.build_features()

    # When
    res = features[0].compute(2, cache)

    # Then
    self.assertIsInstance(factory, LengthFactory)
    self.assertEqual(features[0].base_name, "length")
    self.assertEqual(res, "5")
def _load_gazetteers(gazetteers_path, language):
    if not gazetteers_path.is_dir():
        return dict()
    gazetteers = dict()
    for filepath in gazetteers_path.iterdir():
        gazetteer_name = filepath.stem
        with filepath.open(encoding="utf8") as f:
            gazetteers[gazetteer_name] = set()
            for line in f:
                normalized = normalize(line.strip())
                if normalized:
                    token_values = (t.value for t in
                                    tokenize(normalized, language))
                    normalized = get_default_sep(language).join(token_values)
                    gazetteers[gazetteer_name].add(normalized)
    return gazetteers
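# Usage sketch for the pathlib-based loader above; the directory layout and
# gazetteer names are hypothetical. Each gazetteer file is plain text with
# one entry per line:
#
#     from pathlib import Path
#     gazetteers = _load_gazetteers(Path("resources/en/gazetteers"),
#                                   LANGUAGE_EN)
#     # -> e.g. {"cities": {"new york", "san francisco"}, ...} with every
#     #    entry normalized and re-joined with the language's default
#     #    separator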
def _load_gazetteers(language):
    gazetteers_paths = {
        os.path.splitext(name)[0]: os.path.join(
            get_resources_path(language), name)
        for name in RESOURCE_INDEX[language].get(GAZETTEERS, [])
    }
    gazetteers = dict()
    for name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            gazetteers[name] = set()
            for line in f:
                normalized = normalize(line.strip())
                if normalized:
                    normalized = get_ignored_characters_pattern(
                        language).join(
                        [t.value for t in tokenize(normalized, language)])
                    gazetteers[name].add(normalized)
    _RESOURCES[language][GAZETTEERS] = gazetteers
def _load_gazetteers(language):
    gazetteers_paths = {
        os.path.splitext(name)[0]: os.path.join(
            get_resources_path(language), name)
        for name in RESOURCE_INDEX[language].get(GAZETTEERS, [])
    }
    gazetteers = dict()
    for name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            gazetteers[name] = set()
            for line in f:
                normalized = normalize(line.strip())
                if normalized:
                    normalized = get_ignored_characters_pattern(
                        language).join(
                        [t.value for t in tokenize(normalized, language)])
                    gazetteers[name].add(normalized)
    return gazetteers
def test_is_first_factory(self):
    # Given
    config = {
        "factory_name": "is_first",
        "args": {},
        "offsets": [0]
    }
    tokens = tokenize("hello beautiful world", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    factory.fit(None, None)
    features = factory.build_features()

    # When
    res1 = features[0].compute(0, cache)
    res2 = features[0].compute(1, cache)

    # Then
    self.assertIsInstance(factory, IsFirstFactory)
    self.assertEqual(features[0].base_name, "is_first")
    self.assertEqual(res1, "1")
    self.assertEqual(res2, None)
def test_word_cluster_factory(self, mock_get_word_clusters):
    # Given
    def mocked_get_word_clusters(language):
        if language == LANGUAGE_EN:
            return {
                "mocked_cluster": {
                    "word1": "00",
                    "word2": "11"
                }
            }
        return dict()

    mock_get_word_clusters.side_effect = mocked_get_word_clusters
    config = {
        "factory_name": "word_cluster",
        "args": {
            "cluster_name": "mocked_cluster",
            "use_stemming": False
        },
        "offsets": [0]
    }
    tokens = tokenize("hello word1 word2", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    mocked_dataset = {"language": "en"}
    factory.fit(mocked_dataset, None)
    features = factory.build_features()

    # When
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)

    # Then
    self.assertIsInstance(factory, WordClusterFactory)
    self.assertEqual(features[0].base_name, "word_cluster_mocked_cluster")
    self.assertEqual(res0, None)
    self.assertEqual(res1, "00")
    self.assertEqual(res2, "11")
def _replace_tokenized_out_characters(string, language,
                                      replacement_char=" "):
    """Replace all characters that are tokenized out by `replacement_char`

    Examples:
        >>> string = "hello, it's me"
        >>> language = "en"
        >>> tokenize_light(string, language)
        ['hello', 'it', 's', 'me']
        >>> _replace_tokenized_out_characters(string, language, "_")
        'hello__it_s_me'
    """
    tokens = tokenize(string, language)
    current_idx = 0
    cleaned_string = ""
    for token in tokens:
        # Fill the gap between the previous token and this one
        prefix_length = token.start - current_idx
        cleaned_string += replacement_char * prefix_length
        cleaned_string += token.value
        current_idx = token.end
    # Fill any trailing gap after the last token
    suffix_length = len(string) - current_idx
    cleaned_string += replacement_char * suffix_length
    return cleaned_string
def test_shape_ngram_factory(self):
    # Given
    config = {
        "factory_name": "shape_ngram",
        "args": {
            "n": 3,
        },
        "offsets": [0]
    }
    tokens = tokenize("hello Beautiful foObar world", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    mocked_dataset = {"language": "en"}
    factory.fit(mocked_dataset, None)
    features = factory.build_features()

    # When
    res = features[0].compute(1, cache)

    # Then
    self.assertIsInstance(factory, ShapeNgramFactory)
    self.assertEqual(features[0].base_name, "shape_ngram_3")
    self.assertEqual(res, "Xxx xX xxx")
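# Note on the expectation above: each token maps to a shape string
# ("Beautiful" -> "Xxx" for title case, "foObar" -> "xX" for mixed case,
# "world" -> "xxx" for lowercase), and the trigram feature computed at
# index 1 joins the shapes of tokens 1 through 3.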
def test_bilou_tags_to_slots(self):
    # Given
    language = LANGUAGE_EN
    slot_name = "animal"
    intent_slots_mapping = {"animal": "animal"}
    tags = [
        {
            "text": "",
            "tags": [],
            "expected_slots": []
        },
        {
            "text": "nothing here",
            "tags": [OUTSIDE, OUTSIDE],
            "expected_slots": []
        },
        {
            "text": "i am a blue bird",
            "tags": [
                OUTSIDE, OUTSIDE, OUTSIDE,
                BEGINNING_PREFIX + slot_name,
                LAST_PREFIX + slot_name
            ],
            "expected_slots": [
                unresolved_slot(match_range=(7, 16), value="blue bird",
                                entity=slot_name, slot_name=slot_name)
            ]
        },
        {
            "text": "i am a bird",
            "tags": [OUTSIDE, OUTSIDE, OUTSIDE, UNIT_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(7, 11), value="bird",
                                entity=slot_name, slot_name=slot_name)
            ]
        },
        {
            "text": "bird",
            "tags": [UNIT_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(0, 4), value="bird",
                                entity=slot_name, slot_name=slot_name)
            ]
        },
        {
            "text": "blue bird",
            "tags": [BEGINNING_PREFIX + slot_name,
                     LAST_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(0, 9), value="blue bird",
                                entity=slot_name, slot_name=slot_name)
            ]
        },
        {
            "text": "light blue bird blue bird",
            "tags": [
                BEGINNING_PREFIX + slot_name,
                INSIDE_PREFIX + slot_name,
                LAST_PREFIX + slot_name,
                BEGINNING_PREFIX + slot_name,
                LAST_PREFIX + slot_name
            ],
            "expected_slots": [
                unresolved_slot(match_range=(0, 15),
                                value="light blue bird",
                                entity=slot_name, slot_name=slot_name),
                unresolved_slot(match_range=(16, 25), value="blue bird",
                                entity=slot_name, slot_name=slot_name)
            ]
        },
        {
            "text": "bird birdy",
            "tags": [UNIT_PREFIX + slot_name, UNIT_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(0, 4), value="bird",
                                entity=slot_name, slot_name=slot_name),
                unresolved_slot(match_range=(5, 10), value="birdy",
                                entity=slot_name, slot_name=slot_name)
            ]
        },
        {
            "text": "light bird bird blue bird",
            "tags": [
                BEGINNING_PREFIX + slot_name,
                INSIDE_PREFIX + slot_name,
                UNIT_PREFIX + slot_name,
                BEGINNING_PREFIX + slot_name,
                INSIDE_PREFIX + slot_name
            ],
            "expected_slots": [
                unresolved_slot(match_range=(0, 10), value="light bird",
                                entity=slot_name, slot_name=slot_name),
                unresolved_slot(match_range=(11, 15), value="bird",
                                entity=slot_name, slot_name=slot_name),
                unresolved_slot(match_range=(16, 25), value="blue bird",
                                entity=slot_name, slot_name=slot_name)
            ]
        },
        {
            "text": "bird bird bird",
            "tags": [
                LAST_PREFIX + slot_name,
                BEGINNING_PREFIX + slot_name,
                UNIT_PREFIX + slot_name
            ],
            "expected_slots": [
                unresolved_slot(match_range=(0, 4), value="bird",
                                entity=slot_name, slot_name=slot_name),
                unresolved_slot(match_range=(5, 9), value="bird",
                                entity=slot_name, slot_name=slot_name),
                unresolved_slot(match_range=(10, 14), value="bird",
                                entity=slot_name, slot_name=slot_name)
            ]
        },
    ]

    for data in tags:
        # When
        slots = tags_to_slots(data["text"],
                              tokenize(data["text"], language),
                              data["tags"], TaggingScheme.BILOU,
                              intent_slots_mapping)

        # Then
        self.assertEqual(slots, data["expected_slots"])
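# BILOU recap for the cases above: B- begins a multi-token slot, I- is
# inside it, L- is its last token, U- marks a single-token slot, and O is
# outside any slot. The final cases check that inconsistent sequences
# (e.g. a leading L- tag, or a B-/I- pair with no closing L-) are still
# decoded into sensible slots.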
def test_augment_slots(self):
    # Given
    language = LANGUAGE_EN
    text = "Find me a flight before 10pm and after 8pm"
    tokens = tokenize(text, language)
    missing_slots = {"start_date", "end_date"}
    tags = ['O' for _ in tokens]

    def mocked_sequence_probability(_, tags_):
        tags_1 = ['O', 'O', 'O', 'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX,
                  'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX]
        tags_2 = ['O', 'O', 'O', 'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX,
                  'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX]
        tags_3 = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
        tags_4 = ['O', 'O', 'O', 'O', 'O', 'O', 'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX]
        tags_5 = ['O', 'O', 'O', 'O', 'O', 'O', 'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX]
        tags_6 = ['O', 'O', 'O', 'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX,
                  'O', 'O', 'O']
        tags_7 = ['O', 'O', 'O', 'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX,
                  'O', 'O', 'O']
        tags_8 = ['O', 'O', 'O', 'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX,
                  'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX]
        tags_9 = ['O', 'O', 'O', 'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX,
                  'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX]

        if tags_ == tags_1:
            return 0.6
        elif tags_ == tags_2:
            return 0.8
        elif tags_ == tags_3:
            return 0.2
        elif tags_ == tags_4:
            return 0.2
        elif tags_ == tags_5:
            return 0.99
        elif tags_ == tags_6:
            return 0.0
        elif tags_ == tags_7:
            return 0.0
        elif tags_ == tags_8:
            return 0.5
        elif tags_ == tags_9:
            return 0.5
        else:
            raise ValueError("Unexpected tag sequence: %s" % tags_)

    slot_filler_config = CRFSlotFillerConfig(random_seed=42)
    slot_filler = CRFSlotFiller(config=slot_filler_config)
    slot_filler.language = LANGUAGE_EN
    slot_filler.intent = "intent1"
    slot_filler.slot_name_mapping = {
        "start_date": "snips/datetime",
        "end_date": "snips/datetime",
    }

    # pylint:disable=protected-access
    slot_filler._get_sequence_probability = MagicMock(
        side_effect=mocked_sequence_probability)
    # pylint:enable=protected-access

    slot_filler.compute_features = MagicMock(return_value=None)

    # When
    # pylint: disable=protected-access
    augmented_slots = slot_filler._augment_slots(text, tokens, tags,
                                                 missing_slots)
    # pylint: enable=protected-access

    # Then
    expected_slots = [
        unresolved_slot(value='after 8pm',
                        match_range={START: 33, END: 42},
                        entity='snips/datetime',
                        slot_name='end_date')
    ]
    self.assertListEqual(augmented_slots, expected_slots)
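# Reading the mock above: _augment_slots rescores candidate taggings of the
# spans where builtin entities were found, using the CRF's sequence
# probability, and keeps the best-scoring assignment. Here tags_5 (only
# "after 8pm" tagged as end_date) scores 0.99, so a single end_date slot
# is returned.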