def test_should_compute_tokenization_shift(self):
    # Given
    # Note the double leading space and the triple space before "world":
    # they are what produce the -2 and -3 shifts below
    text = "  hello?   world"
    tokens = tokenize(text, "en")

    # When
    shifts = _compute_char_shifts(tokens)

    # Then
    expected_shifts = [-2, -2, -2, -2, -2, -1, -1, -3, -3, -3, -3, -3, -3]
    self.assertListEqual(expected_shifts, shifts)

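# `_compute_char_shifts` itself does not appear in this section; the sketch
# below is a minimal reconstruction consistent with the expected values of
# the test above, not necessarily the library's exact implementation. For
# each character of the "cleaned" text (token values joined with single
# spaces), it records the shift to subtract in order to map that character
# back onto its index in the original text.
def _compute_char_shifts(tokens):
    shifts = []
    current_shift = 0
    for i, token in enumerate(tokens):
        previous_end = tokens[i - 1].end if i else 0
        previous_space = 1 if i else 0
        # Characters dropped (or collapsed) between the previous token and
        # this one, compared with the single space of the cleaned text
        current_shift -= (token.start - previous_end) - previous_space
        shifts += [current_shift] * (token.end - token.start + previous_space)
    return shifts
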
def _parse(self, text, scope):
    tokens = tokenize(text, self.language)
    shifts = _compute_char_shifts(tokens)
    cleaned_text = " ".join(token.value for token in tokens)

    entities = self._parser.parse(cleaned_text, scope)
    for entity in entities:
        start = entity[RES_MATCH_RANGE][START]
        end = entity[RES_MATCH_RANGE][END]
        entity[ENTITY_KIND] = entity.pop("entity_identifier")
        entity[RES_MATCH_RANGE][START] -= shifts[start]
        entity[RES_MATCH_RANGE][END] -= shifts[end - 1]
    return entities

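# Worked example of the range correction above, reusing the shifts from the
# tokenization test: for text "  hello?   world" the cleaned text is
# "hello ? world" and shifts == [-2] * 5 + [-1] * 2 + [-3] * 6. A parser
# match on "world" spans [8, 13) in the cleaned text and is mapped back to
# [8 - shifts[8], 13 - shifts[12]) == [11, 16) in the original text.
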
def test_entity_match_factory(self):
    # Given
    config = {
        "factory_name": "entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
            "use_stemming": True
        },
        "offsets": [0]
    }
    tokens = tokenize("2 dummy a had dummy_c", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    dataset = deepcopy(SAMPLE_DATASET)
    dataset = validate_and_format_dataset(dataset)
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS)
    factory.fit(dataset, "dummy_intent_1")

    # When
    features = factory.build_features(
        custom_entity_parser=custom_entity_parser)
    features = sorted(features, key=lambda f: f.base_name)
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)
    res3 = features[0].compute(3, cache)
    res4 = features[0].compute(4, cache)
    res5 = features[1].compute(0, cache)
    res6 = features[1].compute(1, cache)
    res7 = features[1].compute(2, cache)
    res8 = features[1].compute(3, cache)
    res9 = features[1].compute(4, cache)

    # Then
    self.assertIsInstance(factory, CustomEntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
    self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
    self.assertEqual(res0, BEGINNING_PREFIX)
    self.assertEqual(res1, INSIDE_PREFIX)
    self.assertEqual(res2, LAST_PREFIX)
    self.assertEqual(res3, None)
    self.assertEqual(res4, None)
    self.assertEqual(res5, None)
    self.assertEqual(res6, None)
    self.assertEqual(res7, None)
    self.assertEqual(res8, None)
    self.assertEqual(res9, UNIT_PREFIX)

def test_custom_multi_feature_factory(self):
    # Given
    # pylint:disable=unused-variable
    @CRFFeatureFactory.register("my_multi_feature_factory", override=True)
    class MyMultiFeature(CRFFeatureFactory):
        def build_features(self):
            first_features = [
                Feature("my_first_feature", self.compute_feature_1,
                        offset=offset)
                for offset in self.offsets
            ]
            second_features = [
                Feature("my_second_feature", self.compute_feature_2,
                        offset=offset)
                for offset in self.offsets
            ]
            return first_features + second_features

        @staticmethod
        def compute_feature_1(tokens, token_index):
            return "(%s)[my_feature_1]" % tokens[token_index].value

        @staticmethod
        def compute_feature_2(tokens, token_index):
            return "(%s)[my_feature_2]" % tokens[token_index].value

    # pylint:enable=unused-variable

    # When
    config = {
        "factory_name": "my_multi_feature_factory",
        "args": {},
        "offsets": [-1, 0]
    }
    feature_factory = CRFFeatureFactory.from_config(config)
    features = feature_factory.build_features()
    feature_0 = features[0]
    feature_1 = features[1]
    feature_2 = features[2]
    feature_3 = features[3]
    tokens = tokenize("foo bar baz", "en")
    cache = [{TOKEN_NAME: token} for token in tokens]

    # Then
    self.assertEqual("my_first_feature[-1]", feature_0.name)
    self.assertEqual("(foo)[my_feature_1]", feature_0.compute(1, cache))
    self.assertEqual("my_first_feature", feature_1.name)
    self.assertEqual("my_second_feature[-1]", feature_2.name)
    self.assertEqual("(bar)[my_feature_2]", feature_2.compute(2, cache))
    self.assertEqual("my_second_feature", feature_3.name)

def get_slots(self, text):
    tokens = tokenize(text, self.language)
    slots = []
    for token in tokens:
        normalized_value = token.value
        if self.config.get("lowercase", False):
            normalized_value = normalized_value.lower()
        if normalized_value in self.slots_keywords:
            entity = self.slots_keywords[normalized_value][0]
            slot_name = self.slots_keywords[normalized_value][1]
            slot = unresolved_slot((token.start, token.end), token.value,
                                   entity, slot_name)
            slots.append(slot)
    return slots

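# A minimal usage sketch for the keyword-based slot filler above. The
# `slots_keywords` mapping and the "lowercase" config key come from the
# method itself; the concrete values below are purely illustrative:
#
#   slot_filler.slots_keywords = {"birthday": ("event_type", "event")}
#   slot_filler.config = {"lowercase": True}
#   slot_filler.get_slots("Wish me a happy Birthday")
#   # -> [unresolved_slot((16, 24), "Birthday", "event_type", "event")]
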
def test_should_tokenize_literals(self):
    # Given
    language = LANGUAGE_EN
    text = "Hello Beautiful World"

    # When
    tokens = tokenize(text, language)

    # Then
    expected_tokens = [
        Token(value='Hello', start=0, end=5),
        Token(value='Beautiful', start=6, end=15),
        Token(value='World', start=16, end=21)
    ]
    self.assertListEqual(tokens, expected_tokens)

def test_should_tokenize_symbols(self):
    # Given
    language = LANGUAGE_EN
    text = "$$ % !!"

    # When
    tokens = tokenize(text, language)

    # Then
    expected_tokens = [
        Token(value='$$', start=0, end=2),
        Token(value='%', start=3, end=4),
        Token(value='!!', start=5, end=7)
    ]
    self.assertListEqual(tokens, expected_tokens)

def test_length_factory(self):
    # Given
    config = {"factory_name": "length", "args": {}, "offsets": [0]}
    tokens = tokenize("hello beautiful world", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = CRFFeatureFactory.from_config(config)
    factory.fit(None, None)
    features = factory.build_features()

    # When
    res = features[0].compute(2, cache)

    # Then
    self.assertIsInstance(factory, LengthFactory)
    self.assertEqual(features[0].base_name, "length")
    self.assertEqual(res, "5")

def test_feature_should_work_with_offset(self):
    # Given
    def fn(tokens, token_index):
        value = tokens[token_index].value
        return "%s_%s" % (value, len(value))

    cache = [{TOKEN_NAME: token}
             for token in tokenize("hello beautiful world", LANGUAGE_EN)]
    feature = Feature("test_feature", fn, offset=1)

    # When
    res = feature.compute(1, cache)

    # Then
    self.assertEqual(res, "world_5")

def _preprocess_text(self, string):
    """Replaces stop words and characters that are tokenized out with
    whitespace, preserving the length of the string"""
    tokens = tokenize(string, self.language)
    current_idx = 0
    cleaned_string = ""
    for token in tokens:
        if self.stop_words and normalize_token(token) in self.stop_words:
            token.value = "".join(" " for _ in range(len(token.value)))
        prefix_length = token.start - current_idx
        cleaned_string += "".join((" " for _ in range(prefix_length)))
        cleaned_string += token.value
        current_idx = token.end
    suffix_length = len(string) - current_idx
    cleaned_string += "".join((" " for _ in range(suffix_length)))
    return cleaned_string

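# Worked example (assuming "a" is in `self.stop_words` and that "hello" and
# "world" are kept as tokens):
#
#   self._preprocess_text("hello a world")  ->  "hello   world"
#
# The output always has the same length as the input, so match ranges
# computed on the cleaned string remain valid on the original one.
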
def test_should_compute_features(self):
    # Given
    features_factories = [
        {
            "factory_name": NgramFactory.name,
            "args": {
                "n": 1,
                "use_stemming": False,
                "common_words_gazetteer_name": None
            },
            "offsets": [0],
            "drop_out": 0.3
        },
    ]
    slot_filler_config = CRFSlotFillerConfig(
        feature_factory_configs=features_factories, random_seed=40)
    tokens = tokenize("foo hello world bar", LANGUAGE_EN)
    dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](second_entity)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset,
                                  CustomEntityParserUsage.WITHOUT_STEMS)
    slot_filler = CRFSlotFiller(slot_filler_config, **shared)
    slot_filler.fit(dataset, intent="my_intent")

    # When
    features_with_drop_out = slot_filler.compute_features(tokens, True)

    # Then
    expected_features = [
        {"ngram_1": "foo"},
        {},
        {"ngram_1": "world"},
        {},
    ]
    self.assertListEqual(expected_features, features_with_drop_out)

def test_is_first_factory(self):
    # Given
    config = {"factory_name": "is_first", "args": {}, "offsets": [0]}
    tokens = tokenize("hello beautiful world", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = CRFFeatureFactory.from_config(config)
    factory.fit(None, None)
    features = factory.build_features()

    # When
    res1 = features[0].compute(0, cache)
    res2 = features[0].compute(1, cache)

    # Then
    self.assertIsInstance(factory, IsFirstFactory)
    self.assertEqual(features[0].base_name, "is_first")
    self.assertEqual(res1, "1")
    self.assertEqual(res2, None)

def _parse(self, text, scope=None):
    tokens = tokenize(text, self.language)
    shifts = _compute_char_shifts(tokens)
    cleaned_text = " ".join(token.value for token in tokens)

    entities = self._parser.parse(cleaned_text, scope)
    result = []
    for entity in entities:
        start = entity["range"]["start"]
        start -= shifts[start]
        end = entity["range"]["end"]
        end -= shifts[end - 1]
        entity_range = {START: start, END: end}
        ent = parsed_entity(
            entity_kind=entity["entity_identifier"],
            entity_value=entity["value"],
            entity_resolved_value=entity["resolved_value"],
            entity_range=entity_range)
        result.append(ent)
    return result

def get_slots(self, text): """Extracts slots from the provided text Returns: list of dict: The list of extracted slots Raises: NotTrained: When the slot filler is not fitted """ if not self.slot_name_mapping: # Early return if the intent has no slots return [] tokens = tokenize(text, self.language) if not tokens: return [] features = self.compute_features(tokens) tags = [_decode_tag(tag) for tag in self.crf_model.predict_single(features)] return tags_to_slots(text, tokens, tags, self.config.tagging_scheme, self.slot_name_mapping)
def test_suffix_factory(self):
    # Given
    config = {
        "factory_name": "suffix",
        "args": {
            "suffix_size": 2
        },
        "offsets": [0]
    }
    tokens = tokenize("hello beautiful world", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    factory.fit(None, None)
    features = factory.build_features()

    # When
    res = features[0].compute(1, cache)

    # Then
    self.assertIsInstance(factory, SuffixFactory)
    self.assertEqual(features[0].base_name, "suffix_2")
    self.assertEqual(res, "ul")

def test_word_cluster_factory(self):
    # Given
    resources = {
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "word1": "00",
                "word2": "11"
            }
        }
    }
    config = {
        "factory_name": "word_cluster",
        "args": {
            "cluster_name": "my_word_clusters",
            "use_stemming": False
        },
        "offsets": [0]
    }
    tokens = tokenize("hello word1 word2", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = CRFFeatureFactory.from_config(config, resources=resources)
    mocked_dataset = {"language": "en"}
    factory.fit(mocked_dataset, None)
    features = factory.build_features()

    # When
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)

    # Then
    self.assertIsInstance(factory, WordClusterFactory)
    self.assertEqual(features[0].base_name,
                     "word_cluster_my_word_clusters")
    self.assertEqual(res0, None)
    self.assertEqual(res1, "00")
    self.assertEqual(res2, "11")

def test_filter_overlapping_builtins(self):
    # Given
    language = LANGUAGE_EN
    text = "Find me a flight before 10pm and after 8pm"
    tokens = tokenize(text, language)
    tags = ['O' for _ in range(5)] + ['B-flight'] \
           + ['O' for _ in range(3)]
    tagging_scheme = TaggingScheme.BIO
    builtin_entities = [
        {
            RES_MATCH_RANGE: {START: 17, END: 28},
            VALUE: "before 10pm",
            ENTITY_KIND: SNIPS_DATETIME
        },
        {
            RES_MATCH_RANGE: {START: 33, END: 42},
            VALUE: "after 8pm",
            ENTITY_KIND: SNIPS_DATETIME
        }
    ]

    # When
    entities = _filter_overlapping_builtins(builtin_entities, tokens, tags,
                                            tagging_scheme)

    # Then
    expected_entities = [{
        RES_MATCH_RANGE: {START: 33, END: 42},
        VALUE: "after 8pm",
        ENTITY_KIND: SNIPS_DATETIME
    }]
    self.assertEqual(entities, expected_entities)

def test_should_compute_features(self):
    # Given
    features_factories = [
        {
            "factory_name": NgramFactory.name,
            "args": {
                "n": 1,
                "use_stemming": False,
                "common_words_gazetteer_name": None
            },
            "offsets": [0],
            "drop_out": 0.3
        },
    ]
    slot_filler_config = CRFSlotFillerConfig(
        feature_factory_configs=features_factories, random_seed=40)
    slot_filler = CRFSlotFiller(slot_filler_config)
    tokens = tokenize("foo hello world bar", LANGUAGE_EN)
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    slot_filler.fit(dataset, intent="dummy_intent_1")

    # When
    features_with_drop_out = slot_filler.compute_features(tokens, True)

    # Then
    expected_features = [
        {"ngram_1": "foo"},
        {},
        {"ngram_1": "world"},
        {},
    ]
    self.assertListEqual(expected_features, features_with_drop_out)

def _replace_tokenized_out_characters(string, language,
                                      replacement_char=" "):
    """Replace all characters that are tokenized out by `replacement_char`

    Examples:
        >>> string = "hello, it's me"
        >>> language = "en"
        >>> tokenize_light(string, language)
        ['hello', 'it', 's', 'me']
        >>> _replace_tokenized_out_characters(string, language, "_")
        'hello__it_s_me'
    """
    tokens = tokenize(string, language)
    current_idx = 0
    cleaned_string = ""
    for token in tokens:
        prefix_length = token.start - current_idx
        cleaned_string += "".join(
            (replacement_char for _ in range(prefix_length)))
        cleaned_string += token.value
        current_idx = token.end
    suffix_length = len(string) - current_idx
    cleaned_string += "".join(
        (replacement_char for _ in range(suffix_length)))
    return cleaned_string

def test_word_cluster_factory(self, mock_get_word_clusters):
    # Given
    def mocked_get_word_clusters(language):
        if language == LANGUAGE_EN:
            return {"mocked_cluster": {"word1": "00", "word2": "11"}}
        return dict()

    mock_get_word_clusters.side_effect = mocked_get_word_clusters

    config = {
        "factory_name": "word_cluster",
        "args": {
            "cluster_name": "mocked_cluster",
            "use_stemming": False
        },
        "offsets": [0]
    }
    tokens = tokenize("hello word1 word2", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    mocked_dataset = {"language": "en"}
    factory.fit(mocked_dataset, None)
    features = factory.build_features()

    # When
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)

    # Then
    self.assertIsInstance(factory, WordClusterFactory)
    self.assertEqual(features[0].base_name, "word_cluster_mocked_cluster")
    self.assertEqual(res0, None)
    self.assertEqual(res1, "00")
    self.assertEqual(res2, "11")

def test_shape_ngram_factory(self):
    # Given
    config = {
        "factory_name": "shape_ngram",
        "args": {
            "n": 3,
        },
        "offsets": [0]
    }
    tokens = tokenize("hello Beautiful foObar world", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = CRFFeatureFactory.from_config(config)
    mocked_dataset = {"language": "en"}
    factory.fit(mocked_dataset, None)
    features = factory.build_features()

    # When
    res = features[0].compute(1, cache)

    # Then
    self.assertIsInstance(factory, ShapeNgramFactory)
    self.assertEqual(features[0].base_name, "shape_ngram_3")
    self.assertEqual(res, "Xxx xX xxx")

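# The shape encoding assumed by the expected value above: "xxx" for
# lowercase tokens, "XXX" for all-uppercase, "Xxx" for title case and "xX"
# for anything else, so the trigram starting at "Beautiful" is "Xxx xX xxx".
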
def sort_key_fn(slot):
    # Sort slots from longest to shortest, scoring each by its token count
    # plus the character length of its value
    tokens = tokenize(slot[RES_VALUE], language)
    return -(len(tokens) + len(slot[RES_VALUE]))

def test_augment_slots(self):
    # Given
    language = LANGUAGE_EN
    text = "Find me a flight before 10pm and after 8pm"
    tokens = tokenize(text, language)
    missing_slots = {"start_date", "end_date"}
    tags = ['O' for _ in tokens]

    def mocked_sequence_probability(_, tags_):
        tags_1 = ['O', 'O', 'O', 'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX,
                  'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX]

        tags_2 = ['O', 'O', 'O', 'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX,
                  'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX]

        tags_3 = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

        tags_4 = ['O', 'O', 'O', 'O', 'O', 'O', 'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX]

        tags_5 = ['O', 'O', 'O', 'O', 'O', 'O', 'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX]

        tags_6 = ['O', 'O', 'O', 'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX,
                  'O', 'O', 'O']

        tags_7 = ['O', 'O', 'O', 'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX,
                  'O', 'O', 'O']

        tags_8 = ['O', 'O', 'O', 'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX,
                  'O',
                  '%sstart_date' % BEGINNING_PREFIX,
                  '%sstart_date' % INSIDE_PREFIX]

        tags_9 = ['O', 'O', 'O', 'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX,
                  'O',
                  '%send_date' % BEGINNING_PREFIX,
                  '%send_date' % INSIDE_PREFIX]

        if tags_ == tags_1:
            return 0.6
        elif tags_ == tags_2:
            return 0.8
        elif tags_ == tags_3:
            return 0.2
        elif tags_ == tags_4:
            return 0.2
        elif tags_ == tags_5:
            return 0.99
        elif tags_ == tags_6:
            return 0.0
        elif tags_ == tags_7:
            return 0.0
        elif tags_ == tags_8:
            return 0.5
        elif tags_ == tags_9:
            return 0.5
        else:
            raise ValueError("Unexpected tag sequence: %s" % tags_)

    slot_filler_config = CRFSlotFillerConfig(random_seed=42)
    slot_filler = CRFSlotFiller(config=slot_filler_config)
    slot_filler.language = LANGUAGE_EN
    slot_filler.intent = "intent1"
    slot_filler.slot_name_mapping = {
        "start_date": "snips/datetime",
        "end_date": "snips/datetime",
    }

    # pylint:disable=protected-access
    slot_filler._get_sequence_probability = MagicMock(
        side_effect=mocked_sequence_probability)
    # pylint:enable=protected-access

    slot_filler.compute_features = MagicMock(return_value=None)

    # When
    # pylint: disable=protected-access
    augmented_slots = slot_filler._augment_slots(text, tokens, tags,
                                                 missing_slots)
    # pylint: enable=protected-access

    # Then
    expected_slots = [
        unresolved_slot(value='after 8pm',
                        match_range={START: 33, END: 42},
                        entity='snips/datetime',
                        slot_name='end_date')
    ]
    self.assertListEqual(augmented_slots, expected_slots)

def test_bilou_tags_to_slots(self):
    # Given
    language = LANGUAGE_EN
    slot_name = "animal"
    intent_slots_mapping = {"animal": "animal"}
    tags = [
        {
            "text": "",
            "tags": [],
            "expected_slots": []
        },
        {
            "text": "nothing here",
            "tags": [OUTSIDE, OUTSIDE],
            "expected_slots": []
        },
        {
            "text": "i am a blue bird",
            "tags": [OUTSIDE, OUTSIDE, OUTSIDE,
                     BEGINNING_PREFIX + slot_name,
                     LAST_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(7, 16),
                                value="blue bird",
                                entity=slot_name,
                                slot_name=slot_name)
            ]
        },
        {
            "text": "i am a bird",
            "tags": [OUTSIDE, OUTSIDE, OUTSIDE, UNIT_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(7, 11),
                                value="bird",
                                entity=slot_name,
                                slot_name=slot_name)
            ]
        },
        {
            "text": "bird",
            "tags": [UNIT_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(0, 4),
                                value="bird",
                                entity=slot_name,
                                slot_name=slot_name)
            ]
        },
        {
            "text": "blue bird",
            "tags": [BEGINNING_PREFIX + slot_name,
                     LAST_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(0, 9),
                                value="blue bird",
                                entity=slot_name,
                                slot_name=slot_name)
            ]
        },
        {
            "text": "light blue bird blue bird",
            "tags": [BEGINNING_PREFIX + slot_name,
                     INSIDE_PREFIX + slot_name,
                     LAST_PREFIX + slot_name,
                     BEGINNING_PREFIX + slot_name,
                     LAST_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(0, 15),
                                value="light blue bird",
                                entity=slot_name,
                                slot_name=slot_name),
                unresolved_slot(match_range=(16, 25),
                                value="blue bird",
                                entity=slot_name,
                                slot_name=slot_name)
            ]
        },
        {
            "text": "bird birdy",
            "tags": [UNIT_PREFIX + slot_name, UNIT_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(0, 4),
                                value="bird",
                                entity=slot_name,
                                slot_name=slot_name),
                unresolved_slot(match_range=(5, 10),
                                value="birdy",
                                entity=slot_name,
                                slot_name=slot_name)
            ]
        },
        {
            "text": "light bird bird blue bird",
            "tags": [BEGINNING_PREFIX + slot_name,
                     INSIDE_PREFIX + slot_name,
                     UNIT_PREFIX + slot_name,
                     BEGINNING_PREFIX + slot_name,
                     INSIDE_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(0, 10),
                                value="light bird",
                                entity=slot_name,
                                slot_name=slot_name),
                unresolved_slot(match_range=(11, 15),
                                value="bird",
                                entity=slot_name,
                                slot_name=slot_name),
                unresolved_slot(match_range=(16, 25),
                                value="blue bird",
                                entity=slot_name,
                                slot_name=slot_name)
            ]
        },
        {
            "text": "bird bird bird",
            "tags": [LAST_PREFIX + slot_name,
                     BEGINNING_PREFIX + slot_name,
                     UNIT_PREFIX + slot_name],
            "expected_slots": [
                unresolved_slot(match_range=(0, 4),
                                value="bird",
                                entity=slot_name,
                                slot_name=slot_name),
                unresolved_slot(match_range=(5, 9),
                                value="bird",
                                entity=slot_name,
                                slot_name=slot_name),
                unresolved_slot(match_range=(10, 14),
                                value="bird",
                                entity=slot_name,
                                slot_name=slot_name)
            ]
        },
    ]

    for data in tags:
        # When
        slots = tags_to_slots(data["text"],
                              tokenize(data["text"], language),
                              data["tags"], TaggingScheme.BILOU,
                              intent_slots_mapping)

        # Then
        self.assertEqual(slots, data["expected_slots"])

def test_entity_match_factory(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)
- this is [entity2](second_entity)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    config = {
        "factory_name": "entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
            "use_stemming": True
        },
        "offsets": [0]
    }
    tokens = tokenize("my first entity and second_entity", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    resources = {STEMS: dict()}
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    factory = CRFFeatureFactory.from_config(
        config, custom_entity_parser=custom_entity_parser,
        resources=resources)
    factory.fit(dataset, "my_intent")

    # When
    features = factory.build_features()
    features = sorted(features, key=lambda f: f.base_name)
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)
    res3 = features[0].compute(3, cache)
    res4 = features[0].compute(4, cache)
    res5 = features[1].compute(0, cache)
    res6 = features[1].compute(1, cache)
    res7 = features[1].compute(2, cache)
    res8 = features[1].compute(3, cache)
    res9 = features[1].compute(4, cache)

    # Then
    self.assertIsInstance(factory, CustomEntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name, "entity_match_entity1")
    self.assertEqual(features[1].base_name, "entity_match_entity2")
    self.assertEqual(res0, BEGINNING_PREFIX)
    self.assertEqual(res1, INSIDE_PREFIX)
    self.assertEqual(res2, LAST_PREFIX)
    self.assertEqual(res3, None)
    self.assertEqual(res4, None)
    self.assertEqual(res5, None)
    self.assertEqual(res6, None)
    self.assertEqual(res7, None)
    self.assertEqual(res8, None)
    self.assertEqual(res9, UNIT_PREFIX)