Example #1
0
    def test_filter_overlapping_builtins(self):
        # Two datetime builtin entities are provided; only the second one is
        # expected to survive the filtering (presumably because the
        # 'B-flight' tag lands inside the range of the first one).

        def datetime_entity(start, end, value):
            # A fresh dict is built on every call so that the input entities
            # and the expected entities never share objects.
            return {
                RES_MATCH_RANGE: {START: start, END: end},
                VALUE: value,
                ENTITY_KIND: SNIPS_DATETIME
            }

        # Given
        text = "Find me a flight before 10pm and after 8pm"
        tokens = tokenize(text, LANGUAGE_EN)
        # Only the tag at index 5 is a slot tag, all the others are outside
        tags = ['O'] * 5 + ['B-flight'] + ['O'] * 3
        scheme = TaggingScheme.BIO
        builtin_entities = [
            datetime_entity(17, 28, "before 10pm"),
            datetime_entity(33, 42, "after 8pm"),
        ]

        # When
        filtered_entities = _filter_overlapping_builtins(
            builtin_entities, tokens, tags, scheme)

        # Then
        expected_entities = [datetime_entity(33, 42, "after 8pm")]
        self.assertEqual(filtered_entities, expected_entities)
Example #2
0
    def test_feature_should_work_with_cache(self):
        # Three features with different offsets share one mocked function and
        # one cache; the mock is expected to be called only twice overall.

        # Given
        def value_and_length(tokens, token_index):
            token_value = tokens[token_index].value
            return "%s_%s" % (token_value, len(token_value))

        mocked_fn = MagicMock(side_effect=value_and_length)

        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        # First computation, on the last token, before the shifted features
        # below are even created
        warmup_feature = Feature("test_feature", mocked_fn, offset=0)
        warmup_feature.compute(2, cache)
        shifted_by_one = Feature("test_feature", mocked_fn, offset=1)
        shifted_by_two = Feature("test_feature", mocked_fn, offset=2)

        # When
        from_index_1 = shifted_by_one.compute(1, cache)
        from_index_0 = shifted_by_one.compute(0, cache)
        from_index_0_offset_2 = shifted_by_two.compute(0, cache)

        # Then
        self.assertEqual(from_index_1, "world_5")
        self.assertEqual(from_index_0, "beautiful_9")
        self.assertEqual(from_index_0_offset_2, "world_5")
        self.assertEqual(mocked_fn.call_count, 2)
Example #3
0
    def test_should_compute_features(self):
        # Given: a slot filler with a single unigram factory using drop out
        # and a fixed random seed
        ngram_factory_config = {
            "factory_name": NgramFactory.name,
            "args": {
                "n": 1,
                "use_stemming": False,
                "common_words_gazetteer_name": None
            },
            "offsets": [0],
            "drop_out": 0.3
        }
        config = CRFSlotFillerConfig(
            feature_factory_configs=[ngram_factory_config], random_seed=40)
        slot_filler = CRFSlotFiller(config)

        tokens = tokenize("foo hello world bar", LANGUAGE_EN)
        formatted_dataset = validate_and_format_dataset(SAMPLE_DATASET)
        slot_filler.fit(formatted_dataset, intent="dummy_intent_1")

        # When
        # NOTE(review): the second positional argument is presumably the
        # drop out flag -- confirm against CRFSlotFiller.compute_features
        computed_features = slot_filler.compute_features(tokens, True)

        # Then: with this seed, the features of "hello" and "bar" are empty
        expected_features = [
            {"ngram_1": "foo"},
            {},
            {"ngram_1": "world"},
            {},
        ]
        self.assertListEqual(expected_features, computed_features)
    def test_ngram_factory_with_gazetteer(self, mock_get_gazetteer):
        # Given: a bigram factory backed by a mocked gazetteer which does
        # not contain "foobar"
        mock_get_gazetteer.return_value = {"hello", "beautiful", "world"}
        factory_config = {
            "factory_name": "ngram",
            "args": {
                "n": 2,
                "use_stemming": False,
                "common_words_gazetteer_name": "mocked_gazetteer"
            },
            "offsets": [0]
        }

        sentence_tokens = tokenize("hello beautiful foobar world",
                                   LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        factory.fit({"language": "en"}, None)
        features = factory.build_features()

        # When: the bigram starting at "beautiful" is computed
        bigram = features[0].compute(1, cache)

        # Then
        self.assertIsInstance(factory, NgramFactory)
        self.assertEqual(features[0].base_name, "ngram_2")
        self.assertEqual(bigram, "beautiful rare_word")
Example #5
0
    def test_ngram_factory_with_gazetteer(self, mock_get_gazetteer):
        # Given: a bigram factory backed by a mocked gazetteer which does
        # not contain "foobar"
        mock_get_gazetteer.return_value = {"hello", "beautiful", "world"}
        factory_config = {
            "factory_name": "ngram",
            "args": {
                "n": 2,
                "use_stemming": False,
                "common_words_gazetteer_name": "mocked_gazetteer"
            },
            "offsets": [0]
        }

        sentence_tokens = tokenize("hello beautiful foobar world",
                                   LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        factory.fit({"language": "en"}, None)
        features = factory.build_features()

        # When: the bigram starting at "beautiful" is computed
        bigram = features[0].compute(1, cache)

        # Then
        self.assertIsInstance(factory, NgramFactory)
        self.assertEqual(features[0].base_name, "ngram_2")
        self.assertEqual(bigram, "beautiful rare_word")
    def test_feature_should_work_with_cache(self):
        # Three features with different offsets share one mocked function and
        # one cache; the mock is expected to be called only twice overall.

        # Given
        def value_and_length(tokens, token_index):
            token_value = tokens[token_index].value
            return "%s_%s" % (token_value, len(token_value))

        mocked_fn = MagicMock(side_effect=value_and_length)

        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        # First computation, on the last token, before the shifted features
        # below are even created
        warmup_feature = Feature("test_feature", mocked_fn, offset=0)
        warmup_feature.compute(2, cache)
        shifted_by_one = Feature("test_feature", mocked_fn, offset=1)
        shifted_by_two = Feature("test_feature", mocked_fn, offset=2)

        # When
        from_index_1 = shifted_by_one.compute(1, cache)
        from_index_0 = shifted_by_one.compute(0, cache)
        from_index_0_offset_2 = shifted_by_two.compute(0, cache)

        # Then
        self.assertEqual(from_index_1, "world_5")
        self.assertEqual(from_index_0, "beautiful_9")
        self.assertEqual(from_index_0_offset_2, "world_5")
        self.assertEqual(mocked_fn.call_count, 2)
    def test_single_feature_factory(self):
        # Given
        class TestSingleFeatureFactory(SingleFeatureFactory):
            # Minimal concrete factory: the feature is "<value>_<length>"
            def compute_feature(self, tokens, token_index):
                token_value = tokens[token_index].value
                return "%s_%s" % (token_value, len(token_value))

        factory = TestSingleFeatureFactory({
            "factory_name": "test_factory",
            "args": {},
            "offsets": [0, 1]
        })
        factory.fit(None, None)
        features = factory.build_features()
        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]

        # When: both features are evaluated on the first token
        offset_0_result = features[0].compute(0, cache)
        offset_1_result = features[1].compute(0, cache)

        # Then: one feature per configured offset
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].name, "test_factory")
        self.assertEqual(features[1].name, "test_factory[+1]")
        self.assertEqual(offset_0_result, "hello_5")
        self.assertEqual(offset_1_result, "beautiful_9")
    def get_slots(self, text):
        """Tags the tokens of the provided text and returns the slots

        Args:
            text: The text to extract slots from

        Returns:
            list of dict: The list of extracted slots, which is empty when
            the text yields no token

        Raises:
            NotTrained: When the slot filler is not fitted
        """
        if not self.fitted:
            raise NotTrained("CRFSlotFiller must be fitted")

        tokens = tokenize(text, self.language)
        if not tokens:
            return []

        features = self.compute_features(tokens)
        predicted_tags = self.crf_model.predict_single(features)
        tags = [_decode_tag(raw_tag) for raw_tag in predicted_tags]
        slots = tags_to_slots(
            text, tokens, tags, self.config.tagging_scheme,
            self.slot_name_mapping)

        builtin_slots_names = {
            slot_name
            for slot_name, entity in iteritems(self.slot_name_mapping)
            if is_builtin_entity(entity)
        }
        if builtin_slots_names:
            # Replace tags corresponding to builtin entities by outside tags
            # and let the augmentation step produce the final slots
            tags = _replace_builtin_tags(tags, builtin_slots_names)
            return self._augment_slots(
                text, tokens, tags, builtin_slots_names)

        # No builtin slot is involved: the CRF slots are returned untouched
        return slots
Example #9
0
    def test_single_feature_factory(self):
        # Given
        class TestSingleFeatureFactory(SingleFeatureFactory):
            # Minimal concrete factory: the feature is "<value>_<length>"
            def compute_feature(self, tokens, token_index):
                token_value = tokens[token_index].value
                return "%s_%s" % (token_value, len(token_value))

        factory = TestSingleFeatureFactory({
            "factory_name": "test_factory",
            "args": {},
            "offsets": [0, 1]
        })
        factory.fit(None, None)
        features = factory.build_features()
        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]

        # When: both features are evaluated on the first token
        offset_0_result = features[0].compute(0, cache)
        offset_1_result = features[1].compute(0, cache)

        # Then: one feature per configured offset
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].name, "test_factory")
        self.assertEqual(features[1].name, "test_factory[+1]")
        self.assertEqual(offset_0_result, "hello_5")
        self.assertEqual(offset_1_result, "beautiful_9")
Example #10
0
    def get_slots(self, text):
        """Tags the tokens of the provided text and returns the slots

        Args:
            text: The text to extract slots from

        Returns:
            list of dict: The list of extracted slots, which is empty when
            the text yields no token

        Raises:
            NotTrained: When the slot filler is not fitted
        """
        if not self.fitted:
            raise NotTrained("CRFSlotFiller must be fitted")

        tokens = tokenize(text, self.language)
        if not tokens:
            return []

        features = self.compute_features(tokens)
        predicted_tags = self.crf_model.predict_single(features)
        tags = [_decode_tag(raw_tag) for raw_tag in predicted_tags]
        slots = tags_to_slots(
            text, tokens, tags, self.config.tagging_scheme,
            self.slot_name_mapping)

        builtin_slots_names = {
            slot_name
            for slot_name, entity in iteritems(self.slot_name_mapping)
            if is_builtin_entity(entity)
        }
        if builtin_slots_names:
            # Replace tags corresponding to builtin entities by outside tags
            # and let the augmentation step produce the final slots
            tags = _replace_builtin_tags(tags, builtin_slots_names)
            return self._augment_slots(
                text, tokens, tags, builtin_slots_names)

        # No builtin slot is involved: the CRF slots are returned untouched
        return slots
Example #11
0
 def test_space_should_by_ignored(self):
     # Given: a text made of a single white space
     blank_text = " "
     for language in get_all_languages():
         # When
         token_count = len(tokenize(blank_text, language))
         # Then: no token is produced, whatever the language
         self.assertEqual(token_count, 0)
def _deduplicate_overlapping_slots(slots, language):
    """Resolves overlaps between slots, favoring the longest ones

    Slots are processed in order. A slot which overlaps no previously kept
    slot is appended to the result. Otherwise it replaces every kept slot it
    overlaps and outranks: a slot outranks another one when its value has
    more tokens or, with the same number of tokens, more characters.

    Returns:
        list: The slots which were kept
    """
    # NOTE(review): a slot overlapping several kept slots can replace each of
    # them and hence appear more than once in the result -- confirm whether
    # this is intended.
    kept_slots = []
    for candidate in slots:
        overlaps_kept_slot = False
        for kept_index, kept in enumerate(kept_slots):
            if not ranges_overlap(candidate[RES_MATCH_RANGE],
                                  kept[RES_MATCH_RANGE]):
                continue
            overlaps_kept_slot = True
            candidate_size = len(tokenize(candidate[RES_VALUE], language))
            kept_size = len(tokenize(kept[RES_VALUE], language))
            has_more_tokens = candidate_size > kept_size
            wins_tie_on_length = (
                candidate_size == kept_size
                and len(candidate[RES_VALUE]) > len(kept[RES_VALUE]))
            if has_more_tokens or wins_tie_on_length:
                kept_slots[kept_index] = candidate
        if not overlaps_kept_slot:
            kept_slots.append(candidate)
    return kept_slots
def _deduplicate_overlapping_slots(slots, language):
    """Resolves overlaps between slots, favoring the longest ones

    Slots are processed in order. A slot which overlaps no previously kept
    slot is appended to the result. Otherwise it replaces every kept slot it
    overlaps and outranks: a slot outranks another one when its value has
    more tokens or, with the same number of tokens, more characters.

    Returns:
        list: The slots which were kept
    """
    # NOTE(review): a slot overlapping several kept slots can replace each of
    # them and hence appear more than once in the result -- confirm whether
    # this is intended.
    kept_slots = []
    for candidate in slots:
        overlaps_kept_slot = False
        for kept_index, kept in enumerate(kept_slots):
            if not ranges_overlap(candidate[RES_MATCH_RANGE],
                                  kept[RES_MATCH_RANGE]):
                continue
            overlaps_kept_slot = True
            candidate_size = len(tokenize(candidate[RES_VALUE], language))
            kept_size = len(tokenize(kept[RES_VALUE], language))
            has_more_tokens = candidate_size > kept_size
            wins_tie_on_length = (
                candidate_size == kept_size
                and len(candidate[RES_VALUE]) > len(kept[RES_VALUE]))
            if has_more_tokens or wins_tie_on_length:
                kept_slots[kept_index] = candidate
        if not overlaps_kept_slot:
            kept_slots.append(candidate)
    return kept_slots
Example #14
0
    def test_builtin_entity_match_factory(self, mock_supported_entities):
        # Given: only english supports builtin entities, namely number and
        # datetime
        def mocked_supported_entities(language):
            if language != LANGUAGE_EN:
                return set()
            return {SNIPS_NUMBER, SNIPS_DATETIME}

        mock_supported_entities.side_effect = mocked_supported_entities

        factory_config = {
            "factory_name": "builtin_entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
            },
            "offsets": [0]
        }

        sentence_tokens = tokenize("one tea tomorrow at 2pm", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        factory.fit({"language": "en"}, None)

        # When: each feature is computed on the five tokens, the first
        # feature (in base name order) being fully processed before the
        # second one
        features = sorted(factory.build_features(),
                          key=lambda f: f.base_name)
        datetime_results = [features[0].compute(token_index, cache)
                            for token_index in range(5)]
        number_results = [features[1].compute(token_index, cache)
                          for token_index in range(5)]

        # Then
        self.assertIsInstance(factory, BuiltinEntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name,
                         "builtin_entity_match_snips/datetime")
        self.assertEqual(features[1].base_name,
                         "builtin_entity_match_snips/number")
        expected_datetime_results = [
            UNIT_PREFIX, None, BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX
        ]
        self.assertEqual(datetime_results, expected_datetime_results)

        expected_number_results = [UNIT_PREFIX, None, None, None, None]
        self.assertEqual(number_results, expected_number_results)
Example #15
0
    def test_builtin_entity_match_factory(self, mock_supported_entities):
        # Given: only english supports builtin entities, namely number and
        # datetime
        def mocked_supported_entities(language):
            if language != LANGUAGE_EN:
                return set()
            return {SNIPS_NUMBER, SNIPS_DATETIME}

        mock_supported_entities.side_effect = mocked_supported_entities

        factory_config = {
            "factory_name": "builtin_entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
            },
            "offsets": [0]
        }

        sentence_tokens = tokenize("one tea tomorrow at 2pm", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        factory.fit({"language": "en"}, None)

        # When: each feature is computed on the five tokens, the first
        # feature (in base name order) being fully processed before the
        # second one
        features = sorted(factory.build_features(),
                          key=lambda f: f.base_name)
        datetime_results = [features[0].compute(token_index, cache)
                            for token_index in range(5)]
        number_results = [features[1].compute(token_index, cache)
                          for token_index in range(5)]

        # Then
        self.assertIsInstance(factory, BuiltinEntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name,
                         "builtin_entity_match_snips/datetime")
        self.assertEqual(features[1].base_name,
                         "builtin_entity_match_snips/number")
        expected_datetime_results = [
            UNIT_PREFIX, None, BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX
        ]
        self.assertEqual(datetime_results, expected_datetime_results)

        expected_number_results = [UNIT_PREFIX, None, None, None, None]
        self.assertEqual(number_results, expected_number_results)
Example #16
0
    def test_should_tokenize_empty_string(self):
        # Given
        empty_text = ""

        # When
        tokens = tokenize(empty_text, LANGUAGE_EN)

        # Then: an empty text yields an empty list of tokens
        no_token = []
        self.assertListEqual(tokens, no_token)
Example #17
0
    def test_should_tokenize_only_white_spaces(self):
        # Given: a text made of four white spaces
        blank_text = "    "

        # When
        tokens = tokenize(blank_text, LANGUAGE_EN)

        # Then: white spaces alone yield an empty list of tokens
        no_token = []
        self.assertListEqual(tokens, no_token)
Example #18
0
def utterance_to_sample(query_data, tagging_scheme, language):
    """Converts the chunks of an utterance into aligned tokens and tags

    Each chunk is tokenized on its own, and the positions of its tokens are
    shifted by the cumulated length of the texts of the previous chunks.
    Chunks holding a SLOT_NAME key are tagged with `positive_tagging`, the
    other ones with `negative_tagging`.

    Returns:
        dict: The tokens under the TOKENS key and the tags under the TAGS key
    """
    sample_tokens = []
    sample_tags = []
    offset = 0
    for chunk in query_data:
        chunk_text = chunk[TEXT]
        chunk_tokens = tokenize(chunk_text, language)
        sample_tokens.extend(
            Token(token.value, offset + token.start, offset + token.end)
            for token in chunk_tokens)
        offset += len(chunk_text)

        nb_tokens = len(chunk_tokens)
        if SLOT_NAME in chunk:
            chunk_tags = positive_tagging(tagging_scheme, chunk[SLOT_NAME],
                                          nb_tokens)
        else:
            chunk_tags = negative_tagging(nb_tokens)
        sample_tags += chunk_tags
    return {TOKENS: sample_tokens, TAGS: sample_tags}
Example #19
0
    def test_entity_match_factory(self):
        # Given
        factory_config = {
            "factory_name": "entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
                "use_stemming": False
            },
            "offsets": [0]
        }

        sentence_tokens = tokenize("2 dummy a and dummy_c", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        # The sample dataset is copied before being validated and formatted
        dataset = validate_and_format_dataset(deepcopy(SAMPLE_DATASET))
        factory.fit(dataset, "dummy_intent_1")

        # When: each feature is computed on the five tokens, the first
        # feature (in base name order) being fully processed before the
        # second one
        features = sorted(factory.build_features(),
                          key=lambda f: f.base_name)
        entity_1_results = [features[0].compute(token_index, cache)
                            for token_index in range(5)]
        entity_2_results = [features[1].compute(token_index, cache)
                            for token_index in range(5)]

        # Then
        self.assertIsInstance(factory, EntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
        self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
        expected_entity_1_results = [
            BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX, None, None
        ]
        self.assertEqual(entity_1_results, expected_entity_1_results)

        expected_entity_2_results = [None, None, None, None, UNIT_PREFIX]
        self.assertEqual(entity_2_results, expected_entity_2_results)
Example #20
0
    def test_entity_match_factory(self):
        # Given
        factory_config = {
            "factory_name": "entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
                "use_stemming": False
            },
            "offsets": [0]
        }

        sentence_tokens = tokenize("2 dummy a and dummy_c", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        # The sample dataset is copied before being validated and formatted
        dataset = validate_and_format_dataset(deepcopy(SAMPLE_DATASET))
        factory.fit(dataset, "dummy_intent_1")

        # When: each feature is computed on the five tokens, the first
        # feature (in base name order) being fully processed before the
        # second one
        features = sorted(factory.build_features(),
                          key=lambda f: f.base_name)
        entity_1_results = [features[0].compute(token_index, cache)
                            for token_index in range(5)]
        entity_2_results = [features[1].compute(token_index, cache)
                            for token_index in range(5)]

        # Then
        self.assertIsInstance(factory, EntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
        self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
        expected_entity_1_results = [
            BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX, None, None
        ]
        self.assertEqual(entity_1_results, expected_entity_1_results)

        expected_entity_2_results = [None, None, None, None, UNIT_PREFIX]
        self.assertEqual(entity_2_results, expected_entity_2_results)
Example #21
0
    def test_should_tokenize_symbols(self):
        # Given
        symbols_text = "$$ % !!"

        # When
        tokens = tokenize(symbols_text, LANGUAGE_EN)

        # Then: one token per run of symbols, as (value, start, end)
        expected_spans = [("$$", 0, 2), ("%", 3, 4), ("!!", 5, 7)]
        expected_tokens = [
            Token(value=value, start=start, end=end, stem=None)
            for value, start, end in expected_spans
        ]
        self.assertListEqual(tokens, expected_tokens)
Example #22
0
    def test_should_tokenize_literals(self):
        # Given
        literals_text = "Hello Beautiful World"

        # When
        tokens = tokenize(literals_text, LANGUAGE_EN)

        # Then: one token per word, as (value, start, end), case preserved
        expected_spans = [
            ("Hello", 0, 5), ("Beautiful", 6, 15), ("World", 16, 21)
        ]
        expected_tokens = [
            Token(value=value, start=start, end=end, stem=None)
            for value, start, end in expected_spans
        ]
        self.assertListEqual(tokens, expected_tokens)
Example #23
0
    def test_feature_should_work_with_offset(self):
        # Given
        def value_and_length(tokens, token_index):
            token_value = tokens[token_index].value
            return "%s_%s" % (token_value, len(token_value))

        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        shifted_feature = Feature("test_feature", value_and_length, offset=1)

        # When: the feature is computed from the middle token
        result = shifted_feature.compute(1, cache)

        # Then: the value comes from the token located one position further
        self.assertEqual(result, "world_5")
Example #24
0
    def test_length_factory(self):
        # Given
        factory_config = {
            "factory_name": "length",
            "args": {},
            "offsets": [0]
        }
        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        factory.fit(None, None)
        features = factory.build_features()

        # When: the feature is computed on the last token
        last_token_length = features[0].compute(2, cache)

        # Then: the length is expected as a string
        self.assertIsInstance(factory, LengthFactory)
        self.assertEqual(features[0].base_name, "length")
        self.assertEqual(last_token_length, "5")
Example #25
0
    def test_feature_should_work_with_offset(self):
        # Given
        def value_and_length(tokens, token_index):
            token_value = tokens[token_index].value
            return "%s_%s" % (token_value, len(token_value))

        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        shifted_feature = Feature("test_feature", value_and_length, offset=1)

        # When: the feature is computed from the middle token
        result = shifted_feature.compute(1, cache)

        # Then: the value comes from the token located one position further
        self.assertEqual(result, "world_5")
Example #26
0
def _load_gazetteers(gazetteers_path, language):
    """Loads the gazetteer files found directly under `gazetteers_path`

    Every line is stripped and normalized; lines which end up empty are
    skipped. The remaining ones are tokenized and their token values are
    joined with the default separator of `language`.

    Returns:
        dict: Maps each file stem to the set of its processed entries. The
        dict is empty when `gazetteers_path` is not a directory.
    """
    gazetteers = dict()
    if not gazetteers_path.is_dir():
        return gazetteers

    for gazetteer_file in gazetteers_path.iterdir():
        gazetteer_name = gazetteer_file.stem
        with gazetteer_file.open(encoding="utf8") as f:
            entries = gazetteers[gazetteer_name] = set()
            for raw_line in f:
                entry = normalize(raw_line.strip())
                if not entry:
                    continue
                token_values = (t.value for t in tokenize(entry, language))
                separator = get_default_sep(language)
                entries.add(separator.join(token_values))
    return gazetteers
Example #27
0
def _load_gazetteers(language):
    """Loads the gazetteers of `language` into the `_RESOURCES` registry

    The gazetteer file names are read from `RESOURCE_INDEX` and resolved
    against the resources path of the language. Every line is stripped and
    normalized; lines which end up empty are skipped. The remaining ones are
    tokenized and their token values are joined with the value returned by
    `get_ignored_characters_pattern`.

    The result, a dict mapping each gazetteer name (file name without its
    extension) to a set of entries, is stored under
    `_RESOURCES[language][GAZETTEERS]`.
    """
    # All the paths are resolved first, before any file gets opened
    gazetteers_paths = dict()
    for filename in RESOURCE_INDEX[language].get(GAZETTEERS, []):
        gazetteer_name = os.path.splitext(filename)[0]
        gazetteers_paths[gazetteer_name] = os.path.join(
            get_resources_path(language), filename)

    gazetteers = dict()
    for gazetteer_name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            entries = gazetteers[gazetteer_name] = set()
            for raw_line in f:
                entry = normalize(raw_line.strip())
                if not entry:
                    continue
                joiner = get_ignored_characters_pattern(language)
                token_values = [t.value for t in tokenize(entry, language)]
                entries.add(joiner.join(token_values))
    _RESOURCES[language][GAZETTEERS] = gazetteers
Example #28
0
def _load_gazetteers(language):
    """Loads and returns the gazetteers of `language`

    The gazetteer file names are read from `RESOURCE_INDEX` and resolved
    against the resources path of the language. Every line is stripped and
    normalized; lines which end up empty are skipped. The remaining ones are
    tokenized and their token values are joined with the value returned by
    `get_ignored_characters_pattern`.

    Returns:
        dict: Maps each gazetteer name (file name without its extension) to
        the set of its processed entries
    """
    # All the paths are resolved first, before any file gets opened
    gazetteers_paths = dict()
    for filename in RESOURCE_INDEX[language].get(GAZETTEERS, []):
        gazetteer_name = os.path.splitext(filename)[0]
        gazetteers_paths[gazetteer_name] = os.path.join(
            get_resources_path(language), filename)

    gazetteers = dict()
    for gazetteer_name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            entries = gazetteers[gazetteer_name] = set()
            for raw_line in f:
                entry = normalize(raw_line.strip())
                if not entry:
                    continue
                joiner = get_ignored_characters_pattern(language)
                token_values = [t.value for t in tokenize(entry, language)]
                entries.add(joiner.join(token_values))
    return gazetteers
Example #29
0
    def test_is_first_factory(self):
        # Given
        factory_config = {
            "factory_name": "is_first",
            "args": {},
            "offsets": [0]
        }
        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        factory.fit(None, None)
        features = factory.build_features()

        # When: the feature is computed on the first two tokens
        first_token_result = features[0].compute(0, cache)
        second_token_result = features[0].compute(1, cache)

        # Then: only the first token gets a value
        self.assertIsInstance(factory, IsFirstFactory)
        self.assertEqual(features[0].base_name, "is_first")
        self.assertEqual(first_token_result, "1")
        self.assertEqual(second_token_result, None)
Example #30
0
    def test_length_factory(self):
        # Given
        factory_config = {"factory_name": "length", "args": {}, "offsets": [0]}
        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        factory.fit(None, None)
        features = factory.build_features()

        # When: the feature is computed on the last token
        last_token_length = features[0].compute(2, cache)

        # Then: the length is expected as an integer
        self.assertIsInstance(factory, LengthFactory)
        self.assertEqual(features[0].base_name, "length")
        self.assertEqual(last_token_length, 5)
Example #31
0
    def test_word_cluster_factory(self, mock_get_word_clusters):
        # Given: word clusters are only available in english
        def mocked_get_word_clusters(language):
            if language != LANGUAGE_EN:
                return dict()
            return {"mocked_cluster": {"word1": "00", "word2": "11"}}

        mock_get_word_clusters.side_effect = mocked_get_word_clusters

        factory_config = {
            "factory_name": "word_cluster",
            "args": {
                "cluster_name": "mocked_cluster",
                "use_stemming": False
            },
            "offsets": [0]
        }

        sentence_tokens = tokenize("hello word1 word2", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        factory.fit({"language": "en"}, None)
        features = factory.build_features()

        # When: the feature is computed on each of the three tokens
        cluster_results = [features[0].compute(token_index, cache)
                           for token_index in range(3)]

        # Then: "hello" belongs to no cluster
        self.assertIsInstance(factory, WordClusterFactory)
        self.assertEqual(features[0].base_name, "word_cluster_mocked_cluster")
        self.assertEqual(cluster_results, [None, "00", "11"])
Example #32
0
    def test_is_first_factory(self):
        # Given
        factory_config = {"factory_name": "is_first", "args": {},
                          "offsets": [0]}
        sentence_tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in sentence_tokens]
        factory = get_feature_factory(factory_config)
        factory.fit(None, None)
        features = factory.build_features()

        # When: the feature is computed on the first two tokens
        first_token_result = features[0].compute(0, cache)
        second_token_result = features[0].compute(1, cache)

        # Then: only the first token gets a value
        self.assertIsInstance(factory, IsFirstFactory)
        self.assertEqual(features[0].base_name, "is_first")
        self.assertEqual(first_token_result, "1")
        self.assertEqual(second_token_result, None)
def _replace_tokenized_out_characters(string, language, replacement_char=" "):
    """Replace all characters that are tokenized out by `replacement_char`

    The string is tokenized with `tokenize`; every stretch of `string` that
    is not covered by a token (before the first token, between two tokens,
    and after the last token) is replaced by one `replacement_char` per
    skipped position, while each token contributes its own `value`.

    Args:
        string: the text to clean.
        language: language passed through to `tokenize`.
        replacement_char: filler emitted once per skipped position
            (defaults to a single space).

    Returns:
        str: the token values joined by runs of `replacement_char`.

    Examples:
        >>> string = "hello, it's me"
        >>> language = "en"
        >>> tokenize_light(string, language)
        ['hello', 'it', 's', 'me']
        >>> _replace_tokenized_out_characters(string, language, "_")
        'hello__it_s_me'
    """
    # Collect the pieces and join once instead of growing a string with
    # repeated `+=`, which can be quadratic in the length of the input.
    pieces = []
    cursor = 0
    for token in tokenize(string, language):
        # Fill the gap between the previous token end and this token start
        pieces.append(replacement_char * (token.start - cursor))
        pieces.append(token.value)
        cursor = token.end
    # Fill whatever remains after the last token (or the whole string when
    # there is no token at all)
    pieces.append(replacement_char * (len(string) - cursor))
    return "".join(pieces)
Example #34
0
    def test_word_cluster_factory(self, mock_get_word_clusters):
        # The "word_cluster" factory is expected to return the cluster id
        # listed in the mocked cluster table for a token, and None for a
        # token the table does not contain.
        # Given
        def mocked_get_word_clusters(language):
            # Only English has clusters in this mocked resource
            if language != LANGUAGE_EN:
                return dict()
            return {
                "mocked_cluster": {
                    "word1": "00",
                    "word2": "11"
                }
            }

        mock_get_word_clusters.side_effect = mocked_get_word_clusters

        factory_args = {
            "cluster_name": "mocked_cluster",
            "use_stemming": False
        }
        config = {
            "factory_name": "word_cluster",
            "args": factory_args,
            "offsets": [0]
        }

        sentence_tokens = tokenize("hello word1 word2", LANGUAGE_EN)
        cache = [{TOKEN_NAME: tok} for tok in sentence_tokens]
        factory = get_feature_factory(config)
        factory.fit({"language": "en"}, None)
        feature = factory.build_features()[0]

        # When
        results = [feature.compute(token_idx, cache)
                   for token_idx in range(3)]

        # Then
        self.assertIsInstance(factory, WordClusterFactory)
        self.assertEqual(feature.base_name, "word_cluster_mocked_cluster")
        self.assertEqual(results[0], None)
        self.assertEqual(results[1], "00")
        self.assertEqual(results[2], "11")
Example #35
0
    def test_shape_ngram_factory(self):
        # With n=3, the "shape_ngram" feature computed at index 1 of
        # "hello Beautiful foObar world" is expected to be "Xxx xX xxx".
        # Given
        ngram_size = 3
        config = {
            "factory_name": "shape_ngram",
            "args": {"n": ngram_size},
            "offsets": [0],
        }

        sentence_tokens = tokenize("hello Beautiful foObar world",
                                   LANGUAGE_EN)
        cache = [{TOKEN_NAME: tok} for tok in sentence_tokens]
        factory = get_feature_factory(config)
        factory.fit({"language": "en"}, None)
        feature = factory.build_features()[0]

        # When
        shape = feature.compute(1, cache)

        # Then
        self.assertIsInstance(factory, ShapeNgramFactory)
        self.assertEqual(feature.base_name, "shape_ngram_3")
        self.assertEqual(shape, "Xxx xX xxx")
Example #36
0
    def test_shape_ngram_factory(self):
        # Builds a "shape_ngram" factory (n=3, offset 0) and checks the
        # factory type, the feature base name and the value computed for
        # the second token of the sentence.
        # Given
        factory_args = {"n": 3}
        config = dict(factory_name="shape_ngram", args=factory_args,
                      offsets=[0])

        cache = []
        for tok in tokenize("hello Beautiful foObar world", LANGUAGE_EN):
            cache.append({TOKEN_NAME: tok})
        factory = get_feature_factory(config)
        mocked_dataset = dict(language="en")
        factory.fit(mocked_dataset, None)
        built_features = factory.build_features()
        first_feature = built_features[0]

        # When
        computed = first_feature.compute(1, cache)

        # Then
        self.assertIsInstance(factory, ShapeNgramFactory)
        self.assertEqual(first_feature.base_name, "shape_ngram_3")
        self.assertEqual(computed, "Xxx xX xxx")
Example #37
0
    def test_bilou_tags_to_slots(self):
        # Runs tags_to_slots with the BILOU tagging scheme over a table of
        # (text, tags, expected slots) cases and checks each result.
        # Given
        language = LANGUAGE_EN
        slot_name = "animal"
        intent_slots_mapping = {"animal": "animal"}

        begin = BEGINNING_PREFIX + slot_name
        inside = INSIDE_PREFIX + slot_name
        last = LAST_PREFIX + slot_name
        unit = UNIT_PREFIX + slot_name

        def animal_slot(start, end, value):
            # Expected slot using `slot_name` as both entity and slot name
            return unresolved_slot(match_range=(start, end),
                                   value=value,
                                   entity=slot_name,
                                   slot_name=slot_name)

        cases = [
            ("", [], []),
            ("nothing here", [OUTSIDE, OUTSIDE], []),
            ("i am a blue bird",
             [OUTSIDE, OUTSIDE, OUTSIDE, begin, last],
             [animal_slot(7, 16, "blue bird")]),
            ("i am a bird",
             [OUTSIDE, OUTSIDE, OUTSIDE, unit],
             [animal_slot(7, 11, "bird")]),
            ("bird",
             [unit],
             [animal_slot(0, 4, "bird")]),
            ("blue bird",
             [begin, last],
             [animal_slot(0, 9, "blue bird")]),
            ("light blue bird blue bird",
             [begin, inside, last, begin, last],
             [animal_slot(0, 15, "light blue bird"),
              animal_slot(16, 25, "blue bird")]),
            ("bird birdy",
             [unit, unit],
             [animal_slot(0, 4, "bird"),
              animal_slot(5, 10, "birdy")]),
            ("light bird bird blue bird",
             [begin, inside, unit, begin, inside],
             [animal_slot(0, 10, "light bird"),
              animal_slot(11, 15, "bird"),
              animal_slot(16, 25, "blue bird")]),
            ("bird bird bird",
             [last, begin, unit],
             [animal_slot(0, 4, "bird"),
              animal_slot(5, 9, "bird"),
              animal_slot(10, 14, "bird")]),
        ]

        for text, case_tags, expected_slots in cases:
            # When
            slots = tags_to_slots(text, tokenize(text, language), case_tags,
                                  TaggingScheme.BILOU, intent_slots_mapping)
            # Then
            self.assertEqual(slots, expected_slots)
Example #38
0
    def test_bilou_tags_to_slots(self):
        # Checks the slots produced by tags_to_slots under the BILOU scheme
        # for a series of tagged sentences. Each spec below is
        # (text, tags, [(match_range, value), ...]).
        # Given
        language = LANGUAGE_EN
        slot_name = "animal"
        intent_slots_mapping = {"animal": "animal"}

        b_tag, i_tag, l_tag, u_tag = (
            prefix + slot_name
            for prefix in (BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX,
                           UNIT_PREFIX))

        specs = [
            ("", [], []),
            ("nothing here", [OUTSIDE] * 2, []),
            ("i am a blue bird", [OUTSIDE] * 3 + [b_tag, l_tag],
             [((7, 16), "blue bird")]),
            ("i am a bird", [OUTSIDE] * 3 + [u_tag],
             [((7, 11), "bird")]),
            ("bird", [u_tag],
             [((0, 4), "bird")]),
            ("blue bird", [b_tag, l_tag],
             [((0, 9), "blue bird")]),
            ("light blue bird blue bird",
             [b_tag, i_tag, l_tag, b_tag, l_tag],
             [((0, 15), "light blue bird"), ((16, 25), "blue bird")]),
            ("bird birdy", [u_tag, u_tag],
             [((0, 4), "bird"), ((5, 10), "birdy")]),
            ("light bird bird blue bird",
             [b_tag, i_tag, u_tag, b_tag, i_tag],
             [((0, 10), "light bird"), ((11, 15), "bird"),
              ((16, 25), "blue bird")]),
            ("bird bird bird", [l_tag, b_tag, u_tag],
             [((0, 4), "bird"), ((5, 9), "bird"), ((10, 14), "bird")]),
        ]

        # Expand the compact specs into full test cases up front, so every
        # expected slot exists before tags_to_slots is first called.
        test_cases = [
            {
                "text": text,
                "tags": case_tags,
                "expected_slots": [
                    unresolved_slot(
                        match_range=match_range,
                        value=value,
                        entity=slot_name,
                        slot_name=slot_name
                    )
                    for match_range, value in spans
                ]
            }
            for text, case_tags, spans in specs
        ]

        for case in test_cases:
            # When
            case_tokens = tokenize(case["text"], language)
            slots = tags_to_slots(
                case["text"], case_tokens, case["tags"],
                TaggingScheme.BILOU, intent_slots_mapping)
            # Then
            self.assertEqual(slots, case["expected_slots"])
Example #39
0
    def test_augment_slots(self):
        # Checks that _augment_slots, driven by a mocked sequence
        # probability, returns a single "end_date" slot on "after 8pm".
        # Given
        language = LANGUAGE_EN
        text = "Find me a flight before 10pm and after 8pm"
        tokens = tokenize(text, language)
        missing_slots = {"start_date", "end_date"}

        tags = ['O' for _ in tokens]

        # Building blocks of the nine 9-tag sequences known to the mock:
        # four leading 'O', a 2-tag span, one 'O', and a second 2-tag span.
        head = ['O', 'O', 'O', 'O']
        gap = ['O']
        no_span = ['O', 'O']
        start_span = ['%sstart_date' % BEGINNING_PREFIX,
                      '%sstart_date' % INSIDE_PREFIX]
        end_span = ['%send_date' % BEGINNING_PREFIX,
                    '%send_date' % INSIDE_PREFIX]

        known_probabilities = [
            (head + start_span + gap + end_span, 0.6),
            (head + end_span + gap + start_span, 0.8),
            (head + no_span + gap + no_span, 0.2),
            (head + no_span + gap + start_span, 0.2),
            (head + no_span + gap + end_span, 0.99),
            (head + start_span + gap + no_span, 0.0),
            (head + end_span + gap + no_span, 0.0),
            (head + start_span + gap + start_span, 0.5),
            (head + end_span + gap + end_span, 0.5),
        ]

        def mocked_sequence_probability(_, tags_):
            # Return the probability attached to a known tag sequence and
            # fail loudly on any sequence the test did not anticipate.
            for candidate, probability in known_probabilities:
                if tags_ == candidate:
                    return probability
            raise ValueError("Unexpected tag sequence: %s" % tags_)

        slot_filler = CRFSlotFiller(
            config=CRFSlotFillerConfig(random_seed=42))
        slot_filler.language = LANGUAGE_EN
        slot_filler.intent = "intent1"
        slot_filler.slot_name_mapping = {
            "start_date": "snips/datetime",
            "end_date": "snips/datetime",
        }

        # pylint:disable=protected-access
        slot_filler._get_sequence_probability = MagicMock(
            side_effect=mocked_sequence_probability)
        # pylint:enable=protected-access

        slot_filler.compute_features = MagicMock(return_value=None)

        # When
        # pylint: disable=protected-access
        augmented_slots = slot_filler._augment_slots(
            text, tokens, tags, missing_slots)
        # pylint: enable=protected-access

        # Then
        expected_slot = unresolved_slot(
            value='after 8pm',
            match_range={START: 33, END: 42},
            entity='snips/datetime',
            slot_name='end_date')
        self.assertListEqual(augmented_slots, [expected_slot])