def test_should_parse_top_intents(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - meeting [time:snips/datetime](today)

---
type: intent
name: intent2
utterances:
  - meeting tomorrow
  
---
type: intent
name: intent3
utterances:
  - "[event_type](call) [time:snips/datetime](at 9pm)"

---
type: entity
name: event_type
values:
  - meeting
  - feedback session""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        parser = DeterministicIntentParser().fit(dataset)
        text = "meeting tomorrow"

        # When
        results = parser.parse(text, top_n=3)

        # Then
        time_slot = {
            "entity": "snips/datetime",
            "range": {
                "end": 16,
                "start": 8
            },
            "slotName": "time",
            "value": "tomorrow"
        }
        event_slot = {
            "entity": "event_type",
            "range": {
                "end": 7,
                "start": 0
            },
            "slotName": "event_type",
            "value": "meeting"
        }
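        # Expected weights: 1 / (1 + number of slots in the intent's utterance)
        # (intent1 has one slot, intent2 none, intent3 two).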
        weight_intent_1 = 1. / 2.
        weight_intent_2 = 1.
        weight_intent_3 = 1. / 3.
        total_weight = weight_intent_1 + weight_intent_2 + weight_intent_3
        proba_intent2 = weight_intent_2 / total_weight
        proba_intent1 = weight_intent_1 / total_weight
        proba_intent3 = weight_intent_3 / total_weight
        expected_results = [
            extraction_result(intent_classification_result(
                intent_name="intent2", probability=proba_intent2),
                              slots=[]),
            extraction_result(intent_classification_result(
                intent_name="intent1", probability=proba_intent1),
                              slots=[time_slot]),
            extraction_result(intent_classification_result(
                intent_name="intent3", probability=proba_intent3),
                              slots=[event_slot, time_slot])
        ]
        self.assertEqual(expected_results, results)
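
    # mock_get_stop_words is expected to be injected by a mock.patch decorator
    # applied to this test (its return value is set below).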
    def test_should_be_serializable(self, mock_get_stop_words):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: searchFlight
slots:
  - name: origin
    entity: city
  - name: destination
    entity: city
utterances:
  - find me a flight from [origin](Paris) to [destination](New York)
  - I need a flight to [destination](Berlin)

---
type: entity
name: city
values:
  - london
  - [new york, big apple]
  - [paris, city of lights]
            """)

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        mock_get_stop_words.return_value = {"a", "me"}
        config = DeterministicIntentParserConfig(max_queries=42,
                                                 max_pattern_length=100,
                                                 ignore_stop_words=True)
        parser = DeterministicIntentParser(config=config).fit(dataset)

        # When
        parser.persist(self.tmp_file_path)

        # Then
        expected_dict = {
            "config": {
                "unit_name": "deterministic_intent_parser",
                "max_queries": 42,
                "max_pattern_length": 100,
                "ignore_stop_words": True
            },
            "language_code": "en",
            "group_names_to_slot_names": {
                "group0": "destination",
                "group1": "origin",
            },
            "patterns": {
                "searchFlight": [
                    "^\\s*find\\s*flight\\s*from\\s*(?P<group1>%CITY%)\\s*to"
                    "\\s*(?P<group0>%CITY%)\\s*$",
                    "^\\s*i\\s*need\\s*flight\\s*to\\s*(?P<group0>%CITY%)"
                    "\\s*$",
                ]
            },
            "slot_names_to_entities": {
                "searchFlight": {
                    "destination": "city",
                    "origin": "city",
                }
            },
            "stop_words_whitelist": dict()
        }
        metadata = {"unit_name": "deterministic_intent_parser"}
        self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
        self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                               expected_dict)
Example #3
    def test_should_parse_top_intents(self):
        # Given
        text = "foo bar ban"
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - foo [slot1:entity1](bak)
  
---
type: intent
name: intent2
utterances:
  - '[slot2:entity2](foo) baz'
  
---
type: intent
name: intent3
utterances:
  - foo bap""")

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        # pylint:disable=unused-variable
        @IntentParser.register("first_intent_parser", True)
        class FirstIntentParser(MockIntentParser):
            def get_intents(self, text):
                return [
                    intent_classification_result("intent1", 0.5),
                    intent_classification_result("intent2", 0.3),
                    intent_classification_result(None, 0.15),
                    intent_classification_result("intent3", 0.05)
                ]

            def get_slots(self, text, intent):
                if intent == "intent1":
                    return []
                if intent == "intent2":
                    return [unresolved_slot((0, 3), "foo", "entity2", "slot2")]
                return []

        @IntentParser.register("second_intent_parser", True)
        class SecondIntentParser(MockIntentParser):
            def get_intents(self, text):
                return [
                    intent_classification_result("intent2", 0.6),
                    intent_classification_result("intent1", 0.2),
                    intent_classification_result(None, 0.15),
                    intent_classification_result("intent3", 0.05)
                ]

            def get_slots(self, text, intent):
                if intent == "intent1":
                    return [unresolved_slot((0, 3), "foo", "entity1", "slot1")]
                if intent == "intent2":
                    return [
                        unresolved_slot((8, 11), "ban", "entity2", "slot2")
                    ]
                return []

        # pylint:enable=unused-variable

        config = NLUEngineConfig(
            ["first_intent_parser", "second_intent_parser"])
        nlu_engine = SnipsNLUEngine(config).fit(dataset)

        # When
        results = nlu_engine.parse(text, top_n=3)

        # Then
        expected_results = [
            extraction_result(intent_classification_result("intent2", 0.6), [
                custom_slot(unresolved_slot((0, 3), "foo", "entity2", "slot2"))
            ]),
            extraction_result(intent_classification_result("intent1", 0.5), [
                custom_slot(unresolved_slot((0, 3), "foo", "entity1", "slot1"))
            ]),
            extraction_result(intent_classification_result(None, 0.15), []),
        ]
        self.assertListEqual(expected_results, results)
Example #4
    def test_should_serialize_duplicated_intent_parsers(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        # pylint:disable=unused-variable
        @IntentParser.register("my_intent_parser", True)
        class MyIntentParser(MockIntentParser):
            pass

        # pylint:enable=unused-variable

        parsers_configs = ["my_intent_parser", "my_intent_parser"]
        config = NLUEngineConfig(parsers_configs)
        engine = SnipsNLUEngine(config).fit(dataset)

        # When
        engine.persist(self.tmp_file_path)

        # Then
        expected_engine_dict = {
            "unit_name": "nlu_engine",
            "dataset_metadata": {
                "language_code": "en",
                "entities": {
                    "Temperature": {
                        "automatically_extensible": True,
                    }
                },
                "slot_name_mappings": {
                    "MakeCoffee": {
                        "number_of_cups": "snips/number"
                    },
                    "MakeTea": {
                        "beverage_temperature": "Temperature",
                        "number_of_cups": "snips/number"
                    }
                },
            },
            "config": {
                "unit_name":
                "nlu_engine",
                "intent_parsers_configs": [{
                    "unit_name": "my_intent_parser"
                }, {
                    "unit_name": "my_intent_parser"
                }]
            },
            "intent_parsers": ["my_intent_parser", "my_intent_parser_2"],
            "builtin_entity_parser": "builtin_entity_parser",
            "custom_entity_parser": "custom_entity_parser",
            "model_version": snips_nlu.__model_version__,
            "training_package_version": snips_nlu.__version__
        }
        self.assertJsonContent(self.tmp_file_path / "nlu_engine.json",
                               expected_engine_dict)
        self.assertJsonContent(
            self.tmp_file_path / "my_intent_parser" / "metadata.json", {
                "unit_name": "my_intent_parser",
                "fitted": True
            })
        self.assertJsonContent(
            self.tmp_file_path / "my_intent_parser_2" / "metadata.json", {
                "unit_name": "my_intent_parser",
                "fitted": True
            })
Example #5
def load_dataset(converted: str) -> JsonDict:
    from snips_nlu.dataset import Dataset

    filenames = glob(os.path.join(converted, "*.yaml"))
    return Dataset.from_yaml_files("en", filenames).json
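
A minimal usage sketch (the directory name below is illustrative, not from the source):

dataset_json = load_dataset("converted_yaml")
print(dataset_json["language"])  # "en"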
Example #6
    def test_should_be_serializable(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](second_entity)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        shared = self.get_shared_data(dataset)
        slot_filler = CRFSlotFiller(config, **shared)
        intent = "my_intent"
        slot_filler.fit(dataset, intent=intent)

        # When
        slot_filler.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, {"unit_name": "crf_slot_filler"})

        expected_crf_file = Path(slot_filler.crf_model.modelfile.name).name
        self.assertTrue((self.tmp_file_path / expected_crf_file).exists())

        expected_feature_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": "en"},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        expected_config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=expected_feature_factories)
        expected_slot_filler_dict = {
            "crf_model_file": expected_crf_file,
            "language_code": "en",
            "config": expected_config.to_dict(),
            "intent": intent,
            "slot_name_mapping": {
                "slot1": "entity1",
                "slot2": "entity2",
            }
        }
        slot_filler_path = self.tmp_file_path / "slot_filler.json"
        self.assertJsonContent(slot_filler_path, expected_slot_filler_dict)
Example #7
    def test_fit_transform_should_be_consistent_with_transform(self):
        # Here we mainly test that the output of fit_transform is
        # the same as the result of fit followed by transform.
        # In particular, we want to make sure that feature indexes do not
        # get mixed up after feature selection.

        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
        """)
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        config = FeaturizerConfig(added_cooccurrence_feature_ratio=.5)
        shared = self.get_shared_data(dataset)
        featurizer = Featurizer(config=config, **shared)

        utterances = [{
            "data": [{
                "text": "hÉllo wOrld "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }]
        }, {
            "data": [{
                "text": "beauTiful World "
            }, {
                "text": "entity 1",
                "entity": "entity_1"
            }]
        }, {
            "data": [{
                "text": "Bird bïrdy"
            }]
        }, {
            "data": [{
                "text": "Bird bïrdy"
            }]
        }]

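        # intent class label for each of the four utterances above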
        classes = [0, 0, 1, 1]

        # When
        x_0 = featurizer.fit_transform(dataset, utterances, classes,
                                       max(classes))
        x_1 = featurizer.transform(utterances)

        # Then
        self.assertListEqual(x_0.todense().tolist(), x_1.todense().tolist())
Example #8
    def test_preprocess(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        utterances = [
            text_to_utterance("hÉllo wOrld Éntity_2"),
            text_to_utterance("beauTiful World entity 1"),
            text_to_utterance("Bird bïrdy"),
            text_to_utterance("Bird birdy"),
        ]

        config = TfidfVectorizerConfig(use_stemming=True,
                                       word_clusters_name="my_word_clusters")
        vectorizer = TfidfVectorizer(
            config=config,
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)
        vectorizer._language = language
        vectorizer.builtin_entity_scope = {"snips/number"}

        # When
        processed_data = vectorizer._preprocess(utterances)
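        # transpose the parallel lists returned by _preprocess into one tuple
        # per utterance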
        processed_data = list(zip(*processed_data))

        # Then
        u_0 = {"data": [{"text": "hello world entity_2"}]}

        u_1 = {"data": [{"text": "beauty world ent 1"}]}

        u_2 = {"data": [{"text": "bird bird"}]}

        u_3 = {"data": [{"text": "bird bird"}]}

        ent_0 = {
            "entity_kind": "entity_2",
            "value": "entity_2",
            "resolved_value": "Éntity 2",
            "range": {
                "start": 12,
                "end": 20
            }
        }
        num_0 = {
            "entity_kind": "snips/number",
            "value": "2",
            "resolved_value": {
                "value": 2.0,
                "kind": "Number"
            },
            "range": {
                "start": 19,
                "end": 20
            }
        }
        ent_11 = {
            "entity_kind": "entity_1",
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 13,
                "end": 18
            }
        }
        ent_12 = {
            "entity_kind": "entity_2",
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 13,
                "end": 18
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "1",
            "range": {
                "start": 23,
                "end": 24
            },
            "resolved_value": {
                "value": 1.0,
                "kind": "Number"
            },
        }

        expected_data = [(u_0, [num_0], [ent_0], []),
                         (u_1, [num_1], [ent_11,
                                         ent_12], ["cluster_1", "cluster_3"]),
                         (u_2, [], [], []), (u_3, [], [], ["cluster_2"])]

        self.assertSequenceEqual(expected_data, processed_data)
Example #9
def train(
    sentences_dict: typing.Dict[str, str],
    language: str,
    slots_dict: typing.Optional[typing.Dict[str, typing.List[str]]] = None,
    engine_path: typing.Optional[typing.Union[str, Path]] = None,
    dataset_path: typing.Optional[typing.Union[str, Path]] = None,
) -> SnipsNLUEngine:
    """Generate Snips YAML dataset from Rhasspy sentences/slots."""
    slots_dict = slots_dict or {}

    _LOGGER.debug("Creating Snips engine for language %s", language)
    engine = SnipsNLUEngine(config=DEFAULT_CONFIGS[language])

    # Parse JSGF sentences
    _LOGGER.debug("Parsing sentences")
    with io.StringIO() as ini_file:
        # Join as single ini file
        for lines in sentences_dict.values():
            print(lines, file=ini_file)
            print("", file=ini_file)

        intents = rhasspynlu.parse_ini(ini_file.getvalue())

    # Split into sentences and rule/slot replacements
    sentences, replacements = rhasspynlu.ini_jsgf.split_rules(intents)

    for intent_sentences in sentences.values():
        for sentence in intent_sentences:
            rhasspynlu.jsgf.walk_expression(sentence,
                                            rhasspynlu.number_range_transform,
                                            replacements)

    # Convert to directed graph *without* expanding slots
    # (e.g., $rhasspy/number)
    _LOGGER.debug("Converting to intent graph")
    intent_graph = rhasspynlu.sentences_to_graph(sentences,
                                                 replacements=replacements,
                                                 expand_slots=False)

    # Get start/end nodes for graph
    start_node, end_node = rhasspynlu.jsgf_graph.get_start_end_nodes(
        intent_graph)
    assert (start_node
            is not None) and (end_node
                              is not None), "Missing start/end node(s)"

    if dataset_path:
        # Use user file
        dataset_file = open(dataset_path, "w+")
    else:
        # Use temporary file
        dataset_file = typing.cast(
            typing.TextIO, tempfile.NamedTemporaryFile(suffix=".yml",
                                                       mode="w+"))
        dataset_path = dataset_file.name

    with dataset_file:
        _LOGGER.debug("Writing YAML dataset to %s", dataset_path)

        # Walk first layer of edges with intents
        for _, intent_node, edge_data in intent_graph.edges(start_node,
                                                            data=True):
            intent_name: str = edge_data["olabel"][9:]

            # New intent
            print("---", file=dataset_file)
            print("type: intent", file=dataset_file)
            print("name:", quote(intent_name), file=dataset_file)
            print("utterances:", file=dataset_file)

            # Get all paths through the graph (utterances)
            used_utterances: typing.Set[str] = set()
            paths = nx.all_simple_paths(intent_graph, intent_node, end_node)
            for path in paths:
                utterance = []
                entity_name = None
                slot_name = None
                slot_value = None

                # Walk utterance edges
                for from_node, to_node in rhasspynlu.utils.pairwise(path):
                    edge_data = intent_graph.edges[(from_node, to_node)]
                    ilabel = edge_data.get("ilabel")
                    olabel = edge_data.get("olabel")
                    if olabel:
                        if olabel.startswith("__begin__"):
                            slot_name = olabel[9:]
                            entity_name = None
                            slot_value = ""
                        elif olabel.startswith("__end__"):
                            if entity_name == "rhasspy/number":
                                # Transform to Snips number
                                entity_name = "snips/number"
                            elif not entity_name:
                                # Collect actual value
                                assert (
                                    slot_name and slot_value
                                ), f"No slot name or value (name={slot_name}, value={slot_value})"

                                entity_name = slot_name
                                slot_values = slots_dict.get(slot_name)
                                if not slot_values:
                                    slot_values = []
                                    slots_dict[slot_name] = slot_values

                                slot_values.append(slot_value.strip())

                            # Reference slot/entity (values will be added later)
                            utterance.append(f"[{slot_name}:{entity_name}]")

                            # Reset current slot/entity
                            entity_name = None
                            slot_name = None
                            slot_value = None
                        elif olabel.startswith("__source__"):
                            # Use Rhasspy slot name as entity
                            entity_name = olabel[10:]

                    if ilabel:
                        # Add to current slot/entity value
                        if slot_name and (not entity_name):
                            slot_value += ilabel + " "
                        else:
                            # Add directly to utterance
                            utterance.append(ilabel)
                    elif (olabel and (not olabel.startswith("__"))
                          and slot_name and (not slot_value)
                          and (not entity_name)):
                        slot_value += olabel + " "

                if utterance:
                    utterance_str = " ".join(utterance)
                    if utterance_str not in used_utterances:
                        # Write utterance
                        print("  -", quote(utterance_str), file=dataset_file)
                        used_utterances.add(utterance_str)

            print("", file=dataset_file)

        # Write entities
        for slot_name, values in slots_dict.items():
            if slot_name.startswith("$"):
                # Remove arguments and $
                slot_name = slot_name.split(",")[0][1:]

            # Skip numbers
            if slot_name in {"rhasspy/number"}:
                # Should have been converted already to snips/number
                continue

            # Keep only unique values
            values_set = set(values)

            print("---", file=dataset_file)
            print("type: entity", file=dataset_file)
            print("name:", quote(slot_name), file=dataset_file)
            print("values:", file=dataset_file)

            slot_graph = rhasspynlu.sentences_to_graph({
                slot_name: [
                    rhasspynlu.jsgf.Sentence.parse(value)
                    for value in values_set
                ]
            })

            start_node, end_node = rhasspynlu.jsgf_graph.get_start_end_nodes(
                slot_graph)
            n_data = slot_graph.nodes(data=True)
            for path in nx.all_simple_paths(slot_graph, start_node, end_node):
                words = []
                for node in path:
                    node_data = n_data[node]
                    word = node_data.get("word")
                    if word:
                        words.append(word)

                if words:
                    print("  -", quote(" ".join(words)), file=dataset_file)

            print("", file=dataset_file)

        # ------------
        # Train engine
        # ------------

        if engine_path:
            # Delete existing engine
            engine_path = Path(engine_path)
            engine_path.parent.mkdir(exist_ok=True)

            if engine_path.is_dir():
                # Snips will fail if the directory exists
                _LOGGER.debug("Removing existing engine at %s", engine_path)
                shutil.rmtree(engine_path)
            elif engine_path.is_file():
                _LOGGER.debug("Removing unexpected file at %s", engine_path)
                engine_path.unlink()

        _LOGGER.debug("Training engine")
        dataset_file.seek(0)
        dataset = Dataset.from_yaml_files(language, [dataset_file])
        engine = engine.fit(dataset)

    if engine_path:
        # Save engine
        engine.persist(engine_path)
        _LOGGER.debug("Engine saved to %s", engine_path)

    return engine
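
A hedged usage sketch for train() (the ini content, intent name, and paths are illustrative, and DEFAULT_CONFIGS is assumed to contain an entry for the requested language):

sentences = {
    "intents.ini": "[GetTime]\nwhat time is it\ntell me the time\n"
}
engine = train(sentences, "en", engine_path="snips_engine")
print(engine.parse("what time is it"))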
Example #10
    def test_preprocess(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
    """)
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        u_0 = text_to_utterance("hÉllo wOrld Éntity_2")
        u_1 = text_to_utterance("beauTiful World entity 1")
        u_2 = text_to_utterance("Bird bïrdy")
        u_3 = text_to_utterance("Bird birdy")
        utterances = [u_0, u_1, u_2, u_3]

        vectorizer = CooccurrenceVectorizer(
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)

        vectorizer._language = language

        # When
        processed_data = vectorizer._preprocess(utterances)
        processed_data = list(zip(*processed_data))

        # Then
        ent_0 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "resolved_value": "Éntity 2",
            "range": {
                "start": 12,
                "end": 20
            }
        }
        num_0 = {
            "entity_kind": "snips/number",
            "value": "2",
            "resolved_value": {
                "value": 2.0,
                "kind": "Number"
            },
            "range": {
                "start": 19,
                "end": 20
            }
        }
        ent_11 = {
            "entity_kind": "entity_1",
            "value": "entity 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        ent_12 = {
            "entity_kind": "entity_2",
            "value": "entity 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "1",
            "range": {
                "start": 23,
                "end": 24
            },
            "resolved_value": {
                "value": 1.0,
                "kind": "Number"
            }
        }

        expected_data = [(u_0, [num_0], [ent_0]),
                         (u_1, [num_1], [ent_11, ent_12]), (u_2, [], []),
                         (u_3, [], [])]

        self.assertSequenceEqual(expected_data, processed_data)
Example #11
    def test_entity_match_factory(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)
- this is [entity2](second_entity)""")

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        config = {
            "factory_name": "entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
                "use_stemming": True
            },
            "offsets": [0]
        }

        tokens = tokenize("my first entity and second_entity", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        resources = {STEMS: dict()}
        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)
        factory = CRFFeatureFactory.from_config(
            config,
            custom_entity_parser=custom_entity_parser,
            resources=resources)
        factory.fit(dataset, "my_intent")

        # When
        features = factory.build_features()
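        # sort by base_name so the entity1 feature comes before the entity2 one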
        features = sorted(features, key=lambda f: f.base_name)
        res0 = features[0].compute(0, cache)
        res1 = features[0].compute(1, cache)
        res2 = features[0].compute(2, cache)
        res3 = features[0].compute(3, cache)
        res4 = features[0].compute(4, cache)

        res5 = features[1].compute(0, cache)
        res6 = features[1].compute(1, cache)
        res7 = features[1].compute(2, cache)
        res8 = features[1].compute(3, cache)
        res9 = features[1].compute(4, cache)

        # Then
        self.assertIsInstance(factory, CustomEntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name, "entity_match_entity1")
        self.assertEqual(features[1].base_name, "entity_match_entity2")
        self.assertEqual(res0, BEGINNING_PREFIX)
        self.assertEqual(res1, INSIDE_PREFIX)
        self.assertEqual(res2, LAST_PREFIX)
        self.assertEqual(res3, None)
        self.assertEqual(res4, None)

        self.assertEqual(res5, None)
        self.assertEqual(res6, None)
        self.assertEqual(res7, None)
        self.assertEqual(res8, None)
        self.assertEqual(res9, UNIT_PREFIX)
Example #12
    def build_snips_data_task1(self):
        """ Build snips data from all brat annotation object 

        :return: Snips Dataset built from all brat annotation objects
        :rtype: snips_nlu.dataset.Dataset
        """
        import yaml
        import io
        from snips_nlu.dataset import Dataset
        import re
        from sklearn.model_selection import train_test_split

        print("--> Creating snips nlu data training...")
        stream_results = []
        pandas_train = pandas.read_csv(train_task_1.absolute())
        stream_counter, stream_no_counter, utterances = {}, {}, []
        stream_group_ant_conq = {}

        counter_list, no_counter_list, entities = [], [], []
        for i, row in pandas_train.iterrows():
            sent = row['sentence']
            gold = row['gold_label']
            utterances.append(((sent, gold), 1))

        filename_train = source / "snips_semeval_2020_train_task1_cross_{}.yaml".format(
            self.vers)
        filename_test = source / "snips_semeval_2020_test_task1_cross_{}.yaml".format(
            self.vers)

        if self.cross:
            utter_train = [x[0] for x in utterances]
            utter_test = [x[1] for x in utterances]

            train, test, label_train, label_test = train_test_split(
                utter_train, utter_test, test_size=0.2, random_state=42)

            if not Path(filename_train).exists():
                stream_results = self.build_intent_train_task1(train,
                                                               split="train")
                print("--> Writing snips nlu TRAINING data to file...")
                with codecs.open(filename_train, "w", encoding="utf8") as pt:
                    yaml.dump_all(stream_results, pt)

            if not Path(filename_test).exists():
                stream_results = self.build_intent_train_task1(test,
                                                               split="test")
                print("--> Writing snips nlu TESTING data to file...")
                with codecs.open(filename_test, "w", encoding="utf8") as pt:
                    yaml.dump_all(stream_results, pt)

            json_dataset_train, json_dataset_test = [], []
            with codecs.open(filename_train, "r", encoding="utf8") as pt:
                data_counterfact = io.StringIO(pt.read().strip().replace(
                    '', ''))
                json_dataset_train = Dataset.from_yaml_files(
                    self.lang, [data_counterfact]).json
            with codecs.open(filename_test, "r", encoding="utf8") as pt:
                data_counterfact = io.StringIO(pt.read().strip().replace(
                    '', ''))
                json_dataset_test = Dataset.from_yaml_files(
                    self.lang, [data_counterfact]).json

            DATASET_JSON = (json_dataset_train, json_dataset_test)
            return DATASET_JSON
        else:
            utter_train = [x[0] for x in utterances]
            self.vers = "all_" + self.vers
            filename_train = source / "snips_semeval_2020_train_task1_main_{}.yaml".format(
                self.vers)

            if not Path(filename_train).exists():
                stream_results = self.build_intent_train_task1(utter_train)
                print("--> Writing snips nlu TRAINING data to file...")
                with codecs.open(filename_train, "w", encoding="utf8") as pt:
                    yaml.dump_all(stream_results, pt)

            json_dataset_train = []
            with codecs.open(filename_train, "r", encoding="utf8") as pt:
                data_counterfact = io.StringIO(pt.read().strip().replace(
                    '', ''))
                json_dataset_train = Dataset.from_yaml_files(
                    self.lang, [data_counterfact]).json
                return json_dataset_train
Example #13
from django.shortcuts import render
from .models import Entity, EntityRecord, EntitySlot, Intent, Utterance, Synonym
from .write_yaml import write_yaml, make_nlu_model_json, make_nlu_model_yaml, write_json
from rest_framework.views import APIView
from rest_framework.response import Response
import json

from snips_nlu import SnipsNLUEngine
from snips_nlu.dataset import Dataset
from snips_nlu.default_configs import CONFIG_EN
from pprint import pprint

# Create your views here.

dataset = Dataset.from_yaml_files("en", ["data.yaml"])
j = dataset.json
m = json.dumps(j)
sample_dataset = json.loads(m)

nlu_engine = SnipsNLUEngine(config=CONFIG_EN)
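# Fit at import time; force_retrain=False means the engine would not be
# retrained if it were already fitted.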
nlu_engine = nlu_engine.fit(sample_dataset, force_retrain=False)


class MakeYAMLFromDB(APIView):
    def get(self, request):
        global nlu_engine
        d = request.data
        el = []
        entities = list(Entity.objects.all().values(
            'name', 'automatically_extensible', 'use_synonyms',
            'matching_strictness'))
Example #14
    def test_preprocess_for_training(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        utterances = [{
            "data": [{
                "text": "hÉllo wOrld "
            }, {
                "text": " yo "
            }, {
                "text": " yo "
            }, {
                "text": "yo "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }, {
                "text": " "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }]
        }, {
            "data": [{
                "text": "beauTiful World "
            }, {
                "text": "entity 1",
                "entity": "entity_1"
            }, {
                "text": " "
            }, {
                "text": "2",
                "entity": "snips/number"
            }]
        }, {
            "data": [{
                "text": "Bird bïrdy"
            }]
        }, {
            "data": [{
                "text": "Bird birdy"
            }]
        }]

        config = TfidfVectorizerConfig(use_stemming=True,
                                       word_clusters_name="my_word_clusters")
        vectorizer = TfidfVectorizer(
            config=config,
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)
        vectorizer._language = language

        # When
        processed_data = vectorizer._preprocess(utterances, training=True)
        processed_data = list(zip(*processed_data))

        # Then
        u_0 = {
            "data": [{
                "text": "hello world"
            }, {
                "text": "yo"
            }, {
                "text": "yo"
            }, {
                "text": "yo"
            }, {
                "text": "entity_2",
                "entity": "entity_2"
            }, {
                "text": ""
            }, {
                "text": "entity_2",
                "entity": "entity_2"
            }]
        }
        u_1 = {
            "data": [{
                "text": "beauty world"
            }, {
                "text": "ent 1",
                "entity": "entity_1"
            }, {
                "text": ""
            }, {
                "text": "2",
                "entity": "snips/number"
            }]
        }
        u_2 = {"data": [{"text": "bird bird"}]}

        ent_00 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "range": {
                "start": 23,
                "end": 31
            }
        }
        ent_01 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "range": {
                "start": 32,
                "end": 40
            }
        }

        ent_1 = {
            "entity_kind": "entity_1",
            "value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "2",
            "range": {
                "start": 25,
                "end": 26
            }
        }

        expected_data = [(u_0, [], [ent_00, ent_01], []),
                         (u_1, [num_1], [ent_1], ["cluster_1", "cluster_3"]),
                         (u_2, [], [], []), (u_2, [], [], ["cluster_2"])]

        self.assertSequenceEqual(expected_data, processed_data)
Example #15
    def test_get_entity_scopes(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - meeting [schedule_time:snips/datetime](today)

---
type: intent
name: intent2
utterances:
  - hello world

---
type: intent
name: intent3
utterances:
  - what will be the weather [weather_time:snips/datetime](tomorrow)
  
---
type: intent
name: intent4
utterances:
  - find a flight for [city](Paris) [flight_time:snips/datetime](tomorrow)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        # When
        entity_scopes = _get_entity_scopes(dataset)

        # Then
        expected_scopes = [
            {
                "entity_scope": {
                    "builtin": ["snips/datetime"],
                    "custom": []
                },
                "intent_group": ["intent1", "intent3"]
            },
            {
                "entity_scope": {
                    "builtin": [],
                    "custom": []
                },
                "intent_group": ["intent2"]
            },
            {
                "entity_scope": {
                    "builtin": ["snips/datetime"],
                    "custom": ["city"]
                },
                "intent_group": ["intent4"]
            }
        ]

        def sort_key(group_scope):
            return " ".join(group_scope["intent_group"])

        self.assertListEqual(sorted(expected_scopes, key=sort_key),
                             sorted(entity_scopes, key=sort_key))