def test_should_parse_top_intents(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - meeting [time:snips/datetime](today)

---
type: intent
name: intent2
utterances:
  - meeting tomorrow

---
type: intent
name: intent3
utterances:
  - "[event_type](call) [time:snips/datetime](at 9pm)"

---
type: entity
name: event_type
values:
  - meeting
  - feedback session""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)
    text = "meeting tomorrow"

    # When
    results = parser.parse(text, top_n=3)

    # Then
    time_slot = {
        "entity": "snips/datetime",
        "range": {"end": 16, "start": 8},
        "slotName": "time",
        "value": "tomorrow"
    }
    event_slot = {
        "entity": "event_type",
        "range": {"end": 7, "start": 0},
        "slotName": "event_type",
        "value": "meeting"
    }
    weight_intent_1 = 1. / 2.
    weight_intent_2 = 1.
    weight_intent_3 = 1. / 3.
    total_weight = weight_intent_1 + weight_intent_2 + weight_intent_3
    proba_intent2 = weight_intent_2 / total_weight
    proba_intent1 = weight_intent_1 / total_weight
    proba_intent3 = weight_intent_3 / total_weight
    expected_results = [
        extraction_result(
            intent_classification_result(
                intent_name="intent2", probability=proba_intent2),
            slots=[]),
        extraction_result(
            intent_classification_result(
                intent_name="intent1", probability=proba_intent1),
            slots=[time_slot]),
        extraction_result(
            intent_classification_result(
                intent_name="intent3", probability=proba_intent3),
            slots=[event_slot, time_slot])
    ]
    self.assertEqual(expected_results, results)
def test_should_be_serializable(self, mock_get_stop_words):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: searchFlight
slots:
  - name: origin
    entity: city
  - name: destination
    entity: city
utterances:
  - find me a flight from [origin](Paris) to [destination](New York)
  - I need a flight to [destination](Berlin)

---
type: entity
name: city
values:
  - london
  - [new york, big apple]
  - [paris, city of lights]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    mock_get_stop_words.return_value = {"a", "me"}
    config = DeterministicIntentParserConfig(max_queries=42,
                                             max_pattern_length=100,
                                             ignore_stop_words=True)
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # When
    parser.persist(self.tmp_file_path)

    # Then
    expected_dict = {
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_pattern_length": 100,
            "ignore_stop_words": True
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "group0": "destination",
            "group1": "origin",
        },
        "patterns": {
            "searchFlight": [
                "^\\s*find\\s*flight\\s*from\\s*(?P<group1>%CITY%)\\s*to"
                "\\s*(?P<group0>%CITY%)\\s*$",
                "^\\s*i\\s*need\\s*flight\\s*to\\s*(?P<group0>%CITY%)"
                "\\s*$",
            ]
        },
        "slot_names_to_entities": {
            "searchFlight": {
                "destination": "city",
                "origin": "city",
            }
        },
        "stop_words_whitelist": dict()
    }
    metadata = {"unit_name": "deterministic_intent_parser"}
    self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
    self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                           expected_dict)
def test_should_parse_top_intents(self):
    # Given
    text = "foo bar ban"
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - foo [slot1:entity1](bak)

---
type: intent
name: intent2
utterances:
  - '[slot2:entity2](foo) baz'

---
type: intent
name: intent3
utterances:
  - foo bap""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    # pylint:disable=unused-variable
    @IntentParser.register("first_intent_parser", True)
    class FirstIntentParser(MockIntentParser):
        def get_intents(self, text):
            return [
                intent_classification_result("intent1", 0.5),
                intent_classification_result("intent2", 0.3),
                intent_classification_result(None, 0.15),
                intent_classification_result("intent3", 0.05)
            ]

        def get_slots(self, text, intent):
            if intent == "intent1":
                return []
            if intent == "intent2":
                return [unresolved_slot((0, 3), "foo", "entity2", "slot2")]
            return []

    @IntentParser.register("second_intent_parser", True)
    class SecondIntentParser(MockIntentParser):
        def get_intents(self, text):
            return [
                intent_classification_result("intent2", 0.6),
                intent_classification_result("intent1", 0.2),
                intent_classification_result(None, 0.15),
                intent_classification_result("intent3", 0.05)
            ]

        def get_slots(self, text, intent):
            if intent == "intent1":
                return [unresolved_slot((0, 3), "foo", "entity1", "slot1")]
            if intent == "intent2":
                return [
                    unresolved_slot((8, 11), "ban", "entity2", "slot2")
                ]
            return []
    # pylint:enable=unused-variable

    config = NLUEngineConfig(
        ["first_intent_parser", "second_intent_parser"])
    nlu_engine = SnipsNLUEngine(config).fit(dataset)

    # When
    results = nlu_engine.parse(text, top_n=3)

    # Then
    expected_results = [
        extraction_result(
            intent_classification_result("intent2", 0.6),
            [custom_slot(unresolved_slot((0, 3), "foo", "entity2",
                                         "slot2"))]),
        extraction_result(
            intent_classification_result("intent1", 0.5),
            [custom_slot(unresolved_slot((0, 3), "foo", "entity1",
                                         "slot1"))]),
        extraction_result(intent_classification_result(None, 0.15), []),
    ]
    self.assertListEqual(expected_results, results)
def test_should_serialize_duplicated_intent_parsers(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    # pylint:disable=unused-variable
    @IntentParser.register("my_intent_parser", True)
    class MyIntentParser(MockIntentParser):
        pass
    # pylint:enable=unused-variable

    parsers_configs = ["my_intent_parser", "my_intent_parser"]
    config = NLUEngineConfig(parsers_configs)
    engine = SnipsNLUEngine(config).fit(dataset)

    # When
    engine.persist(self.tmp_file_path)

    # Then
    expected_engine_dict = {
        "unit_name": "nlu_engine",
        "dataset_metadata": {
            "language_code": "en",
            "entities": {
                "Temperature": {
                    "automatically_extensible": True,
                }
            },
            "slot_name_mappings": {
                "MakeCoffee": {
                    "number_of_cups": "snips/number"
                },
                "MakeTea": {
                    "beverage_temperature": "Temperature",
                    "number_of_cups": "snips/number"
                }
            },
        },
        "config": {
            "unit_name": "nlu_engine",
            "intent_parsers_configs": [
                {"unit_name": "my_intent_parser"},
                {"unit_name": "my_intent_parser"}
            ]
        },
        "intent_parsers": ["my_intent_parser", "my_intent_parser_2"],
        "builtin_entity_parser": "builtin_entity_parser",
        "custom_entity_parser": "custom_entity_parser",
        "model_version": snips_nlu.__model_version__,
        "training_package_version": snips_nlu.__version__
    }
    self.assertJsonContent(self.tmp_file_path / "nlu_engine.json",
                           expected_engine_dict)
    self.assertJsonContent(
        self.tmp_file_path / "my_intent_parser" / "metadata.json",
        {"unit_name": "my_intent_parser", "fitted": True})
    self.assertJsonContent(
        self.tmp_file_path / "my_intent_parser_2" / "metadata.json",
        {"unit_name": "my_intent_parser", "fitted": True})
def load_dataset(converted: str) -> JsonDict:
    from snips_nlu.dataset import Dataset

    filenames = glob(os.path.join(converted, "*.yaml"))
    return Dataset.from_yaml_files("en", filenames).json
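# A minimal usage sketch for load_dataset (the "output/converted" directory
# is hypothetical; it is assumed to hold the YAML files produced by a prior
# conversion step):
#
#     dataset_json = load_dataset("output/converted")
#     print(sorted(dataset_json["intents"].keys()))
#
# Dataset.from_yaml_files accepts file paths as well as file-like objects,
# so the glob'd filenames can be passed to it directly, as above.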
def test_should_be_serializable(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
  - this is [slot1:entity1](my first entity)
  - this is [slot2:entity2](second_entity)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    features_factories = [
        {
            "factory_name": ShapeNgramFactory.name,
            "args": {"n": 1},
            "offsets": [0]
        },
        {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }
    ]
    config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=features_factories)
    shared = self.get_shared_data(dataset)
    slot_filler = CRFSlotFiller(config, **shared)
    intent = "my_intent"
    slot_filler.fit(dataset, intent=intent)

    # When
    slot_filler.persist(self.tmp_file_path)

    # Then
    metadata_path = self.tmp_file_path / "metadata.json"
    self.assertJsonContent(metadata_path, {"unit_name": "crf_slot_filler"})

    expected_crf_file = Path(slot_filler.crf_model.modelfile.name).name
    self.assertTrue((self.tmp_file_path / expected_crf_file).exists())

    expected_feature_factories = [
        {
            "factory_name": ShapeNgramFactory.name,
            "args": {"n": 1, "language_code": "en"},
            "offsets": [0]
        },
        {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }
    ]
    expected_config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=expected_feature_factories)
    expected_slot_filler_dict = {
        "crf_model_file": expected_crf_file,
        "language_code": "en",
        "config": expected_config.to_dict(),
        "intent": intent,
        "slot_name_mapping": {
            "slot1": "entity1",
            "slot2": "entity2",
        }
    }
    slot_filler_path = self.tmp_file_path / "slot_filler.json"
    self.assertJsonContent(slot_filler_path, expected_slot_filler_dict)
def test_fit_transform_should_be_consistent_with_transform(self):
    # Here we mainly test that the output of fit_transform is the same as
    # the result of fit followed by transform. The goal is to catch cases
    # where feature indexes get mixed up after feature selection.

    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    config = FeaturizerConfig(added_cooccurrence_feature_ratio=.5)
    shared = self.get_shared_data(dataset)
    featurizer = Featurizer(config=config, **shared)

    utterances = [
        {
            "data": [
                {"text": "hÉllo wOrld "},
                {"text": "Éntity_2", "entity": "entity_2"}
            ]
        },
        {
            "data": [
                {"text": "beauTiful World "},
                {"text": "entity 1", "entity": "entity_1"}
            ]
        },
        {"data": [{"text": "Bird bïrdy"}]},
        {"data": [{"text": "Bird bïrdy"}]}
    ]
    classes = [0, 0, 1, 1]

    # When
    x_0 = featurizer.fit_transform(dataset, utterances, classes,
                                   max(classes))
    x_1 = featurizer.transform(utterances)

    # Then
    self.assertListEqual(x_0.todense().tolist(), x_1.todense().tolist())
def test_preprocess(self):
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - dummy utterance

---
type: entity
name: entity_1
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)

    utterances = [
        text_to_utterance("hÉllo wOrld Éntity_2"),
        text_to_utterance("beauTiful World entity 1"),
        text_to_utterance("Bird bïrdy"),
        text_to_utterance("Bird birdy"),
    ]

    config = TfidfVectorizerConfig(
        use_stemming=True, word_clusters_name="my_word_clusters")
    vectorizer = TfidfVectorizer(
        config=config,
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    vectorizer._language = language
    vectorizer.builtin_entity_scope = {"snips/number"}

    # When
    processed_data = vectorizer._preprocess(utterances)
    processed_data = list(zip(*processed_data))

    # Then
    u_0 = {"data": [{"text": "hello world entity_2"}]}
    u_1 = {"data": [{"text": "beauty world ent 1"}]}
    u_2 = {"data": [{"text": "bird bird"}]}
    u_3 = {"data": [{"text": "bird bird"}]}

    ent_0 = {
        "entity_kind": "entity_2",
        "value": "entity_2",
        "resolved_value": "Éntity 2",
        "range": {"start": 12, "end": 20}
    }
    num_0 = {
        "entity_kind": "snips/number",
        "value": "2",
        "resolved_value": {"value": 2.0, "kind": "Number"},
        "range": {"start": 19, "end": 20}
    }
    ent_11 = {
        "entity_kind": "entity_1",
        "value": "ent 1",
        "resolved_value": "entity 1",
        "range": {"start": 13, "end": 18}
    }
    ent_12 = {
        "entity_kind": "entity_2",
        "value": "ent 1",
        "resolved_value": "entity 1",
        "range": {"start": 13, "end": 18}
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "1",
        "resolved_value": {"value": 1.0, "kind": "Number"},
        "range": {"start": 23, "end": 24}
    }

    expected_data = [
        (u_0, [num_0], [ent_0], []),
        (u_1, [num_1], [ent_11, ent_12], ["cluster_1", "cluster_3"]),
        (u_2, [], [], []),
        (u_3, [], [], ["cluster_2"])
    ]
    self.assertSequenceEqual(expected_data, processed_data)
def train(
    sentences_dict: typing.Dict[str, str],
    language: str,
    slots_dict: typing.Optional[typing.Dict[str, typing.List[str]]] = None,
    engine_path: typing.Optional[typing.Union[str, Path]] = None,
    dataset_path: typing.Optional[typing.Union[str, Path]] = None,
) -> SnipsNLUEngine:
    """Generate Snips YAML dataset from Rhasspy sentences/slots."""
    slots_dict = slots_dict or {}

    _LOGGER.debug("Creating Snips engine for language %s", language)
    engine = SnipsNLUEngine(config=DEFAULT_CONFIGS[language])

    # Parse JSGF sentences
    _LOGGER.debug("Parsing sentences")
    with io.StringIO() as ini_file:
        # Join as single ini file
        for lines in sentences_dict.values():
            print(lines, file=ini_file)
            print("", file=ini_file)

        intents = rhasspynlu.parse_ini(ini_file.getvalue())

    # Split into sentences and rule/slot replacements
    sentences, replacements = rhasspynlu.ini_jsgf.split_rules(intents)

    for intent_sentences in sentences.values():
        for sentence in intent_sentences:
            rhasspynlu.jsgf.walk_expression(
                sentence, rhasspynlu.number_range_transform, replacements
            )

    # Convert to directed graph *without* expanding slots
    # (e.g., $rhasspy/number)
    _LOGGER.debug("Converting to intent graph")
    intent_graph = rhasspynlu.sentences_to_graph(
        sentences, replacements=replacements, expand_slots=False
    )

    # Get start/end nodes for graph
    start_node, end_node = rhasspynlu.jsgf_graph.get_start_end_nodes(
        intent_graph
    )
    assert (start_node is not None) and (
        end_node is not None
    ), "Missing start/end node(s)"

    if dataset_path:
        # Use user file
        dataset_file = open(dataset_path, "w+")
    else:
        # Use temporary file
        dataset_file = typing.cast(
            typing.TextIO,
            tempfile.NamedTemporaryFile(suffix=".yml", mode="w+"),
        )
        dataset_path = dataset_file.name

    with dataset_file:
        _LOGGER.debug("Writing YAML dataset to %s", dataset_path)

        # Walk first layer of edges with intents
        for _, intent_node, edge_data in intent_graph.edges(
            start_node, data=True
        ):
            intent_name: str = edge_data["olabel"][9:]

            # New intent
            print("---", file=dataset_file)
            print("type: intent", file=dataset_file)
            print("name:", quote(intent_name), file=dataset_file)
            print("utterances:", file=dataset_file)

            # Get all paths through the graph (utterances)
            used_utterances: typing.Set[str] = set()
            paths = nx.all_simple_paths(intent_graph, intent_node, end_node)
            for path in paths:
                utterance = []
                entity_name = None
                slot_name = None
                slot_value = None

                # Walk utterance edges
                for from_node, to_node in rhasspynlu.utils.pairwise(path):
                    edge_data = intent_graph.edges[(from_node, to_node)]
                    ilabel = edge_data.get("ilabel")
                    olabel = edge_data.get("olabel")
                    if olabel:
                        if olabel.startswith("__begin__"):
                            slot_name = olabel[9:]
                            entity_name = None
                            slot_value = ""
                        elif olabel.startswith("__end__"):
                            if entity_name == "rhasspy/number":
                                # Transform to Snips number
                                entity_name = "snips/number"
                            elif not entity_name:
                                # Collect actual value
                                assert (
                                    slot_name and slot_value
                                ), f"No slot name or value (name={slot_name}, value={slot_value})"
                                entity_name = slot_name
                                slot_values = slots_dict.get(slot_name)
                                if not slot_values:
                                    slot_values = []
                                    slots_dict[slot_name] = slot_values

                                slot_values.append(slot_value.strip())

                            # Reference slot/entity (values will be added
                            # later)
                            utterance.append(f"[{slot_name}:{entity_name}]")

                            # Reset current slot/entity
                            entity_name = None
                            slot_name = None
                            slot_value = None
                        elif olabel.startswith("__source__"):
                            # Use Rhasspy slot name as entity
                            entity_name = olabel[10:]

                    if ilabel:
                        # Add to current slot/entity value
                        if slot_name and (not entity_name):
                            slot_value += ilabel + " "
                        else:
                            # Add directly to utterance
                            utterance.append(ilabel)
                    elif (
                        olabel
                        and (not olabel.startswith("__"))
                        and slot_name
                        and (not slot_value)
                        and (not entity_name)
                    ):
                        slot_value += olabel + " "

                if utterance:
                    utterance_str = " ".join(utterance)
                    if utterance_str not in used_utterances:
                        # Write utterance
                        print("  -", quote(utterance_str), file=dataset_file)
                        used_utterances.add(utterance_str)

            print("", file=dataset_file)

        # Write entities
        for slot_name, values in slots_dict.items():
            if slot_name.startswith("$"):
                # Remove arguments and $
                slot_name = slot_name.split(",")[0][1:]

            # Skip numbers
            if slot_name in {"rhasspy/number"}:
                # Should have already been converted to snips/number
                continue

            # Keep only unique values
            values_set = set(values)

            print("---", file=dataset_file)
            print("type: entity", file=dataset_file)
            print("name:", quote(slot_name), file=dataset_file)
            print("values:", file=dataset_file)

            slot_graph = rhasspynlu.sentences_to_graph(
                {
                    slot_name: [
                        rhasspynlu.jsgf.Sentence.parse(value)
                        for value in values_set
                    ]
                }
            )

            start_node, end_node = rhasspynlu.jsgf_graph.get_start_end_nodes(
                slot_graph
            )
            n_data = slot_graph.nodes(data=True)
            for path in nx.all_simple_paths(slot_graph, start_node, end_node):
                words = []
                for node in path:
                    node_data = n_data[node]
                    word = node_data.get("word")
                    if word:
                        words.append(word)

                if words:
                    print("  -", quote(" ".join(words)), file=dataset_file)

            print("", file=dataset_file)

        # ------------
        # Train engine
        # ------------

        if engine_path:
            # Delete existing engine
            engine_path = Path(engine_path)
            engine_path.parent.mkdir(exist_ok=True)

            if engine_path.is_dir():
                # Snips will fail if the directory exists
                _LOGGER.debug("Removing existing engine at %s", engine_path)
                shutil.rmtree(engine_path)
            elif engine_path.is_file():
                _LOGGER.debug("Removing unexpected file at %s", engine_path)
                engine_path.unlink()

        _LOGGER.debug("Training engine")
        dataset_file.seek(0)
        dataset = Dataset.from_yaml_files(language, [dataset_file])
        engine = engine.fit(dataset)

    if engine_path:
        # Save engine
        engine.persist(engine_path)
        _LOGGER.debug("Engine saved to %s", engine_path)

    return engine
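# Example invocation of train() (a sketch; the ini-format sentences and the
# engine path below are made up, and "en" is assumed to be a key of
# DEFAULT_CONFIGS):
#
#     sentences = {
#         "sentences.ini": "[GetTime]\nwhat time is it\ntell me the time"
#     }
#     engine = train(sentences, "en", engine_path="snips-engine")
#     print(engine.parse("what time is it"))
#
# Each value of sentences_dict is the text of one Rhasspy sentences.ini
# file; they are concatenated before parsing, so intent names must be
# unique across files.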
def test_preprocess(self):
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)

    u_0 = text_to_utterance("hÉllo wOrld Éntity_2")
    u_1 = text_to_utterance("beauTiful World entity 1")
    u_2 = text_to_utterance("Bird bïrdy")
    u_3 = text_to_utterance("Bird birdy")
    utterances = [u_0, u_1, u_2, u_3]

    vectorizer = CooccurrenceVectorizer(
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    vectorizer._language = language

    # When
    processed_data = vectorizer._preprocess(utterances)
    processed_data = list(zip(*processed_data))

    # Then
    ent_0 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "resolved_value": "Éntity 2",
        "range": {"start": 12, "end": 20}
    }
    num_0 = {
        "entity_kind": "snips/number",
        "value": "2",
        "resolved_value": {"value": 2.0, "kind": "Number"},
        "range": {"start": 19, "end": 20}
    }
    ent_11 = {
        "entity_kind": "entity_1",
        "value": "entity 1",
        "resolved_value": "entity 1",
        "range": {"start": 16, "end": 24}
    }
    ent_12 = {
        "entity_kind": "entity_2",
        "value": "entity 1",
        "resolved_value": "entity 1",
        "range": {"start": 16, "end": 24}
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "1",
        "resolved_value": {"value": 1.0, "kind": "Number"},
        "range": {"start": 23, "end": 24}
    }

    expected_data = [
        (u_0, [num_0], [ent_0]),
        (u_1, [num_1], [ent_11, ent_12]),
        (u_2, [], []),
        (u_3, [], [])
    ]
    self.assertSequenceEqual(expected_data, processed_data)
def test_entity_match_factory(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
  - this is [entity1](my first entity)
  - this is [entity2](second_entity)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    config = {
        "factory_name": "entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
            "use_stemming": True
        },
        "offsets": [0]
    }

    tokens = tokenize("my first entity and second_entity", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    resources = {STEMS: dict()}
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    factory = CRFFeatureFactory.from_config(
        config, custom_entity_parser=custom_entity_parser,
        resources=resources)
    factory.fit(dataset, "my_intent")

    # When
    features = factory.build_features()
    features = sorted(features, key=lambda f: f.base_name)
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)
    res3 = features[0].compute(3, cache)
    res4 = features[0].compute(4, cache)
    res5 = features[1].compute(0, cache)
    res6 = features[1].compute(1, cache)
    res7 = features[1].compute(2, cache)
    res8 = features[1].compute(3, cache)
    res9 = features[1].compute(4, cache)

    # Then
    self.assertIsInstance(factory, CustomEntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name, "entity_match_entity1")
    self.assertEqual(features[1].base_name, "entity_match_entity2")
    self.assertEqual(res0, BEGINNING_PREFIX)
    self.assertEqual(res1, INSIDE_PREFIX)
    self.assertEqual(res2, LAST_PREFIX)
    self.assertEqual(res3, None)
    self.assertEqual(res4, None)
    self.assertEqual(res5, None)
    self.assertEqual(res6, None)
    self.assertEqual(res7, None)
    self.assertEqual(res8, None)
    self.assertEqual(res9, UNIT_PREFIX)
def build_snips_data_task1(self):
    """
    Build Snips data from all brat annotation objects.

    :return: Snips dataset of all brat annotation objects
    :rtype: snips_nlu.dataset.Dataset
    """
    import io
    import re

    import yaml
    from sklearn.model_selection import train_test_split
    from snips_nlu.dataset import Dataset

    print("--> Creating snips nlu data training...")
    stream_results = []
    pandas_train = pandas.read_csv(train_task_1.absolute())
    stream_counter, stream_no_counter, utterances = {}, {}, []
    stream_group_ant_conq = {}
    counter_list, no_counter_list, entities = [], [], []

    for i, row in pandas_train.iterrows():
        sent = row['sentence']
        gold = row['gold_label']
        utterances.append(((sent, gold), 1))

    filename_train = source / "snips_semeval_2020_train_task1_cross_{}.yaml".format(
        self.vers)
    filename_test = source / "snips_semeval_2020_test_task1_cross_{}.yaml".format(
        self.vers)

    if self.cross:
        utter_train = [x[0] for x in utterances]
        utter_test = [x[1] for x in utterances]
        train, test, label_train, label_test = train_test_split(
            utter_train, utter_test, test_size=0.2, random_state=42)

        if not Path(filename_train).exists():
            stream_results = self.build_intent_train_task1(train,
                                                           split="train")
            print("--> Writing snips nlu TRAINING data to file...")
            with codecs.open(filename_train, "w", encoding="utf8") as pt:
                yaml.dump_all(stream_results, pt)

        if not Path(filename_test).exists():
            stream_results = self.build_intent_train_task1(test,
                                                           split="test")
            print("--> Writing snips nlu TESTING data to file...")
            with codecs.open(filename_test, "w", encoding="utf8") as pt:
                yaml.dump_all(stream_results, pt)

        json_dataset_train, json_dataset_test = [], []
        with codecs.open(filename_train, "r", encoding="utf8") as pt:
            data_counterfact = io.StringIO(pt.read().strip().replace(
                '', ''))
            json_dataset_train = Dataset.from_yaml_files(
                self.lang, [data_counterfact]).json
        with codecs.open(filename_test, "r", encoding="utf8") as pt:
            data_counterfact = io.StringIO(pt.read().strip().replace(
                '', ''))
            json_dataset_test = Dataset.from_yaml_files(
                self.lang, [data_counterfact]).json

        DATASET_JSON = (json_dataset_train, json_dataset_test)
        return DATASET_JSON
    else:
        utter_train = [x[0] for x in utterances]
        self.vers = "all_" + self.vers
        filename_train = source / "snips_semeval_2020_train_task1_main_{}.yaml".format(
            self.vers)

        if not Path(filename_train).exists():
            stream_results = self.build_intent_train_task1(utter_train)
            print("--> Writing snips nlu TRAINING data to file...")
            with codecs.open(filename_train, "w", encoding="utf8") as pt:
                yaml.dump_all(stream_results, pt)

        json_dataset_train = []
        with codecs.open(filename_train, "r", encoding="utf8") as pt:
            data_counterfact = io.StringIO(pt.read().strip().replace(
                '', ''))
            json_dataset_train = Dataset.from_yaml_files(
                self.lang, [data_counterfact]).json

        return json_dataset_train
from django.shortcuts import render
from .models import Entity, EntityRecord, EntitySlot, Intent, Utterance, Synonym
from .write_yaml import write_yaml, make_nlu_model_json, make_nlu_model_yaml, write_json
from rest_framework.views import APIView
from rest_framework.response import Response
import json
from snips_nlu import SnipsNLUEngine
from snips_nlu.dataset import Dataset
from snips_nlu.default_configs import CONFIG_EN
from pprint import pprint

# Create your views here.
dataset = Dataset.from_yaml_files("en", ["data.yaml"])
j = dataset.json
m = json.dumps(j)
sample_dataset = json.loads(m)
nlu_engine = SnipsNLUEngine(config=CONFIG_EN)
nlu_engine = nlu_engine.fit(sample_dataset, force_retrain=False)


class MakeYAMLFromDB(APIView):
    def get(self, request):
        global nlu_engine
        d = request.data
        el = []
        entities = list(Entity.objects.all().values(
            'name', 'automatically_extensible', 'use_synonyms',
            'matching_strictness'))
def test_preprocess_for_training(self):
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)

    utterances = [
        {
            "data": [
                {"text": "hÉllo wOrld "},
                {"text": " yo "},
                {"text": " yo "},
                {"text": "yo "},
                {"text": "Éntity_2", "entity": "entity_2"},
                {"text": " "},
                {"text": "Éntity_2", "entity": "entity_2"}
            ]
        },
        {
            "data": [
                {"text": "beauTiful World "},
                {"text": "entity 1", "entity": "entity_1"},
                {"text": " "},
                {"text": "2", "entity": "snips/number"}
            ]
        },
        {"data": [{"text": "Bird bïrdy"}]},
        {"data": [{"text": "Bird birdy"}]}
    ]

    config = TfidfVectorizerConfig(
        use_stemming=True, word_clusters_name="my_word_clusters")
    vectorizer = TfidfVectorizer(
        config=config,
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    vectorizer._language = language

    # When
    processed_data = vectorizer._preprocess(utterances, training=True)
    processed_data = list(zip(*processed_data))

    # Then
    u_0 = {
        "data": [
            {"text": "hello world"},
            {"text": "yo"},
            {"text": "yo"},
            {"text": "yo"},
            {"text": "entity_2", "entity": "entity_2"},
            {"text": ""},
            {"text": "entity_2", "entity": "entity_2"}
        ]
    }
    u_1 = {
        "data": [
            {"text": "beauty world"},
            {"text": "ent 1", "entity": "entity_1"},
            {"text": ""},
            {"text": "2", "entity": "snips/number"}
        ]
    }
    u_2 = {"data": [{"text": "bird bird"}]}

    ent_00 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "range": {"start": 23, "end": 31}
    }
    ent_01 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "range": {"start": 32, "end": 40}
    }
    ent_1 = {
        "entity_kind": "entity_1",
        "value": "entity 1",
        "range": {"start": 16, "end": 24}
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "2",
        "range": {"start": 25, "end": 26}
    }

    expected_data = [
        (u_0, [], [ent_00, ent_01], []),
        (u_1, [num_1], [ent_1], ["cluster_1", "cluster_3"]),
        (u_2, [], [], []),
        (u_2, [], [], ["cluster_2"])
    ]
    self.assertSequenceEqual(expected_data, processed_data)
def test_get_entity_scopes(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - meeting [schedule_time:snips/datetime](today)

---
type: intent
name: intent2
utterances:
  - hello world

---
type: intent
name: intent3
utterances:
  - what will be the weather [weather_time:snips/datetime](tomorrow)

---
type: intent
name: intent4
utterances:
  - find a flight for [city](Paris) [flight_time:snips/datetime](tomorrow)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    # When
    entity_scopes = _get_entity_scopes(dataset)

    # Then
    expected_scopes = [
        {
            "entity_scope": {
                "builtin": ["snips/datetime"],
                "custom": []
            },
            "intent_group": ["intent1", "intent3"]
        },
        {
            "entity_scope": {
                "builtin": [],
                "custom": []
            },
            "intent_group": ["intent2"]
        },
        {
            "entity_scope": {
                "builtin": ["snips/datetime"],
                "custom": ["city"]
            },
            "intent_group": ["intent4"]
        }
    ]

    def sort_key(group_scope):
        return " ".join(group_scope["intent_group"])

    self.assertListEqual(sorted(expected_scopes, key=sort_key),
                         sorted(entity_scopes, key=sort_key))