def train_test_metrics(train_dataset_path, test_dataset_path, output_path,
                       config_path=None, exclude_slot_metrics=False,
                       include_errors=False, verbose=False):
    if verbose:
        set_nlu_logger(logging.DEBUG)

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(
        train_dataset=train_dataset_path,
        test_dataset=test_dataset_path,
        engine_class=engine_cls,
        include_slot_metrics=not exclude_slot_metrics
    )

    with Path(train_dataset_path).open("r", encoding="utf8") as f:
        load_resources(json.load(f)["language"])

    # Deferred import so that snips_nlu_metrics is only required when
    # metrics are actually computed
    from snips_nlu_metrics import compute_train_test_metrics

    metrics = compute_train_test_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
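# Illustrative sketch (not part of the source): one way to invoke the helper
# above from a script. The dataset and output paths are hypothetical
# placeholders; leaving config_path as None falls back to the default
# SnipsNLUEngine, as in the else branch of train_test_metrics.
if __name__ == "__main__":
    train_test_metrics(
        train_dataset_path="train_dataset.json",
        test_dataset_path="test_dataset.json",
        output_path="train_test_metrics.json",
        include_errors=True,
        verbose=True,
    )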
def persist(self, path): """Persist the object at the given path""" path = Path(path) path.mkdir() slot_fillers = [] for intent, slot_filler in iteritems(self.slot_fillers): slot_filler_name = "slot_filler_%s" % intent slot_filler.persist(path / slot_filler_name) slot_fillers.append({ "intent": intent, "slot_filler_name": slot_filler_name }) # Only needed to improve testability slot_fillers = sorted(slot_fillers, key=lambda sf: sf["intent"]) if self.intent_classifier is not None: self.intent_classifier.persist(path / "intent_classifier") model = { "unit_name": self.unit_name, "config": self.config.to_dict(), "slot_fillers": slot_fillers } model_json = json_string(model) model_path = path / "intent_parser.json" with model_path.open(mode="w") as f: f.write(model_json) self.persist_metadata(path)
def cross_val_metrics(dataset_path, output_path, nb_folds=5,
                      train_size_ratio=1.0, exclude_slot_metrics=False,
                      include_errors=False):
    def progression_handler(progress):
        print("%d%%" % int(progress * 100))

    metrics_args = dict(
        dataset=dataset_path,
        engine_class=SnipsNLUEngine,
        progression_handler=progression_handler,
        nb_folds=nb_folds,
        train_size_ratio=train_size_ratio,
        include_slot_metrics=not exclude_slot_metrics,
    )

    with Path(dataset_path).open("r", encoding="utf8") as f:
        load_resources(json.load(f)["language"])

    from snips_nlu_metrics import compute_cross_val_metrics

    metrics = compute_cross_val_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
def persist(self, path): """Persist the object at the given path""" path = Path(path) path.mkdir() sorted_slot_fillers = sorted(iteritems(self.slot_fillers)) slot_fillers = [] for i, (intent, slot_filler) in enumerate(sorted_slot_fillers): slot_filler_name = "slot_filler_%s" % i slot_filler.persist(path / slot_filler_name) slot_fillers.append({ "intent": intent, "slot_filler_name": slot_filler_name }) if self.intent_classifier is not None: self.intent_classifier.persist(path / "intent_classifier") model = { "config": self.config.to_dict(), "slot_fillers": slot_fillers } model_json = json_string(model) model_path = path / "intent_parser.json" with model_path.open(mode="w") as f: f.write(model_json) self.persist_metadata(path)
def persist(self, path): """Persist the object at the given path""" path = Path(path) path.mkdir() classifier_json = json_string(self.to_dict()) with (path / "intent_classifier.json").open(mode="w") as f: f.write(classifier_json) self.persist_metadata(path)
def persist(self, path):
    path = Path(path)
    path.mkdir()
    parser_directory = "parser"
    metadata = {
        "language": self.language,
        "parser_usage": self.parser_usage.value,
        "parser_directory": parser_directory
    }
    with (path / "metadata.json").open(mode="w", encoding="utf8") as f:
        f.write(json_string(metadata))
    self._parser.persist(path / parser_directory)
def _build_builtin_parser(language, gazetteer_entities):
    with temp_dir() as serialization_dir:
        gazetteer_entity_parser = None
        if gazetteer_entities:
            gazetteer_entity_parser = _build_gazetteer_parser(
                serialization_dir, gazetteer_entities, language)

        metadata = {
            "language": language.upper(),
            "gazetteer_parser": gazetteer_entity_parser
        }
        metadata_path = serialization_dir / "metadata.json"
        with metadata_path.open("w", encoding="utf-8") as f:
            f.write(json_string(metadata))
        parser = _BuiltinEntityParser.from_path(serialization_dir)
        return BuiltinEntityParser(parser)
def persist(self, path): """Persist the NLU engine at the given directory path Args: path (str): the location at which the nlu engine must be persisted. This path must not exist when calling this function. """ directory_path = Path(path) directory_path.mkdir() parsers_count = defaultdict(int) intent_parsers = [] for parser in self.intent_parsers: parser_name = parser.unit_name parsers_count[parser_name] += 1 count = parsers_count[parser_name] if count > 1: parser_name = "{n}_{c}".format(n=parser_name, c=count) parser_path = directory_path / parser_name parser.persist(parser_path) intent_parsers.append(parser_name) config = None if self.config is not None: config = self.config.to_dict() model = { "unit_name": self.unit_name, "dataset_metadata": self._dataset_metadata, "intent_parsers": intent_parsers, "config": config, "model_version": __model_version__, "training_package_version": __version__ } model_json = json_string(model) model_path = directory_path / "nlu_engine.json" with model_path.open(mode="w") as f: f.write(model_json) if self.fitted: required_resources = self.config.get_required_resources() if required_resources: language = self._dataset_metadata["language_code"] resources_path = directory_path / "resources" resources_path.mkdir() persist_resources(resources_path / language, required_resources, language)
def cross_val_metrics(dataset_path, output_path, config_path=None, nb_folds=5,
                      train_size_ratio=1.0, exclude_slot_metrics=False,
                      include_errors=False, verbose=False):
    if verbose:
        set_nlu_logger(logging.DEBUG)

    def progression_handler(progress):
        print("%d%%" % int(progress * 100))

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(
        dataset=dataset_path,
        engine_class=engine_cls,
        progression_handler=progression_handler,
        nb_folds=nb_folds,
        train_size_ratio=train_size_ratio,
        include_slot_metrics=not exclude_slot_metrics,
    )

    with Path(dataset_path).open("r", encoding="utf8") as f:
        load_resources(json.load(f)["language"])

    # Deferred import so that snips_nlu_metrics is only required when
    # metrics are actually computed
    from snips_nlu_metrics import compute_cross_val_metrics

    metrics = compute_cross_val_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
def train_test_metrics(train_dataset_path, test_dataset_path, output_path,
                       exclude_slot_metrics=False, include_errors=False):
    metrics_args = dict(
        train_dataset=train_dataset_path,
        test_dataset=test_dataset_path,
        engine_class=SnipsNLUEngine,
        include_slot_metrics=not exclude_slot_metrics
    )

    with Path(train_dataset_path).open("r", encoding="utf8") as f:
        load_resources(json.load(f)["language"])

    from snips_nlu_metrics import compute_train_test_metrics

    metrics = compute_train_test_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
def persist(self, path): """Persist the object at the given path""" path = Path(path) path.mkdir() crf_model_file = None if self.crf_model is not None: destination = path / Path(self.crf_model.modelfile.name).name shutil.copy(self.crf_model.modelfile.name, str(destination)) crf_model_file = str(destination.name) model = { "language_code": self.language, "intent": self.intent, "crf_model_file": crf_model_file, "slot_name_mapping": self.slot_name_mapping, "config": self.config.to_dict(), } model_json = json_string(model) model_path = path / "slot_filler.json" with model_path.open(mode="w") as f: f.write(model_json) self.persist_metadata(path)
def _build_gazetteer_parser(target_dir, gazetteer_entities, language):
    gazetteer_parser_name = "gazetteer_entity_parser"
    gazetteer_parser_path = target_dir / gazetteer_parser_name
    gazetteer_parser_metadata = []
    for ent in sorted(gazetteer_entities):
        # Fetch the compiled parser in the resources
        source_parser_path = find_gazetteer_entity_data_path(language, ent)
        short_name = get_builtin_entity_shortname(ent).lower()
        target_parser_path = gazetteer_parser_path / short_name
        parser_metadata = {
            "entity_identifier": ent,
            "entity_parser": short_name
        }
        gazetteer_parser_metadata.append(parser_metadata)
        # Copy the single entity parser
        shutil.copytree(str(source_parser_path), str(target_parser_path))

    # Dump the parser metadata
    gazetteer_entity_parser_metadata = {
        "parsers_metadata": gazetteer_parser_metadata
    }
    gazetteer_parser_metadata_path = gazetteer_parser_path / "metadata.json"
    with gazetteer_parser_metadata_path.open("w", encoding="utf-8") as f:
        f.write(json_string(gazetteer_entity_parser_metadata))
    return gazetteer_parser_name
def test_should_be_serializable(self):
    # Given
    language = LANGUAGE_EN
    tfidf_vectorizer = _get_tfidf_vectorizer(language)

    pvalue_threshold = 0.42
    featurizer = Featurizer(
        language,
        config=FeaturizerConfig(pvalue_threshold=pvalue_threshold,
                                word_clusters_name="brown_clusters"),
        unknown_words_replacement_string=None,
        tfidf_vectorizer=tfidf_vectorizer)
    dataset = {
        "entities": {
            "entity2": {
                "data": [
                    {
                        "value": "entity1",
                        "synonyms": ["entity1"]
                    }
                ],
                "use_synonyms": True,
                "automatically_extensible": True
            }
        },
        "intents": {},
        "language": "en"
    }
    dataset = validate_and_format_dataset(dataset)

    utterances = [
        "hello world",
        "beautiful world",
        "hello here",
        "bird birdy",
        "beautiful bird"
    ]
    utterances = [text_to_utterance(u) for u in utterances]

    classes = np.array([0, 0, 0, 1, 1])

    featurizer.fit(dataset, utterances, classes)

    # When
    serialized_featurizer = featurizer.to_dict()

    # Then
    msg = "Featurizer dict should be json serializable to utf8."
    with self.fail_if_exception(msg):
        dumped = json_string(serialized_featurizer)

    msg = "Featurizer should be deserializable from dict with unicode values"
    with self.fail_if_exception(msg):
        _ = Featurizer.from_dict(json.loads(dumped))

    vocabulary = tfidf_vectorizer.vocabulary_
    # pylint: disable=W0212
    idf_diag = tfidf_vectorizer._tfidf._idf_diag.data.tolist()
    # pylint: enable=W0212

    best_features = featurizer.best_features
    entity_utterances_to_feature_names = {
        "entity1": ["entityfeatureentity2"]
    }

    expected_serialized = {
        "config": {
            "sublinear_tf": False,
            "pvalue_threshold": pvalue_threshold,
            "word_clusters_name": "brown_clusters"
        },
        "language_code": "en",
        "tfidf_vectorizer": {
            "idf_diag": idf_diag,
            "vocab": vocabulary
        },
        "best_features": best_features,
        "entity_utterances_to_feature_names":
            entity_utterances_to_feature_names,
        "unknown_words_replacement_string": None
    }
    self.assertDictEqual(expected_serialized, serialized_featurizer)
def persist_metadata(self, path, **kwargs):
    metadata = {"unit_name": self.unit_name}
    metadata.update(kwargs)
    metadata_json = json_string(metadata)
    with (path / "metadata.json").open(mode="w") as f:
        f.write(metadata_json)
def persist(self, path): path = Path(path) path.mkdir() with (path / "metadata.json").open(mode="w") as f: f.write(json_string({"unit_name": self.unit_name}))
def writeJsonContent(path, json_dict):
    json_content = json_string(json_dict)
    with path.open(mode="w") as f:
        f.write(json_content)
def persist_resources(resources_dest_path, required_resources, language):
    if not required_resources:
        return

    resources_dest_path.mkdir()
    resources_src_path = Path(get_resources_dir(language))
    with (resources_src_path / "metadata.json").open(encoding="utf8") as f:
        metadata = json.load(f)

    # Update metadata and keep only required resources
    if not required_resources.get(NOISE, False):
        metadata[NOISE] = None
    if not required_resources.get(STOP_WORDS, False):
        metadata[STOP_WORDS] = None
    if not required_resources.get(STEMS, False):
        metadata[STEMS] = None
    metadata[GAZETTEERS] = sorted(required_resources.get(GAZETTEERS, []))
    metadata[WORD_CLUSTERS] = sorted(
        required_resources.get(WORD_CLUSTERS, []))
    metadata_dest_path = resources_dest_path / "metadata.json"
    metadata_json = json_string(metadata)
    with metadata_dest_path.open(encoding="utf8", mode="w") as f:
        f.write(metadata_json)

    if metadata[NOISE] is not None:
        noise_src = (resources_src_path / metadata[NOISE]).with_suffix(".txt")
        noise_dest = resources_dest_path / noise_src.name
        shutil.copy(str(noise_src), str(noise_dest))

    if metadata[STOP_WORDS] is not None:
        stop_words_src = (resources_src_path / metadata[STOP_WORDS]) \
            .with_suffix(".txt")
        stop_words_dest = resources_dest_path / stop_words_src.name
        shutil.copy(str(stop_words_src), str(stop_words_dest))

    if metadata[STEMS] is not None:
        stems_src = (resources_src_path / "stemming" / metadata[STEMS]) \
            .with_suffix(".txt")
        stemming_dir = resources_dest_path / "stemming"
        stemming_dir.mkdir()
        stems_dest = stemming_dir / stems_src.name
        shutil.copy(str(stems_src), str(stems_dest))

    if metadata[GAZETTEERS]:
        gazetteer_src_dir = resources_src_path / "gazetteers"
        gazetteer_dest_dir = resources_dest_path / "gazetteers"
        gazetteer_dest_dir.mkdir()
        for gazetteer in metadata[GAZETTEERS]:
            gazetteer_src = (gazetteer_src_dir / gazetteer) \
                .with_suffix(".txt")
            gazetteer_dest = gazetteer_dest_dir / gazetteer_src.name
            shutil.copy(str(gazetteer_src), str(gazetteer_dest))

    if metadata[WORD_CLUSTERS]:
        clusters_src_dir = resources_src_path / "word_clusters"
        clusters_dest_dir = resources_dest_path / "word_clusters"
        clusters_dest_dir.mkdir()
        for word_clusters in metadata[WORD_CLUSTERS]:
            clusters_src = (clusters_src_dir / word_clusters) \
                .with_suffix(".txt")
            clusters_dest = clusters_dest_dir / clusters_src.name
            shutil.copy(str(clusters_src), str(clusters_dest))