Example #1
def train_test_metrics(train_dataset_path,
                       test_dataset_path,
                       output_path,
                       config_path=None,
                       exclude_slot_metrics=False,
                       include_errors=False,
                       verbose=False):
    if verbose:
        set_nlu_logger(logging.DEBUG)

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(train_dataset=train_dataset_path,
                        test_dataset=test_dataset_path,
                        engine_class=engine_cls,
                        include_slot_metrics=not exclude_slot_metrics)

    from snips_nlu_metrics import compute_train_test_metrics

    metrics = compute_train_test_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
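A minimal usage sketch for the helper above, assuming its module-level dependencies (set_nlu_logger, make_engine_cls, json_string) are importable; the dataset, output, and config paths are hypothetical placeholders.

# Hypothetical invocation: compute train/test metrics with the default
# SnipsNLUEngine and write them to metrics.json.
train_test_metrics("train_dataset.json",
                   "test_dataset.json",
                   "metrics.json",
                   config_path=None,
                   exclude_slot_metrics=False,
                   include_errors=True,
                   verbose=True)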
Example #2
    def persist(self, path):
        """Persists the object at the given path"""
        path.mkdir()

        crf_model_file = None
        if self.crf_model is not None:
            crf_model_file = CRF_MODEL_FILENAME
            destination = path / crf_model_file
            shutil.copy(self.crf_model.modelfile.name, str(destination))
            # On Windows, crfsuite model files already have suitable
            # permissions; on POSIX, make the copied model world-readable.
            if os.name == "posix":
                umask = os.umask(0o022)  # os.umask sets a new mask and returns the previous one
                os.umask(umask)  # restore the original umask
                os.chmod(str(destination), 0o644 & ~umask)

        model = {
            "language_code": self.language,
            "intent": self.intent,
            "crf_model_file": crf_model_file,
            "slot_name_mapping": self.slot_name_mapping,
            "config": self.config.to_dict(),
        }
        model_json = json_string(model)
        model_path = path / "slot_filler.json"
        with model_path.open(mode="w", encoding="utf8") as f:
            f.write(model_json)
        self.persist_metadata(path)
Example #3
    def setUp(self):
        super(TestCLI, self).setUp()
        if not self.fixture_dir.exists():
            self.fixture_dir.mkdir()

        dataset_stream = io.StringIO(u"""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups
  - i want [number_of_cups] cups of [beverage_temperature](boiling hot) tea pls
  - can you prepare [number_of_cups] cup of [beverage_temperature](cold) tea ?

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee
  - can you prepare [number_of_cups] cup of coffee""")
        beverage_dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        self.beverage_dataset_path = self.fixture_dir / "beverage_dataset.json"
        if self.beverage_dataset_path.exists():
            self.beverage_dataset_path.unlink()
        with self.beverage_dataset_path.open(mode="w") as f:
            f.write(json_string(beverage_dataset))

        self.tmp_file_path = self.fixture_dir / next(
            tempfile._get_candidate_names())
        while self.tmp_file_path.exists():
            self.tmp_file_path = self.fixture_dir / next(
                tempfile._get_candidate_names())
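Note that tempfile._get_candidate_names() is a private CPython helper and may change between Python versions. Below is a sketch of a public-API alternative for picking a unique fixture file name; the uuid-based naming is an assumption for illustration, not what the test suite does.

# Sketch: build a unique temporary file path inside fixture_dir using the
# public uuid module instead of the private tempfile._get_candidate_names().
import uuid

def unique_fixture_path(fixture_dir):
    candidate = fixture_dir / ("tmp_" + uuid.uuid4().hex)
    while candidate.exists():
        candidate = fixture_dir / ("tmp_" + uuid.uuid4().hex)
    return candidate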
Example #4
def _build_gazetteer_parser(target_dir, gazetteer_entities, language):
    from snips_nlu_parsers import get_builtin_entity_shortname

    gazetteer_parser_name = "gazetteer_entity_parser"
    gazetteer_parser_path = target_dir / gazetteer_parser_name
    gazetteer_parser_metadata = []
    for ent in sorted(gazetteer_entities):
        # Fetch the compiled parser from the resources
        source_parser_path = find_gazetteer_entity_data_path(language, ent)
        short_name = get_builtin_entity_shortname(ent).lower()
        target_parser_path = gazetteer_parser_path / short_name
        parser_metadata = {
            "entity_identifier": ent,
            "entity_parser": short_name
        }
        gazetteer_parser_metadata.append(parser_metadata)
        # Copy the single entity parser
        shutil.copytree(str(source_parser_path), str(target_parser_path))
    # Dump the parser metadata
    gazetteer_entity_parser_metadata = {
        "parsers_metadata": gazetteer_parser_metadata
    }
    gazetteer_parser_metadata_path = gazetteer_parser_path / "metadata.json"
    with gazetteer_parser_metadata_path.open("w", encoding="utf-8") as f:
        f.write(json_string(gazetteer_entity_parser_metadata))
    return gazetteer_parser_name
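A small sketch of how the metadata written above could be read back; the loader function is hypothetical and only relies on the metadata.json layout produced by _build_gazetteer_parser.

# Hypothetical reader for the metadata.json written above.
import json
from pathlib import Path

def load_gazetteer_parser_metadata(target_dir):
    metadata_path = Path(target_dir) / "gazetteer_entity_parser" / "metadata.json"
    with metadata_path.open("r", encoding="utf-8") as f:
        return json.load(f)["parsers_metadata"]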
Example #5
    def test_should_be_deserializable(self, mocked_cooccurrence_load,
                                      mocked_tfidf_load):
        # Given
        mocked_tfidf_load.return_value = "tfidf_vectorizer"
        mocked_cooccurrence_load.return_value = "cooccurrence_vectorizer"

        language = LANGUAGE_EN
        config = FeaturizerConfig()

        featurizer_dict = {
            "language_code": language,
            "tfidf_vectorizer": "tfidf_vectorizer",
            "cooccurrence_vectorizer": "cooccurrence_vectorizer",
            "config": config.to_dict()
        }

        self.tmp_file_path.mkdir()
        featurizer_path = self.tmp_file_path / "featurizer.json"
        with featurizer_path.open("w", encoding="utf-8") as f:
            f.write(json_string(featurizer_dict))

        # When
        featurizer = Featurizer.from_path(self.tmp_file_path)

        # Then
        self.assertEqual(language, featurizer.language)
        self.assertEqual("tfidf_vectorizer", featurizer.tfidf_vectorizer)
        self.assertEqual("cooccurrence_vectorizer",
                         featurizer.cooccurrence_vectorizer)
        self.assertDictEqual(config.to_dict(), featurizer.config.to_dict())
Example #6
    def persist(self, path):
        path.mkdir()

        # Persist the vectorizers
        tfidf_vectorizer = None
        if self.tfidf_vectorizer:
            tfidf_vectorizer = self.tfidf_vectorizer.unit_name
            tfidf_vectorizer_path = path / tfidf_vectorizer
            self.tfidf_vectorizer.persist(tfidf_vectorizer_path)

        cooccurrence_vectorizer = None
        if self.cooccurrence_vectorizer:
            cooccurrence_vectorizer = self.cooccurrence_vectorizer.unit_name
            cooccurrence_vectorizer_path = path / cooccurrence_vectorizer
            self.cooccurrence_vectorizer.persist(cooccurrence_vectorizer_path)

        # Persist main object
        self_as_dict = {
            "language_code": self.language,
            "tfidf_vectorizer": tfidf_vectorizer,
            "cooccurrence_vectorizer": cooccurrence_vectorizer,
            "config": self.config.to_dict()
        }

        featurizer_path = path / "featurizer.json"
        with featurizer_path.open("w", encoding="utf-8") as f:
            f.write(json_string(self_as_dict))

        # Persist metadata
        self.persist_metadata(path)
Example #7
    def persist(self, path):
        """Persists the object at the given path"""
        path.mkdir()

        featurizer = None
        if self.featurizer is not None:
            featurizer = "featurizer"
            featurizer_path = path / featurizer
            self.featurizer.persist(featurizer_path)

        coeffs = None
        intercept = None
        t_ = None
        if self.classifier is not None:
            coeffs = self.classifier.coef_.tolist()
            intercept = self.classifier.intercept_.tolist()
            t_ = self.classifier.t_

        self_as_dict = {
            "config": self.config.to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": t_,
            "intent_list": self.intent_list,
            "featurizer": featurizer
        }

        classifier_json = json_string(self_as_dict)
        with (path / "intent_classifier.json").open(mode="w") as f:
            f.write(classifier_json)
        self.persist_metadata(path)
Example #8
    def persist(self, path):
        path.mkdir()

        vectorizer_ = None
        if self._tfidf_vectorizer is not None:
            vocab = {k: int(v) for k, v in iteritems(self.vocabulary)}
            idf_diag = self.idf_diag.tolist()
            vectorizer_ = {
                "vocab": vocab,
                "idf_diag": idf_diag
            }

        builtin_entity_scope = None
        if self.builtin_entity_scope is not None:
            builtin_entity_scope = list(self.builtin_entity_scope)

        self_as_dict = {
            "vectorizer": vectorizer_,
            "language_code": self.language,
            "builtin_entity_scope": builtin_entity_scope,
            "config": self.config.to_dict(),
        }

        vectorizer_path = path / "vectorizer.json"
        with vectorizer_path.open("w", encoding="utf-8") as f:
            f.write(json_string(self_as_dict))
        self.persist_metadata(path)
Example #9
def generate_dataset(language, *files):
    """Create a Snips NLU dataset from text friendly files"""
    language = unicode_string(language)
    if any(f.endswith(".yml") or f.endswith(".yaml") for f in files):
        dataset = Dataset.from_yaml_files(language, list(files))
    else:
        dataset = Dataset.from_files(language, list(files))
    print(json_string(dataset.json, indent=2, sort_keys=True))
Example #10
    def persist(self, path):
        model = {
            "language": self.language,
            "slots_keywords": self.slots_keywords,
            "config": self.config.to_dict()
        }
        with path.open(mode="w") as f:
            f.write(json_string(model))
Example #11
def print_parsing_result(engine, query, intents_filter):
    from snips_nlu.common.utils import unicode_string, json_string

    query = unicode_string(query)
    json_dump = json_string(engine.parse(query, intents_filter),
                            sort_keys=True,
                            indent=2)
    print(json_dump)
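A usage sketch for print_parsing_result; the trained engine path and the query are placeholders for illustration.

# Hypothetical usage with a previously trained and persisted engine.
from snips_nlu import SnipsNLUEngine

engine = SnipsNLUEngine.from_path("path/to/trained_engine")
print_parsing_result(engine, "make me two cups of hot tea", intents_filter=None)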
Example #12
    def persist(self, path):
        """Persists the object at the given path"""
        path.mkdir()
        parser_json = json_string(self.to_dict())
        parser_path = path / "intent_parser.json"

        with parser_path.open(mode="w") as f:
            f.write(parser_json)
        self.persist_metadata(path)
Example #13
    def persist(self, path):
        path = Path(path)
        path.mkdir()
        parser_directory = "parser"
        metadata = {
            "language": self.language,
            "parser_usage": self.parser_usage.value,
            "parser_directory": parser_directory
        }
        with (path / "metadata.json").open(mode="w", encoding="utf8") as f:
            f.write(json_string(metadata))
        self._parser.persist(path / parser_directory)
Example #14
def persist_resources(resources, resources_dest_path, required_resources):
    if not required_resources:
        return

    resources_dest_path.mkdir()
    metadata = deepcopy(resources[METADATA])

    # Update metadata and keep only required resources
    if not required_resources.get(NOISE, False):
        metadata[NOISE] = None
    if not required_resources.get(STOP_WORDS, False):
        metadata[STOP_WORDS] = None
    if not required_resources.get(STEMS, False):
        metadata[STEMS] = None

    metadata[GAZETTEERS] = sorted(required_resources.get(GAZETTEERS, []))
    metadata[WORD_CLUSTERS] = sorted(required_resources.get(WORD_CLUSTERS, []))
    metadata_dest_path = resources_dest_path / "metadata.json"
    metadata_json = json_string(metadata)
    with metadata_dest_path.open(encoding="utf8", mode="w") as f:
        f.write(metadata_json)

    if metadata[NOISE] is not None:
        noise_path = (resources_dest_path / metadata[NOISE]) \
            .with_suffix(".txt")
        _persist_noise(get_noise(resources), noise_path)

    if metadata[STOP_WORDS] is not None:
        stop_words_path = (resources_dest_path / metadata[STOP_WORDS]) \
            .with_suffix(".txt")
        _persist_stop_words(get_stop_words(resources), stop_words_path)

    if metadata[STEMS] is not None:
        stemming_dir = resources_dest_path / "stemming"
        stemming_dir.mkdir()
        stems_path = (stemming_dir / metadata[STEMS]).with_suffix(".txt")
        _persist_stems(get_stems(resources), stems_path)

    if metadata[GAZETTEERS]:
        gazetteers_dir = resources_dest_path / "gazetteers"
        gazetteers_dir.mkdir()
        for name in metadata[GAZETTEERS]:
            gazetteer_path = (gazetteers_dir / name).with_suffix(".txt")
            _persist_gazetteer(get_gazetteer(resources, name), gazetteer_path)

    if metadata[WORD_CLUSTERS]:
        clusters_dir = resources_dest_path / "word_clusters"
        clusters_dir.mkdir()
        for name in metadata[WORD_CLUSTERS]:
            clusters_path = (clusters_dir / name).with_suffix(".txt")
            _persist_word_clusters(get_word_cluster(resources, name),
                                   clusters_path)
Example #15
def generate_dataset(language, *yaml_files):
    """Creates a Snips NLU dataset from YAML definition files

    Check :meth:`.Intent.from_yaml` and :meth:`.Entity.from_yaml` for the
    format of the YAML files.

    Args:
        language (str): language of the dataset (iso code)
        *yaml_files: list of intent and entity definition files in YAML format.

    Returns:
        None. The JSON dataset is printed to stdout.
    """
    language = unicode_string(language)
    dataset = Dataset.from_yaml_files(language, list(yaml_files))
    print(json_string(dataset.json, indent=2, sort_keys=True))
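A minimal usage sketch; the YAML file name is a hypothetical placeholder whose content would follow the intent format shown in the setUp example above.

# Hypothetical call: prints the JSON dataset built from one YAML definition file.
generate_dataset("en", "beverage_intents.yaml")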
Example #16
def _build_builtin_parser(language, gazetteer_entities):
    with temp_dir() as serialization_dir:
        gazetteer_entity_parser = None
        if gazetteer_entities:
            gazetteer_entity_parser = _build_gazetteer_parser(
                serialization_dir, gazetteer_entities, language)

        metadata = {
            "language": language.upper(),
            "gazetteer_parser": gazetteer_entity_parser
        }
        metadata_path = serialization_dir / "metadata.json"
        with metadata_path.open("w", encoding="utf-8") as f:
            f.write(json_string(metadata))
        parser = _BuiltinEntityParser.from_path(serialization_dir)
        return BuiltinEntityParser(parser)
Example #17
def cross_val_metrics(dataset_path,
                      output_path,
                      config_path=None,
                      nb_folds=5,
                      train_size_ratio=1.0,
                      exclude_slot_metrics=False,
                      include_errors=False,
                      verbose=0):
    import json
    import logging
    from pathlib import Path
    from snips_nlu_metrics import compute_cross_val_metrics
    from snips_nlu import SnipsNLUEngine
    from snips_nlu.cli.utils import set_nlu_logger
    from snips_nlu.common.utils import json_string

    if verbose == 1:
        set_nlu_logger(logging.INFO)
    elif verbose >= 2:
        set_nlu_logger(logging.DEBUG)

    def progression_handler(progress):
        print("%d%%" % int(progress * 100))

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(dataset=dataset_path,
                        engine_class=engine_cls,
                        progression_handler=progression_handler,
                        nb_folds=nb_folds,
                        train_size_ratio=train_size_ratio,
                        include_slot_metrics=not exclude_slot_metrics,
                        slot_matching_lambda=_match_trimmed_values)

    metrics = compute_cross_val_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
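A minimal usage sketch for this helper, assuming the module-level helpers it references (make_engine_cls, _match_trimmed_values) are available; the dataset and output paths are hypothetical placeholders.

# Hypothetical invocation: 5-fold cross-validation metrics written to metrics.json.
cross_val_metrics("dataset.json",
                  "metrics.json",
                  config_path=None,
                  nb_folds=5,
                  train_size_ratio=1.0,
                  exclude_slot_metrics=False,
                  include_errors=False,
                  verbose=1)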
Example #18
    def persist(self, path):
        path.mkdir()

        builtin_entity_scope = None
        if self.builtin_entity_scope is not None:
            builtin_entity_scope = list(self.builtin_entity_scope)

        self_as_dict = {
            "language_code": self.language,
            "word_pairs": {i: list(p)
                           for p, i in iteritems(self.word_pairs)},
            "builtin_entity_scope": builtin_entity_scope,
            "config": self.config.to_dict()
        }
        vectorizer_json = json_string(self_as_dict)
        vectorizer_path = path / "vectorizer.json"
        with vectorizer_path.open(mode="w") as f:
            f.write(vectorizer_json)
        self.persist_metadata(path)
Example #19
    def persist(self, path):
        """Persists the object at the given path"""
        path.mkdir()

        crf_model_file = None
        if self.crf_model is not None:
            destination = path / Path(self.crf_model.modelfile.name).name
            shutil.copy(self.crf_model.modelfile.name, str(destination))
            crf_model_file = str(destination.name)

        model = {
            "language_code": self.language,
            "intent": self.intent,
            "crf_model_file": crf_model_file,
            "slot_name_mapping": self.slot_name_mapping,
            "config": self.config.to_dict(),
        }
        model_json = json_string(model)
        model_path = path / "slot_filler.json"
        with model_path.open(mode="w") as f:
            f.write(model_json)
        self.persist_metadata(path)
Example #20
    def persist(self, path):
        """Persists the object at the given path"""
        path.mkdir()
        sorted_slot_fillers = sorted(iteritems(self.slot_fillers))
        slot_fillers = []
        for i, (intent, slot_filler) in enumerate(sorted_slot_fillers):
            slot_filler_name = "slot_filler_%s" % i
            slot_filler.persist(path / slot_filler_name)
            slot_fillers.append({
                "intent": intent,
                "slot_filler_name": slot_filler_name
            })

        if self.intent_classifier is not None:
            self.intent_classifier.persist(path / "intent_classifier")

        model = {"config": self.config.to_dict(), "slot_fillers": slot_fillers}
        model_json = json_string(model)
        model_path = path / "intent_parser.json"
        with model_path.open(mode="w") as f:
            f.write(model_json)
        self.persist_metadata(path)
Example #21
def train_test_metrics(train_dataset_path,
                       test_dataset_path,
                       output_path,
                       config_path=None,
                       exclude_slot_metrics=False,
                       include_errors=False,
                       verbosity=0):
    import json
    import logging
    from pathlib import Path
    from snips_nlu_metrics import compute_train_test_metrics
    from snips_nlu import SnipsNLUEngine
    from snips_nlu.cli.utils import set_nlu_logger
    from snips_nlu.common.utils import json_string

    if verbosity == 1:
        set_nlu_logger(logging.INFO)
    elif verbosity >= 2:
        set_nlu_logger(logging.DEBUG)

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(train_dataset=train_dataset_path,
                        test_dataset=test_dataset_path,
                        engine_class=engine_cls,
                        include_slot_metrics=not exclude_slot_metrics,
                        slot_matching_lambda=_match_trimmed_values)

    metrics = compute_train_test_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
Example #22
def cross_val_metrics(dataset_path,
                      output_path,
                      config_path=None,
                      nb_folds=5,
                      train_size_ratio=1.0,
                      exclude_slot_metrics=False,
                      include_errors=False,
                      verbose=False):
    if verbose:
        set_nlu_logger(logging.DEBUG)

    def progression_handler(progress):
        print("%d%%" % int(progress * 100))

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(
        dataset=dataset_path,
        engine_class=engine_cls,
        progression_handler=progression_handler,
        nb_folds=nb_folds,
        train_size_ratio=train_size_ratio,
        include_slot_metrics=not exclude_slot_metrics,
    )

    from snips_nlu_metrics import compute_cross_val_metrics

    metrics = compute_cross_val_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
Example #23
    def persist(self, path):
        """Persists the NLU engine at the given directory path

        Args:
            path (str or pathlib.Path): the location at which the nlu engine
                must be persisted. This path must not exist when calling this
                function.

        Raises:
            PersistingError: when persisting to a path which already exists
        """
        path.mkdir()

        parsers_count = defaultdict(int)
        intent_parsers = []
        for parser in self.intent_parsers:
            parser_name = parser.unit_name
            parsers_count[parser_name] += 1
            count = parsers_count[parser_name]
            if count > 1:
                parser_name = "{n}_{c}".format(n=parser_name, c=count)
            parser_path = path / parser_name
            parser.persist(parser_path)
            intent_parsers.append(parser_name)

        config = None
        if self.config is not None:
            config = self.config.to_dict()

        builtin_entity_parser = None
        if self.builtin_entity_parser is not None:
            builtin_entity_parser = "builtin_entity_parser"
            builtin_entity_parser_path = path / builtin_entity_parser
            self.builtin_entity_parser.persist(builtin_entity_parser_path)

        custom_entity_parser = None
        if self.custom_entity_parser is not None:
            custom_entity_parser = "custom_entity_parser"
            custom_entity_parser_path = path / custom_entity_parser
            self.custom_entity_parser.persist(custom_entity_parser_path)

        model = {
            "unit_name": self.unit_name,
            "dataset_metadata": self.dataset_metadata,
            "intent_parsers": intent_parsers,
            "custom_entity_parser": custom_entity_parser,
            "builtin_entity_parser": builtin_entity_parser,
            "config": config,
            "model_version": __model_version__,
            "training_package_version": __version__
        }

        model_json = json_string(model)
        model_path = path / "nlu_engine.json"
        with model_path.open(mode="w") as f:
            f.write(model_json)

        if self.fitted:
            required_resources = self.config.get_required_resources()
            language = self.dataset_metadata["language_code"]
            resources_path = path / "resources"
            resources_path.mkdir()
            persist_resources(self.resources, resources_path / language,
                              required_resources)
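A sketch of the public round trip built on top of this persist method; the dataset file and target directory are placeholders, and SnipsNLUEngine.from_path is the counterpart used to reload a persisted engine.

# Hypothetical round trip: train, persist to a fresh directory, reload.
import json
from snips_nlu import SnipsNLUEngine

with open("dataset.json", encoding="utf-8") as f:
    dataset = json.load(f)

engine = SnipsNLUEngine().fit(dataset)
engine.persist("trained_engine")  # the directory must not already exist
reloaded_engine = SnipsNLUEngine.from_path("trained_engine")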
Example #24
    def writeJsonContent(path, json_dict):
        json_content = json_string(json_dict)
        with path.open(mode="w", encoding="utf8") as f:
            f.write(json_content)
Example #25
    def persist(self, path):
        # The file must be opened in write mode to persist the entities.
        with path.open("w", encoding="utf-8") as f:
            f.write(json_string(self.entities))
Example #26
    def persist(self, path):
        path = Path(path)
        path.mkdir()
        with (path / "metadata.json").open(mode="w", encoding="utf8") as f:
            unit_dict = {"unit_name": self.unit_name, "fitted": self.fitted}
            f.write(json_string(unit_dict))
Example #27
    def persist_metadata(self, path, **kwargs):
        metadata = {"unit_name": self.unit_name}
        metadata.update(kwargs)
        metadata_json = json_string(metadata)
        with (path / "metadata.json").open(mode="w") as f:
            f.write(metadata_json)
Example #28
def print_parsing_result(engine, query):
    query = unicode_string(query)
    json_dump = json_string(engine.parse(query), sort_keys=True, indent=2)
    print(json_dump)