Example #1
def train_test_metrics(train_dataset_path,
                       test_dataset_path,
                       output_path,
                       config_path=None,
                       exclude_slot_metrics=False,
                       include_errors=False,
                       verbose=False):
    if verbose:
        set_nlu_logger(logging.DEBUG)

    # Build a custom engine class when an NLU config is provided, otherwise
    # fall back to the default SnipsNLUEngine
    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(train_dataset=train_dataset_path,
                        test_dataset=test_dataset_path,
                        engine_class=engine_cls,
                        include_slot_metrics=not exclude_slot_metrics)

    # Load the language resources matching the training dataset's language
    with Path(train_dataset_path).open("r", encoding="utf8") as f:
        load_resources(json.load(f)["language"])

    from snips_nlu_metrics import compute_train_test_metrics

    metrics = compute_train_test_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
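
A minimal sketch of how the helper above might be invoked, assuming it is importable from snips_nlu's CLI metrics module; the module path and the file names are illustrative assumptions, not taken from the example itself.

# Hypothetical invocation of train_test_metrics (module path and file names
# are assumptions made for illustration)
from snips_nlu.cli.metrics import train_test_metrics

train_test_metrics(
    train_dataset_path="train_dataset.json",  # JSON dataset used for training
    test_dataset_path="test_dataset.json",    # JSON dataset used for evaluation
    output_path="metrics.json",               # metrics are written here as JSON
    include_errors=True,                      # keep "parsing_errors" in the output
    verbose=True,                             # enable DEBUG logging
)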
Example #2
    def persist(self, path):
        """Persist the object at the given path"""
        path = Path(path)
        path.mkdir()
        slot_fillers = []
        for intent, slot_filler in iteritems(self.slot_fillers):
            slot_filler_name = "slot_filler_%s" % intent
            slot_filler.persist(path / slot_filler_name)
            slot_fillers.append({
                "intent": intent,
                "slot_filler_name": slot_filler_name
            })

        # Only needed to improve testability
        slot_fillers = sorted(slot_fillers, key=lambda sf: sf["intent"])

        if self.intent_classifier is not None:
            self.intent_classifier.persist(path / "intent_classifier")

        model = {
            "unit_name": self.unit_name,
            "config": self.config.to_dict(),
            "slot_fillers": slot_fillers
        }
        model_json = json_string(model)
        model_path = path / "intent_parser.json"
        with model_path.open(mode="w") as f:
            f.write(model_json)
        self.persist_metadata(path)
Example #3
def cross_val_metrics(dataset_path,
                      output_path,
                      nb_folds=5,
                      train_size_ratio=1.0,
                      exclude_slot_metrics=False,
                      include_errors=False):
    def progression_handler(progress):
        print("%d%%" % int(progress * 100))

    metrics_args = dict(
        dataset=dataset_path,
        engine_class=SnipsNLUEngine,
        progression_handler=progression_handler,
        nb_folds=nb_folds,
        train_size_ratio=train_size_ratio,
        include_slot_metrics=not exclude_slot_metrics,
    )

    with Path(dataset_path).open("r", encoding="utf8") as f:
        load_resources(json.load(f)["language"])

    from snips_nlu_metrics import compute_cross_val_metrics

    metrics = compute_cross_val_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
Example #4
    def persist(self, path):
        """Persist the object at the given path"""
        path = Path(path)
        path.mkdir()
        sorted_slot_fillers = sorted(iteritems(self.slot_fillers))
        slot_fillers = []
        for i, (intent, slot_filler) in enumerate(sorted_slot_fillers):
            slot_filler_name = "slot_filler_%s" % i
            slot_filler.persist(path / slot_filler_name)
            slot_fillers.append({
                "intent": intent,
                "slot_filler_name": slot_filler_name
            })

        if self.intent_classifier is not None:
            self.intent_classifier.persist(path / "intent_classifier")

        model = {
            "config": self.config.to_dict(),
            "slot_fillers": slot_fillers
        }
        model_json = json_string(model)
        model_path = path / "intent_parser.json"
        with model_path.open(mode="w") as f:
            f.write(model_json)
        self.persist_metadata(path)
Example #5
    def persist(self, path):
        """Persist the object at the given path"""
        path = Path(path)
        path.mkdir()
        classifier_json = json_string(self.to_dict())
        with (path / "intent_classifier.json").open(mode="w") as f:
            f.write(classifier_json)
        self.persist_metadata(path)
Example #6
    def persist(self, path):
        """Persist the parser at the given path"""
        path = Path(path)
        path.mkdir()
        parser_directory = "parser"
        metadata = {
            "language": self.language,
            "parser_usage": self.parser_usage.value,
            "parser_directory": parser_directory
        }
        with (path / "metadata.json").open(mode="w", encoding="utf8") as f:
            f.write(json_string(metadata))
        self._parser.persist(path / parser_directory)
Example #7
def _build_builtin_parser(language, gazetteer_entities):
    with temp_dir() as serialization_dir:
        gazetteer_entity_parser = None
        if gazetteer_entities:
            gazetteer_entity_parser = _build_gazetteer_parser(
                serialization_dir, gazetteer_entities, language)

        metadata = {
            "language": language.upper(),
            "gazetteer_parser": gazetteer_entity_parser
        }
        metadata_path = serialization_dir / "metadata.json"
        with metadata_path.open("w", encoding="utf-8") as f:
            f.write(json_string(metadata))
        parser = _BuiltinEntityParser.from_path(serialization_dir)
        return BuiltinEntityParser(parser)
Example #8
    def persist(self, path):
        """Persist the NLU engine at the given directory path

        Args:
            path (str): the location at which the NLU engine must be persisted.
                This path must not exist when calling this function.
        """
        directory_path = Path(path)
        directory_path.mkdir()

        parsers_count = defaultdict(int)
        intent_parsers = []
        for parser in self.intent_parsers:
            parser_name = parser.unit_name
            parsers_count[parser_name] += 1
            count = parsers_count[parser_name]
            if count > 1:
                parser_name = "{n}_{c}".format(n=parser_name, c=count)
            parser_path = directory_path / parser_name
            parser.persist(parser_path)
            intent_parsers.append(parser_name)

        config = None
        if self.config is not None:
            config = self.config.to_dict()

        model = {
            "unit_name": self.unit_name,
            "dataset_metadata": self._dataset_metadata,
            "intent_parsers": intent_parsers,
            "config": config,
            "model_version": __model_version__,
            "training_package_version": __version__
        }
        model_json = json_string(model)
        model_path = directory_path / "nlu_engine.json"
        with model_path.open(mode="w") as f:
            f.write(model_json)

        if self.fitted:
            required_resources = self.config.get_required_resources()
            if required_resources:
                language = self._dataset_metadata["language_code"]
                resources_path = directory_path / "resources"
                resources_path.mkdir()
                persist_resources(resources_path / language,
                                  required_resources, language)
Example #9
def cross_val_metrics(dataset_path,
                      output_path,
                      config_path=None,
                      nb_folds=5,
                      train_size_ratio=1.0,
                      exclude_slot_metrics=False,
                      include_errors=False,
                      verbose=False):
    if verbose:
        set_nlu_logger(logging.DEBUG)

    def progression_handler(progress):
        print("%d%%" % int(progress * 100))

    if config_path is not None:
        with Path(config_path).open("r", encoding="utf-8") as f:
            config = json.load(f)
        engine_cls = make_engine_cls(config)
    else:
        engine_cls = SnipsNLUEngine

    metrics_args = dict(
        dataset=dataset_path,
        engine_class=engine_cls,
        progression_handler=progression_handler,
        nb_folds=nb_folds,
        train_size_ratio=train_size_ratio,
        include_slot_metrics=not exclude_slot_metrics,
    )

    with Path(dataset_path).open("r", encoding="utf8") as f:
        load_resources(json.load(f)["language"])

    from snips_nlu_metrics import compute_cross_val_metrics

    metrics = compute_cross_val_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
Example #10
def train_test_metrics(train_dataset_path,
                       test_dataset_path,
                       output_path,
                       exclude_slot_metrics=False,
                       include_errors=False):
    metrics_args = dict(train_dataset=train_dataset_path,
                        test_dataset=test_dataset_path,
                        engine_class=SnipsNLUEngine,
                        include_slot_metrics=not exclude_slot_metrics)

    with Path(train_dataset_path).open("r", encoding="utf8") as f:
        load_resources(json.load(f)["language"])

    from snips_nlu_metrics import compute_train_test_metrics

    metrics = compute_train_test_metrics(**metrics_args)
    if not include_errors:
        metrics.pop("parsing_errors")

    with Path(output_path).open(mode="w", encoding="utf8") as f:
        f.write(json_string(metrics))
Example #11
    def persist(self, path):
        """Persist the object at the given path"""
        path = Path(path)
        path.mkdir()

        crf_model_file = None
        if self.crf_model is not None:
            destination = path / Path(self.crf_model.modelfile.name).name
            shutil.copy(self.crf_model.modelfile.name, str(destination))
            crf_model_file = str(destination.name)

        model = {
            "language_code": self.language,
            "intent": self.intent,
            "crf_model_file": crf_model_file,
            "slot_name_mapping": self.slot_name_mapping,
            "config": self.config.to_dict(),
        }
        model_json = json_string(model)
        model_path = path / "slot_filler.json"
        with model_path.open(mode="w") as f:
            f.write(model_json)
        self.persist_metadata(path)
Example #12
def _build_gazetteer_parser(target_dir, gazetteer_entities, language):
    gazetteer_parser_name = "gazetteer_entity_parser"
    gazetteer_parser_path = target_dir / gazetteer_parser_name
    gazetteer_parser_metadata = []
    for ent in sorted(gazetteer_entities):
        # Fetch the compiled parser in the resources
        source_parser_path = find_gazetteer_entity_data_path(language, ent)
        short_name = get_builtin_entity_shortname(ent).lower()
        target_parser_path = gazetteer_parser_path / short_name
        parser_metadata = {
            "entity_identifier": ent,
            "entity_parser": short_name
        }
        gazetteer_parser_metadata.append(parser_metadata)
        # Copy the single entity parser
        shutil.copytree(str(source_parser_path), str(target_parser_path))
    # Dump the parser metadata
    gazetteer_entity_parser_metadata = {
        "parsers_metadata": gazetteer_parser_metadata
    }
    gazetteer_parser_metadata_path = gazetteer_parser_path / "metadata.json"
    with gazetteer_parser_metadata_path.open("w", encoding="utf-8") as f:
        f.write(json_string(gazetteer_entity_parser_metadata))
    return gazetteer_parser_name
Example #13
    def test_should_be_serializable(self):
        # Given
        language = LANGUAGE_EN
        tfidf_vectorizer = _get_tfidf_vectorizer(language)

        pvalue_threshold = 0.42
        featurizer = Featurizer(language,
                                config=FeaturizerConfig(
                                    pvalue_threshold=pvalue_threshold,
                                    word_clusters_name="brown_clusters"),
                                unknown_words_replacement_string=None,
                                tfidf_vectorizer=tfidf_vectorizer)
        dataset = {
            "entities": {
                "entity2": {
                    "data": [{
                        "value": "entity1",
                        "synonyms": ["entity1"]
                    }],
                    "use_synonyms": True,
                    "automatically_extensible": True
                }
            },
            "intents": {},
            "language": "en"
        }
        dataset = validate_and_format_dataset(dataset)

        utterances = [
            "hello world", "beautiful world", "hello here", "bird birdy",
            "beautiful bird"
        ]
        utterances = [text_to_utterance(u) for u in utterances]
        classes = np.array([0, 0, 0, 1, 1])

        featurizer.fit(dataset, utterances, classes)

        # When
        serialized_featurizer = featurizer.to_dict()

        # Then
        msg = "Featurizer dict should be json serializable to utf8."
        with self.fail_if_exception(msg):
            dumped = json_string(serialized_featurizer)

        msg = "SnipsNLUEngine should be deserializable from dict with unicode" \
              " values"
        with self.fail_if_exception(msg):
            _ = Featurizer.from_dict(json.loads(dumped))

        vocabulary = tfidf_vectorizer.vocabulary_
        # pylint: disable=W0212
        idf_diag = tfidf_vectorizer._tfidf._idf_diag.data.tolist()
        # pylint: enable=W0212

        best_features = featurizer.best_features
        entity_utterances_to_feature_names = {
            "entity1": ["entityfeatureentity2"]
        }

        expected_serialized = {
            "config": {
                'sublinear_tf': False,
                'pvalue_threshold': pvalue_threshold,
                'word_clusters_name': "brown_clusters"
            },
            "language_code": "en",
            "tfidf_vectorizer": {
                "idf_diag": idf_diag,
                "vocab": vocabulary
            },
            "best_features": best_features,
            "entity_utterances_to_feature_names":
            entity_utterances_to_feature_names,
            "unknown_words_replacement_string": None
        }
        self.assertDictEqual(expected_serialized, serialized_featurizer)
Example #14
    def persist_metadata(self, path, **kwargs):
        """Persist the unit metadata under the given directory"""
        metadata = {"unit_name": self.unit_name}
        metadata.update(kwargs)
        metadata_json = json_string(metadata)
        with (path / "metadata.json").open(mode="w") as f:
            f.write(metadata_json)
Example #15
    def persist(self, path):
        """Persist the object at the given path"""
        path = Path(path)
        path.mkdir()
        with (path / "metadata.json").open(mode="w") as f:
            f.write(json_string({"unit_name": self.unit_name}))
Example #16
def writeJsonContent(path, json_dict):
    """Serialize json_dict and write it to the given path"""
    json_content = json_string(json_dict)
    with path.open(mode="w") as f:
        f.write(json_content)
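
A hedged usage sketch of the helper above; the target path and the payload are illustrative assumptions only, and json_string is expected to be in scope as in the other examples.

from pathlib import Path

# Hypothetical call: dump a small unit dictionary to disk (the file name and
# the dictionary contents are illustrative only)
writeJsonContent(Path("intent_parser.json"),
                 {"unit_name": "deterministic_intent_parser"})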
Example #17
def persist_resources(resources_dest_path, required_resources, language):
    if not required_resources:
        return

    resources_dest_path.mkdir()

    resources_src_path = Path(get_resources_dir(language))
    with (resources_src_path / "metadata.json").open(encoding="utf8") as f:
        metadata = json.load(f)

    # Update metadata and keep only required resources
    if not required_resources.get(NOISE, False):
        metadata[NOISE] = None
    if not required_resources.get(STOP_WORDS, False):
        metadata[STOP_WORDS] = None
    if not required_resources.get(STEMS, False):
        metadata[STEMS] = None

    metadata[GAZETTEERS] = sorted(required_resources.get(GAZETTEERS, []))
    metadata[WORD_CLUSTERS] = sorted(required_resources.get(WORD_CLUSTERS, []))
    metadata_dest_path = resources_dest_path / "metadata.json"
    metadata_json = json_string(metadata)
    with metadata_dest_path.open(encoding="utf8", mode="w") as f:
        f.write(metadata_json)

    if metadata[NOISE] is not None:
        noise_src = (resources_src_path / metadata[NOISE]).with_suffix(".txt")
        noise_dest = (resources_dest_path / noise_src.name)
        shutil.copy(str(noise_src), str(noise_dest))

    if metadata[STOP_WORDS] is not None:
        stop_words_src = (resources_src_path / metadata[STOP_WORDS]) \
            .with_suffix(".txt")
        stop_words_dest = (resources_dest_path / stop_words_src.name)
        shutil.copy(str(stop_words_src), str(stop_words_dest))

    if metadata[STEMS] is not None:
        stems_src = (resources_src_path / "stemming" / metadata[STEMS]) \
            .with_suffix(".txt")
        stemming_dir = resources_dest_path / "stemming"
        stemming_dir.mkdir()
        stems_dest = stemming_dir / stems_src.name
        shutil.copy(str(stems_src), str(stems_dest))

    if metadata[GAZETTEERS]:
        gazetteer_src_dir = resources_src_path / "gazetteers"
        gazetteer_dest_dir = resources_dest_path / "gazetteers"
        gazetteer_dest_dir.mkdir()
        for gazetteer in metadata[GAZETTEERS]:
            gazetteer_src = (gazetteer_src_dir / gazetteer) \
                .with_suffix(".txt")
            gazetteer_dest = gazetteer_dest_dir / gazetteer_src.name
            shutil.copy(str(gazetteer_src), str(gazetteer_dest))

    if metadata[WORD_CLUSTERS]:
        clusters_src_dir = resources_src_path / "word_clusters"
        clusters_dest_dir = resources_dest_path / "word_clusters"
        clusters_dest_dir.mkdir()
        for word_clusters in metadata[WORD_CLUSTERS]:
            clusters_src = (clusters_src_dir / word_clusters) \
                .with_suffix(".txt")
            clusters_dest = clusters_dest_dir / clusters_src.name
            shutil.copy(str(clusters_src), str(clusters_dest))
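
Example #8 shows the typical call site for this function; below is a minimal hedged sketch of the same pattern, where the destination directory, the resource selection, the gazetteer name and the constants module are assumptions made for illustration.

from pathlib import Path

# Constants module location is assumed; the values mirror the keys used in
# persist_resources above
from snips_nlu.constants import GAZETTEERS, STOP_WORDS

# Hypothetical destination directory and resource selection
language = "en"
required_resources = {STOP_WORDS: True, GAZETTEERS: ["top_10000_words"]}
resources_path = Path("my_engine") / "resources"
resources_path.mkdir(parents=True)

# Copy only the stop words and the listed gazetteer for the given language
persist_resources(resources_path / language, required_resources, language)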