Code example #1
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
        **kwargs: Any,
    ) -> SklearnIntentClassifier:
        """Loads trained component (see parent class for full docstring)."""
        from sklearn.preprocessing import LabelEncoder

        try:
            with model_storage.read_from(resource) as model_dir:
                file_name = cls.__name__
                classifier_file = model_dir / f"{file_name}_classifier.pkl"

                if classifier_file.exists():
                    classifier = io_utils.json_unpickle(classifier_file)

                    encoder_file = model_dir / f"{file_name}_encoder.pkl"
                    classes = io_utils.json_unpickle(encoder_file)
                    encoder = LabelEncoder()
                    encoder.classes_ = classes

                    return cls(config, model_storage, resource, classifier,
                               encoder)
        except ValueError:
            logger.debug(
                f"Failed to load '{cls.__name__}' from model storage. Resource "
                f"'{resource.name}' doesn't exist.")
        return cls(config, model_storage, resource)
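
For context, the files read above come from a matching persist step. A minimal sketch of that write side, assuming hypothetical attribute names `self.clf` and `self.le` for the trained classifier and label encoder, and assuming `io_utils.json_pickle` mirrors the `json_unpickle` calls in `load`:

    def persist(self) -> None:
        """Sketch of the write side of `load` above; attribute names are illustrative."""
        with self._model_storage.write_to(self._resource) as model_dir:
            file_name = self.__class__.__name__
            # mirror the two files that `load` reads back
            io_utils.json_pickle(model_dir / f"{file_name}_classifier.pkl", self.clf)
            io_utils.json_pickle(model_dir / f"{file_name}_encoder.pkl", self.le.classes_)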
Code example #2
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
        **kwargs: Any,
    ) -> MitieIntentClassifierGraphComponent:
        """Loads component for inference see parent class for full docstring)."""
        import mitie

        text_categorizer = None

        try:
            with model_storage.read_from(resource) as directory:
                text_categorizer = mitie.text_categorizer(
                    str(directory / "model.dat"))
        except (
                ValueError,  # raised by `model_storage.read_from` for a missing resource
                Exception,  # raised by `mitie.text_categorizer` itself
        ):
            logger.warning(
                f"Failed to load {cls.__class__.__name__} from model storage. Resource "
                f"'{resource.name}' doesn't exist.")

        return cls(config, model_storage, resource, text_categorizer)
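
The corresponding persist step writes `model.dat` into the same resource. A hedged sketch, assuming mitie's `save_to_disk` method and an instance attribute `self._text_categorizer`:

    def persist(self) -> None:
        """Sketch of the write side; skips persisting when training produced no model."""
        if self._text_categorizer is None:
            return
        with self._model_storage.write_to(self._resource) as directory:
            self._text_categorizer.save_to_disk(str(directory / "model.dat"))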
Code example #3
File: test_graph_node.py Project: ChenHuaYou/rasa
def test_writing_to_resource_during_training(
        default_model_storage: ModelStorage):
    node_name = "some_name"

    test_value_for_sub_directory = {"test": "test value sub dir"}
    test_value = {"test dir": "test value dir"}

    node = GraphNode(
        node_name=node_name,
        component_class=PersistableTestComponent,
        constructor_name="create",
        component_config={
            "test_value": test_value,
            "test_value_for_sub_directory": test_value_for_sub_directory,
        },
        fn_name="train",
        inputs={},
        eager=False,
        model_storage=default_model_storage,
        resource=None,
        execution_context=ExecutionContext(GraphSchema({}), "123"),
    )

    _, resource = node()

    assert resource == Resource(node_name)

    with default_model_storage.read_from(resource) as directory:
        assert (rasa.shared.utils.io.read_json_file(directory /
                                                    "test.json") == test_value)
        assert (rasa.shared.utils.io.read_json_file(
            directory / "sub_dir" /
            "test.json") == test_value_for_sub_directory)
Code example #4
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
        **kwargs: Any,
    ) -> CountVectorsFeaturizer:
        """Loads trained component (see parent class for full docstring)."""
        try:
            with model_storage.read_from(resource) as model_dir:
                featurizer_file = model_dir / "vocabularies.pkl"
                vocabulary = io_utils.json_unpickle(featurizer_file)

                share_vocabulary = config["use_shared_vocab"]

                if share_vocabulary:
                    vectorizers = cls._create_shared_vocab_vectorizers(
                        config, vocabulary=vocabulary
                    )
                else:
                    vectorizers = cls._create_independent_vocab_vectorizers(
                        config, vocabulary=vocabulary
                    )

                oov_words = rasa.shared.utils.io.read_json_file(
                    model_dir / "oov_words.json"
                )

                ftr = cls(
                    config,
                    model_storage,
                    resource,
                    execution_context,
                    vectorizers=vectorizers,
                    oov_token=config["OOV_token"],
                    oov_words=oov_words,
                )

                # make sure the vocabulary has been loaded correctly
                for attribute in vectorizers:
                    ftr.vectorizers[attribute]._validate_vocabulary()

                return ftr

        except (ValueError, FileNotFoundError, FileIOException):
            logger.debug(
                f"Failed to load `{cls.__class__.__name__}` from model storage. "
                f"Resource '{resource.name}' doesn't exist."
            )
            return cls(
                config=config,
                model_storage=model_storage,
                resource=resource,
                execution_context=execution_context,
            )
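
The two files read here, `vocabularies.pkl` and `oov_words.json`, come from the featurizer's persist step. A rough sketch of that side; the vocabulary-gathering helper `_collect_vectorizer_vocabularies` and the `OOV_words` attribute are assumptions:

    def persist(self) -> None:
        """Sketch of the write side matching the files read in `load` (names are illustrative)."""
        with self._model_storage.write_to(self._resource) as model_dir:
            # one vocabulary per message attribute, gathered from the fitted vectorizers
            vocabulary = self._collect_vectorizer_vocabularies()
            io_utils.json_pickle(model_dir / "vocabularies.pkl", vocabulary)
            rasa.shared.utils.io.dump_obj_as_json_to_file(
                model_dir / "oov_words.json", self.OOV_words)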
Code example #5
def test_read_from_not_existing_resource(default_model_storage: ModelStorage):
    with default_model_storage.write_to(
            Resource("resource1")) as temporary_directory:
        file = temporary_directory / "file.txt"
        file.write_text("test")

    with pytest.raises(ValueError):
        with default_model_storage.read_from(
                Resource("a different resource")) as _:
            pass
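
The happy-path counterpart of this test reads back the same resource that was written. A minimal round-trip sketch using only the APIs shown above:

def test_read_from_existing_resource(default_model_storage: ModelStorage):
    resource = Resource("resource1")
    with default_model_storage.write_to(resource) as temporary_directory:
        (temporary_directory / "file.txt").write_text("test")

    # reading the same resource back succeeds and exposes the persisted file
    with default_model_storage.read_from(resource) as read_directory:
        assert (read_directory / "file.txt").read_text() == "test"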
Code example #6
File: resource.py Project: praneethgb/rasa
    def to_cache(self, directory: Path, model_storage: ModelStorage) -> None:
        """Persists the `Resource` to the cache.

        Args:
            directory: The directory which receives the persisted `Resource`.
            model_storage: The model storage which currently contains the persisted
                `Resource`.
        """
        with model_storage.read_from(self) as resource_directory:
            rasa.utils.common.copy_directory(resource_directory, directory)
Code example #7
async def test_train_model_not_checkpointing(
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    create_train_load_and_process_diet: Callable[..., Message],
):
    create_train_load_and_process_diet({EPOCHS: 1, CHECKPOINT_MODEL: False})

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        all_files = list(model_dir.rglob("*.*"))
        assert not any(
            "from_checkpoint" in str(filename) for filename in all_files)
Code example #8
async def test_train_model_not_checkpointing(
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    create_train_load_and_process_diet: Callable[..., Message],
):
    create_train_load_and_process_diet({EPOCHS: 2, CHECKPOINT_MODEL: False})

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        checkpoint_dir = model_dir / "checkpoints"

        assert not checkpoint_dir.is_dir()
Code example #9
File: domain_provider.py Project: zoovu/rasa
 def load(
     cls,
     config: Dict[Text, Any],
     model_storage: ModelStorage,
     resource: Resource,
     execution_context: ExecutionContext,
     **kwargs: Any,
 ) -> DomainProvider:
     """Creates provider using a persisted version of itself."""
     with model_storage.read_from(resource) as resource_directory:
         domain = Domain.from_path(resource_directory)
     return cls(model_storage, resource, domain)
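
`Domain.from_path` implies the provider persisted the domain into the resource during training. A hedged sketch of that write step, assuming `Domain.persist` and a `domain.yml` file name:

    def _persist(self, domain: Domain) -> None:
        """Sketch of the write side; the file name is an assumption."""
        with self._model_storage.write_to(self._resource) as resource_directory:
            domain.persist(resource_directory / "domain.yml")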
Code example #10
def test_caching_empty_resource(
    default_model_storage: ModelStorage,
    tmp_path: Path,
    tmp_path_factory: TempPathFactory,
):
    resource_name = "my resource"
    resource = Resource(resource_name)

    # does not raise
    resource.to_cache(tmp_path, default_model_storage)

    with pytest.raises(ValueError):
        with default_model_storage.read_from(resource) as _:
            pass

    cache_dir = tmp_path_factory.mktemp("cache_dir")

    # this doesn't create an empty directory in `default_model_storage`
    Resource.from_cache(resource_name, cache_dir, default_model_storage,
                        resource.output_fingerprint)

    with pytest.raises(ValueError):
        with default_model_storage.read_from(resource) as _:
            pass
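
`Resource.from_cache` is the inverse of `to_cache` (examples #6 and #11): it copies cached files back into the model storage, and skips the copy for an empty cache so that `read_from` keeps raising `ValueError`, as asserted above. A rough sketch of that behavior (implementation details are assumptions):

    @classmethod
    def from_cache(
        cls,
        node_name: Text,
        directory: Path,
        model_storage: ModelStorage,
        output_fingerprint: Text,
    ) -> Resource:
        """Sketch of restoring a cached `Resource`."""
        resource = cls(node_name, output_fingerprint=output_fingerprint)
        if any(directory.glob("*")):
            # only a non-empty cache is copied back into the model storage
            with model_storage.write_to(resource) as resource_directory:
                rasa.utils.common.copy_directory(directory, resource_directory)
        return resource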
Code example #11
File: resource.py Project: zoovu/rasa
    def to_cache(self, directory: Path, model_storage: ModelStorage) -> None:
        """Persists the `Resource` to the cache.

        Args:
            directory: The directory which receives the persisted `Resource`.
            model_storage: The model storage which currently contains the persisted
                `Resource`.
        """
        try:
            with model_storage.read_from(self) as resource_directory:
                rasa.utils.common.copy_directory(resource_directory, directory)
        except ValueError:
            logger.debug(
                f"Skipped caching resource '{self.name}' as no persisted "
                f"data was found.")
Code example #12
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
        **kwargs: Any,
    ) -> PersistableTestComponent:
        assert model_storage
        assert resource

        with model_storage.read_from(resource) as directory:
            eager_instantiated_value = rasa.shared.utils.io.read_json_file(
                directory / "test.json")
        return cls(config, model_storage, resource, eager_instantiated_value)
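
A plausible sketch of the persistence that produces the `test.json` read above, assuming the config keys used in the `GraphNode` test of example #3:

    def persist(self) -> None:
        """Sketch of the write side producing `test.json` and `sub_dir/test.json`."""
        with self._model_storage.write_to(self._resource) as directory:
            rasa.shared.utils.io.dump_obj_as_json_to_file(
                directory / "test.json", self._config["test_value"])
            sub_dir = directory / "sub_dir"
            sub_dir.mkdir()
            rasa.shared.utils.io.dump_obj_as_json_to_file(
                sub_dir / "test.json",
                self._config["test_value_for_sub_directory"])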
Code example #13
def test_train_model_checkpointing(
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    default_model_storage: ModelStorage,
    train_and_preprocess: Callable[..., Tuple[TrainingData,
                                              List[GraphComponent]]],
):
    pipeline = [
        {
            "component": WhitespaceTokenizer
        },
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 3,
            "max_ngram": 17,
            "max_features": 10,
            "min_df": 5,
        },
    ]

    training_data, loaded_pipeline = train_and_preprocess(
        pipeline, "data/test_selectors")

    config_params = {
        EPOCHS: 5,
        MODEL_CONFIDENCE: "softmax",
        CONSTRAIN_SIMILARITIES: True,
        CHECKPOINT_MODEL: True,
        EVAL_NUM_EPOCHS: 1,
        EVAL_NUM_EXAMPLES: 10,
    }

    response_selector = create_response_selector(config_params)
    assert response_selector.component_config[CHECKPOINT_MODEL]

    resource = response_selector.train(training_data=training_data)

    with default_model_storage.read_from(resource) as model_dir:
        checkpoint_dir = model_dir / "checkpoints"
        assert checkpoint_dir.is_dir()
        checkpoint_files = list(checkpoint_dir.rglob("*.*"))
        """
        there should be min 2 `tf_model` files in the `checkpoints` directory:
        - tf_model.data
        - tf_model.index
        """
        assert len(checkpoint_files) >= 2
Code example #14
 def load(
     cls,
     config: Dict[Text, Any],
     model_storage: ModelStorage,
     resource: Resource,
     execution_context: ExecutionContext,
 ) -> GraphComponent:
     """Loads trained component from disk."""
     try:
         with model_storage.read_from(resource) as model_dir:
             tfidfvectorizer = load(model_dir / "tfidfvectorizer.joblib")
             component = cls(config, execution_context.node_name,
                             model_storage, resource)
             component.tfm = tfidfvectorizer
     except (ValueError, FileNotFoundError):
         logger.debug(
             f"Couldn't load metadata for component '{cls.__name__}' as the persisted "
             f"model data couldn't be loaded.")
         # Fall back to a fresh, untrained component so `component` is always bound.
         component = cls(config, execution_context.node_name,
                         model_storage, resource)
     return component
Code example #15
File: rule_only_provider.py Project: zoovu/rasa
    def create(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> RuleOnlyDataProvider:
        """Creates component (see parent class for docstring)."""
        rule_only_data = {}
        try:
            with model_storage.read_from(resource) as directory:
                rule_only_data = rasa.shared.utils.io.read_json_file(
                    directory / "rule_only_data.json")
        except ValueError:
            logger.debug(
                "Failed to load rule-only data from a trained 'RulePolicy'. "
                "Providing empty rule-only data instead.")

        return cls(rule_only_data)
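
The JSON read here is written by a trained `RulePolicy`. A hedged sketch of that write side (the method and attribute names are assumptions):

    def _persist_rule_only_data(self) -> None:
        """Sketch: a trained `RulePolicy` stores its rule-only data for this provider."""
        with self._model_storage.write_to(self._resource) as directory:
            rasa.shared.utils.io.dump_obj_as_json_to_file(
                directory / "rule_only_data.json", self._rule_only_data)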
Code example #16
def test_train_model_checkpointing(
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    default_model_storage: ModelStorage,
    train_and_preprocess: Callable[..., Tuple[TrainingData,
                                              List[GraphComponent]]],
):
    pipeline = [
        {
            "component": WhitespaceTokenizer
        },
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 3,
            "max_ngram": 17,
            "max_features": 10,
            "min_df": 5,
        },
    ]

    training_data, loaded_pipeline = train_and_preprocess(
        pipeline, "data/test_selectors")

    config_params = {
        EPOCHS: 2,
        MODEL_CONFIDENCE: "softmax",
        CONSTRAIN_SIMILARITIES: True,
        CHECKPOINT_MODEL: True,
        EVAL_NUM_EPOCHS: 1,
        EVAL_NUM_EXAMPLES: 10,
    }

    response_selector = create_response_selector(config_params)
    assert response_selector.component_config[CHECKPOINT_MODEL]

    resource = response_selector.train(training_data=training_data)

    with default_model_storage.read_from(resource) as model_dir:
        all_files = list(model_dir.rglob("*.*"))
        assert any(
            "from_checkpoint" in str(filename) for filename in all_files)
Code example #17
 def load(
     cls,
     config: Dict[Text, Any],
     model_storage: ModelStorage,
     resource: Resource,
     execution_context: ExecutionContext,
     **kwargs: Any,
 ) -> GraphComponent:
     """Loads trained component (see parent class for full docstring)."""
     try:
         with model_storage.read_from(resource) as model_dir:
             classifier = joblib.load(model_dir / f"{resource.name}.joblib")
             component = cls(config, execution_context.node_name,
                             model_storage, resource)
             component.clf = classifier
             return component
     except ValueError:
         logger.debug(
             f"Failed to load {cls.__class__.__name__} from model storage. Resource "
             f"'{resource.name}' doesn't exist.")
         return cls(config, model_storage, resource, execution_context)
Code example #18
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
        **kwargs: Any,
    ) -> KeywordIntentClassifier:
        """Loads trained component (see parent class for full docstring)."""
        try:
            with model_storage.read_from(resource) as model_dir:
                keyword_file = model_dir / f"{cls.__name__}.json"
                intent_keyword_map = rasa.shared.utils.io.read_json_file(
                    keyword_file)
        except ValueError:
            logger.warning(
                f"Failed to load {cls.__class__.__name__} from model storage. Resource "
                f"'{resource.name}' doesn't exist.")
            intent_keyword_map = None

        return cls(config, model_storage, resource, execution_context,
                   intent_keyword_map)
Code example #19
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
        **kwargs: Any,
    ) -> JiebaTokenizerGraphComponent:
        """Loads a custom dictionary from model storage."""
        dictionary_path = config["dictionary_path"]

        # If a custom dictionary path is in the config we know that it should have
        # been saved to the model storage.
        if dictionary_path is not None:
            try:
                with model_storage.read_from(resource) as resource_directory:
                    cls._load_custom_dictionary(str(resource_directory))
            except ValueError:
                logger.warning(
                    f"Failed to load {cls.__name__} from model storage. "
                    f"Resource '{resource.name}' doesn't exist.")
        return cls(config, model_storage, resource)
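
The matching persist step copies the user-supplied dictionary into the model storage; this is what `tk.persist()` exercises in example #23 below. A hedged sketch, assuming `Path` from `pathlib`:

    def persist(self) -> None:
        """Sketch of the write side; copies the custom dictionary, if any, into storage."""
        dictionary_path = self._config["dictionary_path"]
        if dictionary_path is not None:
            with self._model_storage.write_to(self._resource) as resource_directory:
                rasa.utils.common.copy_directory(
                    Path(dictionary_path), resource_directory)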
Code example #20
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
        **kwargs: Any,
    ) -> CRFEntityExtractor:
        """Loads trained component (see parent class for full docstring)."""
        import joblib

        try:
            entity_taggers = OrderedDict()
            with model_storage.read_from(resource) as model_dir:
                # We have to load in the same order as we persisted things as otherwise
                # the predictions might be off
                file_names = sorted(model_dir.glob("**/*.pkl"))
                if not file_names:
                    logger.debug(
                        "Failed to load model for 'CRFEntityExtractor'. "
                        "Maybe you did not provide enough training data and "
                        "no model was trained."
                    )
                    return cls(config, model_storage, resource)

                for file_name in file_names:
                    name = file_name.stem[1:]
                    entity_taggers[name] = joblib.load(file_name)

                return cls(config, model_storage, resource, entity_taggers)
        except ValueError:
            logger.warning(
                f"Failed to load {cls.__name__} from model storage. Resource "
                f"'{resource.name}' doesn't exist."
            )
            return cls(config, model_storage, resource)
Code example #21
async def test_train_model_checkpointing(
    default_model_storage: ModelStorage,
    default_diet_resource: Resource,
    create_train_load_and_process_diet: Callable[..., Message],
):
    create_train_load_and_process_diet(
        {EPOCHS: 2, EVAL_NUM_EPOCHS: 1, EVAL_NUM_EXAMPLES: 10, CHECKPOINT_MODEL: True},
    )

    with default_model_storage.read_from(default_diet_resource) as model_dir:
        checkpoint_dir = model_dir / "checkpoints"

        assert checkpoint_dir.is_dir()

        """
        Tricky to validate the *exact* number of files that should be there, however
        there must be at least the following:
            - metadata.json
            - checkpoint
            - component_1_CountVectorsFeaturizer (as per the pipeline above)
            - component_2_DIETClassifier files (more than 1 file)
        """
        all_files = list(model_dir.rglob("*.*"))
        assert len(all_files) > 4
Code example #22
 def load(
     cls,
     config: Dict[Text, Any],
     model_storage: ModelStorage,
     resource: Resource,
     execution_context: ExecutionContext,
     **kwargs: Any,
 ) -> GraphComponent:
     """Loads a `FineTuningValidator` (see parent class for full docstring)."""
     try:
         with model_storage.read_from(resource) as path:
              fingerprints = rasa.shared.utils.io.read_json_file(
                  filename=path / cls.FILENAME)
             return cls(
                 model_storage=model_storage,
                 execution_context=execution_context,
                 resource=resource,
                 fingerprints=fingerprints,
             )
     except ValueError as e:
         raise InvalidConfigException(
             f"Loading {cls.__name__} failed. Ensure that the {cls.__name__} "
             f"is part of your training graph and re-train your models before "
             f"attempting to use the {cls.__name__}.") from e
Code example #23
def test_jieba_load_and_persist_dictionary(
    tmp_path_factory: TempPathFactory,
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    caplog: LogCaptureFixture,
):
    dictionary_directory = tmp_path_factory.mktemp("dictionaries")
    dictionary_path = dictionary_directory / "dictionary_1"

    dictionary_contents = """
创新办 3 i
云计算 5
凱特琳 nz
台中
        """
    dictionary_path.write_text(dictionary_contents, encoding="utf-8")

    component_config = {"dictionary_path": dictionary_directory}

    resource = Resource("jieba")
    tk = JiebaTokenizerGraphComponent.create(
        {
            **JiebaTokenizerGraphComponent.get_default_config(),
            **component_config
        },
        default_model_storage,
        resource,
        default_execution_context,
    )

    tk.process_training_data(TrainingData([Message(data={TEXT: ""})]))

    # The dictionary has not been persisted yet.
    with caplog.at_level(logging.WARN):
        JiebaTokenizerGraphComponent.load(
            {
                **JiebaTokenizerGraphComponent.get_default_config(),
                **component_config
            },
            default_model_storage,
            resource,
            default_execution_context,
        )
        assert any(
            "Failed to load JiebaTokenizerGraphComponent from model storage."
            in message for message in caplog.messages)

    tk.persist()

    # Check the persisted dictionary matches the original file.
    with default_model_storage.read_from(resource) as resource_dir:
        contents = (resource_dir / "dictionary_1").read_text(encoding="utf-8")
        assert contents == dictionary_contents

    # Delete original files to show that we read from the model storage.
    dictionary_path.unlink()
    dictionary_directory.rmdir()

    JiebaTokenizerGraphComponent.load(
        {
            **JiebaTokenizerGraphComponent.get_default_config(),
            **component_config
        },
        default_model_storage,
        resource,
        default_execution_context,
    )

    tk.process([Message(data={TEXT: ""})])
Code example #24
def test_nlu_training_data_provider(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    config_path: Text,
    nlu_data_path: Text,
):
    # create a resource and an importer
    resource = Resource("xy")
    importer = TrainingDataImporter.load_from_config(
        config_path=config_path, training_data_paths=[nlu_data_path])

    # check the default configuration is as expected
    config_1 = NLUTrainingDataProvider.get_default_config()
    assert config_1["language"] is None
    assert config_1["persist"] is False

    # create a provider with persist == True
    provider_1 = NLUTrainingDataProvider.create(
        {
            "language": "en",
            "persist": True
        },
        default_model_storage,
        resource,
        default_execution_context,
    )
    assert isinstance(provider_1, NLUTrainingDataProvider)

    # check the data provided is as expected
    data_0 = provider_1.provide(importer)
    data_1 = importer.get_nlu_data(language="en")
    assert data_0.fingerprint() == data_1.fingerprint()

    # check the data was persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(str(resource_directory),
                                 DEFAULT_TRAINING_DATA_OUTPUT_PATH)
        data = load_data(resource_name=data_file, language="en")
        assert os.path.isfile(data_file)
        assert isinstance(data, TrainingData)

        # delete the persisted data
        os.remove(data_file)
        assert not os.path.isfile(data_file)

    # create a provider with persist == False
    provider_2 = NLUTrainingDataProvider.create(
        {
            "language": "en",
            "persist": False
        },
        default_model_storage,
        resource,
        default_execution_context,
    )
    provider_2.provide(importer)

    # check the data was not persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(str(resource_directory),
                                 DEFAULT_TRAINING_DATA_OUTPUT_PATH)
        assert not os.path.isfile(data_file)
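
Taken together, these examples follow one recurring contract: `model_storage.read_from(resource)` yields the directory a component previously persisted into, and raises `ValueError` when the resource was never written. A generic, illustrative template of the pattern (the file and state names are placeholders):

    @classmethod
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
        **kwargs: Any,
    ) -> GraphComponent:
        """Illustrative template: load persisted state, or fall back to untrained."""
        try:
            with model_storage.read_from(resource) as directory:
                state = rasa.shared.utils.io.read_json_file(directory / "state.json")
                return cls(config, model_storage, resource, state)
        except ValueError:
            # `read_from` raises ValueError if the resource was never persisted
            logger.debug(f"Failed to load '{cls.__name__}'; returning an untrained component.")
            return cls(config, model_storage, resource, None)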