# Example 1
def test_jieba_load_and_persist_dictionary(
    tmp_path_factory: TempPathFactory,
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    caplog: LogCaptureFixture,
):
    """A custom Jieba dictionary is persisted and can be reloaded from model storage.

    Verifies three things in order: loading before any persist logs a debug
    fallback message, the persisted dictionary file is byte-identical to the
    source file, and loading works even after the original files are deleted.
    """
    dict_dir = tmp_path_factory.mktemp("dictionaries")
    dict_file = dict_dir / "dictionary_1"

    original_contents = """
创新办 3 i
云计算 5
凱特琳 nz
台中
        """
    dict_file.write_text(original_contents, encoding="utf-8")

    def full_config() -> dict:
        # Fresh merge on every call so no shared dict is mutated across
        # the create/load invocations below.
        return {
            **JiebaTokenizer.get_default_config(),
            "dictionary_path": dict_dir,
        }

    resource = Resource("jieba")
    tokenizer = JiebaTokenizer.create(
        full_config(),
        default_model_storage,
        resource,
        default_execution_context,
    )

    tokenizer.process_training_data(TrainingData([Message(data={TEXT: ""})]))

    # Nothing has been persisted yet, so the load path must fall back
    # and emit a debug-level message.
    with caplog.at_level(logging.DEBUG):
        JiebaTokenizer.load(
            full_config(),
            default_model_storage,
            resource,
            default_execution_context,
        )
        assert any(
            "Failed to load JiebaTokenizer from model storage." in message
            for message in caplog.messages
        )

    tokenizer.persist()

    # The persisted copy must match the original dictionary file exactly.
    with default_model_storage.read_from(resource) as resource_dir:
        persisted = (resource_dir / "dictionary_1").read_text(encoding="utf-8")
        assert persisted == original_contents

    # Remove the source files to prove loading reads from model storage,
    # not from the original dictionary path.
    dict_file.unlink()
    dict_dir.rmdir()

    JiebaTokenizer.load(
        full_config(),
        default_model_storage,
        resource,
        default_execution_context,
    )

    tokenizer.process([Message(data={TEXT: ""})])
# Example 2
    # NOTE(review): fragment — the enclosing function's `def` line (and the
    # origin of `text`) is outside this view; confirm against the full file.
    # Builds a bare Message with empty default NLU attributes, then runs it
    # through a persisted Jieba tokenizer and an LTP component, printing the
    # message data after each step.
    context = {}  # NOTE(review): never used below — looks like a leftover; confirm
    time = datetime.datetime.now()  # naive local timestamp attached to the message
    default_output_attributes = {
        "intent": {
            "name": None,  # no intent classified yet
            "confidence": 0.0
        },
        "entities": []  # no entities extracted yet
    }
    message = Message(text, data=default_output_attributes, time=time)

    # Hard-coded path to a specific pre-trained model snapshot.
    model_dir = './models/link/model_20190517-113416'
    model_metadata = Metadata.load(model_dir)

    # Component at index 0 is assumed to be the Jieba tokenizer in this
    # model's pipeline — TODO confirm the pipeline order in the metadata.
    jieba_meta = model_metadata.for_component(index=0)
    jie = JiebaTokenizer.load(meta=jieba_meta,
                              model_dir=model_dir,
                              model_metadata=Metadata.load(model_dir))

    # Print before/after to show what processing adds to message.data
    # (presumably process() mutates the message in place — confirm).
    pprint.pprint(message.data)
    jie.process(message)

    # Component at index 5 is assumed to be the LTP helper — TODO confirm.
    ltp_meta = model_metadata.for_component(index=5)
    ltp = LtpHelper.load(meta=ltp_meta,
                         model_dir=model_dir,
                         model_metadata=Metadata.load(model_dir))

    pprint.pprint(message.data)
    ltp.process(message)
    pprint.pprint(message.data)