Example 1
from typing import Dict, Optional

from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer


def create_jieba(config: Optional[Dict] = None) -> JiebaTokenizer:
    config = config if config else {}
    return JiebaTokenizer.create(
        {
            **JiebaTokenizer.get_default_config(),
            **config
        }, None, None, None)
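A minimal usage sketch for the helper above (the Message and TEXT imports and the sample sentence are assumptions based on the Rasa 3.x API; the expected tokens mirror Example 9):

from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message

# Build a tokenizer with the default configuration and tokenize one message.
tk = create_jieba()
tokens = tk.tokenize(Message(data={TEXT: "我想去吃兰州拉面"}), attribute=TEXT)
assert [t.text for t in tokens] == ["我", "想", "去", "吃", "兰州", "拉面"]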
Example 2
def test_jieba(text, expected_tokens, expected_indices):
    tk = JiebaTokenizer()

    tokens = tk.tokenize(Message(text), attribute=TEXT_ATTRIBUTE)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
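The parametrization for this test is not part of the excerpt; a plausible set of cases, consistent with the token texts and character offsets shown in Examples 9-11, would sit directly above the function definition:

import pytest

# Hypothetical parametrization for test_jieba above; the end indices follow
# from the offsets and token lengths seen in Examples 9-11.
@pytest.mark.parametrize(
    "text, expected_tokens, expected_indices",
    [
        ("我想去吃兰州拉面",
         ["我", "想", "去", "吃", "兰州", "拉面"],
         [(0, 1), (1, 2), (2, 3), (3, 4), (4, 6), (6, 8)]),
        ("Micheal你好吗?",
         ["Micheal", "你好", "吗", "?"],
         [(0, 7), (7, 9), (9, 10), (10, 11)]),
    ],
)
def test_jieba(text, expected_tokens, expected_indices):
    ...  # body as above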
Example 3
def test_jieba_load_dictionary(tmpdir_factory):
    dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath

    component_config = {"dictionary_path": dictionary_path}

    with patch.object(JiebaTokenizer,
                      "load_custom_dictionary",
                      return_value=None) as mock_method:
        tk = JiebaTokenizer(component_config)
        tk.tokenize(Message(""), attribute=TEXT_ATTRIBUTE)

    mock_method.assert_called_once_with(dictionary_path)
Example 4
def test_jieba_load_dictionary(tmp_path: Path):
    dictionary_path = str(tmp_path)

    component_config = {"dictionary_path": dictionary_path}

    with patch.object(JiebaTokenizer,
                      "load_custom_dictionary",
                      return_value=None) as mock_method:
        tk = JiebaTokenizer(component_config)
        tk.tokenize(Message(data={TEXT: ""}), attribute=TEXT)

    mock_method.assert_called_once_with(dictionary_path)
Example 5
def test_jieba_load_dictionary(tmpdir_factory):
    from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer

    dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath

    component_config = {"dictionary_path": dictionary_path}

    with patch.object(JiebaTokenizer,
                      "load_custom_dictionary",
                      return_value=None) as mock_method:
        tk = JiebaTokenizer(component_config)
        tk.tokenize("")

    mock_method.assert_called_once_with(dictionary_path)
Example 6
def test_custom_intent_symbol(text, expected_tokens):
    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+"
    }

    tk = JiebaTokenizer(component_config)

    message = Message(text)
    message.set(INTENT_ATTRIBUTE, text)

    tk.train(TrainingData([message]))

    assert [
        t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
    ] == expected_tokens
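The parametrize arguments are not part of this excerpt either. With intent_tokenization_flag enabled, the tokenizer splits the intent string on the configured symbol ("+"); an intent without the symbol stays a single token. The sample intents below are illustrative assumptions:

import pytest

# Hypothetical parametrization for test_custom_intent_symbol above.
@pytest.mark.parametrize(
    "text, expected_tokens",
    [
        ("Micheal你好吗?", ["Micheal你好吗?"]),
        ("问候+询问天气", ["问候", "询问天气"]),
    ],
)
def test_custom_intent_symbol(text, expected_tokens):
    ...  # body as above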
Example 7
def test_jieba_add_cls_token():
    from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer

    component_config = {"use_cls_token": True}

    tk = JiebaTokenizer(component_config)

    assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [
        "Micheal",
        "你好",
        "吗",
        "?",
        CLS_TOKEN,
    ]

    assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12]
Example 8
def __init__(self,
             component_config: Optional[Dict[Text, Any]] = None) -> None:
    super(HFTransformersNLP, self).__init__(component_config)
    self._load_model()
    self.whitespace_tokenizer = WhitespaceTokenizer()
    self.jieba_tokenizer = None
    # Only switch to the Jieba-backed path when the "semi-lm" flag is set;
    # guard against component_config being None before the lookup.
    if component_config and component_config.get("semi-lm") == 1:
        self.jieba_tokenizer = JiebaTokenizer(component_config)
        self.tokenizer.do_basic_tokenize = False
Example 9
def test_jieba():
    from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer

    tk = JiebaTokenizer()

    assert [t.text for t in tk.tokenize("我想去吃兰州拉面")] == ["我", "想", "去", "吃", "兰州", "拉面"]

    assert [t.offset for t in tk.tokenize("我想去吃兰州拉面")] == [0, 1, 2, 3, 4, 6]

    assert [t.text for t in tk.tokenize("Micheal你好吗?")] == ["Micheal", "你好", "吗", "?"]

    assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10]
Example 10
def test_jieba():
    from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer

    component_config = {"use_cls_token": False}

    tk = JiebaTokenizer(component_config)

    assert [t.text for t in tk.tokenize("我想去吃兰州拉面")] == ["我", "想", "去", "吃", "兰州", "拉面"]

    assert [t.offset for t in tk.tokenize("我想去吃兰州拉面")] == [0, 1, 2, 3, 4, 6]

    assert [t.text for t in tk.tokenize("Micheal你好吗?")] == ["Micheal", "你好", "吗", "?"]

    assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10]
Example 11
def test_jieba():
    from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
    tk = JiebaTokenizer()

    assert [t.text for t in tk.tokenize("我想去吃兰州拉面")] == \
           ['我', '想', '去', '吃', '兰州', '拉面']

    assert [t.offset for t in tk.tokenize("我想去吃兰州拉面")] == \
           [0, 1, 2, 3, 4, 6]

    assert [t.text for t in tk.tokenize("Micheal你好吗?")] == \
           ['Micheal', '你好', '吗', '?']

    assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == \
           [0, 7, 9, 10]
Example 12
def test_jieba_load_and_persist_dictionary(
    tmp_path_factory: TempPathFactory,
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    caplog: LogCaptureFixture,
):
    dictionary_directory = tmp_path_factory.mktemp("dictionaries")
    dictionary_path = dictionary_directory / "dictionary_1"

    dictionary_contents = """
创新办 3 i
云计算 5
凱特琳 nz
台中
        """
    dictionary_path.write_text(dictionary_contents, encoding="utf-8")

    component_config = {"dictionary_path": dictionary_directory}

    resource = Resource("jieba")
    tk = JiebaTokenizer.create(
        {
            **JiebaTokenizer.get_default_config(),
            **component_config
        },
        default_model_storage,
        resource,
        default_execution_context,
    )

    tk.process_training_data(TrainingData([Message(data={TEXT: ""})]))

    # The dictionary has not been persisted yet.
    with caplog.at_level(logging.DEBUG):
        JiebaTokenizer.load(
            {
                **JiebaTokenizer.get_default_config(),
                **component_config
            },
            default_model_storage,
            resource,
            default_execution_context,
        )
        assert any(
            "Failed to load JiebaTokenizer from model storage." in message
            for message in caplog.messages)

    tk.persist()

    # Check the persisted dictionary matches the original file.
    with default_model_storage.read_from(resource) as resource_dir:
        contents = (resource_dir / "dictionary_1").read_text(encoding="utf-8")
        assert contents == dictionary_contents

    # Delete original files to show that we read from the model storage.
    dictionary_path.unlink()
    dictionary_directory.rmdir()

    JiebaTokenizer.load(
        {
            **JiebaTokenizer.get_default_config(),
            **component_config
        },
        default_model_storage,
        resource,
        default_execution_context,
    )

    tk.process([Message(data={TEXT: ""})])
Example 13
    context = {}
    time = datetime.datetime.now()
    default_output_attributes = {
        "intent": {
            "name": None,
            "confidence": 0.0
        },
        "entities": []
    }
    message = Message(text, data=default_output_attributes, time=time)

    # Load the persisted model's metadata, then restore individual pipeline
    # components from it by their index in the pipeline.
    model_dir = './models/link/model_20190517-113416'
    model_metadata = Metadata.load(model_dir)

    # Component 0: the JiebaTokenizer.
    jieba_meta = model_metadata.for_component(index=0)
    jie = JiebaTokenizer.load(meta=jieba_meta,
                              model_dir=model_dir,
                              model_metadata=model_metadata)

    pprint.pprint(message.data)
    jie.process(message)

    # Component 5: LtpHelper (presumably a custom component in this pipeline).
    ltp_meta = model_metadata.for_component(index=5)
    ltp = LtpHelper.load(meta=ltp_meta,
                         model_dir=model_dir,
                         model_metadata=model_metadata)

    pprint.pprint(message.data)
    ltp.process(message)
    pprint.pprint(message.data)