Ejemplo n.º 1
0
Archivo: cli.py Proyecto: Esukhia/pybo
def tok(**kwargs):
    """Tokenize one text file, or every ``*.txt`` file of a directory.

    Keyword Args:
        input_dir: path of the file or directory to tokenize.
        dialect_name: when truthy, a ``Config`` is built from this name and
            its dialect pack path is saved.
        dialect_path: when truthy (and ``dialect_name`` is not), a ``Config``
            is built from this path and its dialect pack path is saved.
        rebuild_trie: forwarded to ``WordTokenizer`` as ``build_trie``.
        tags: when truthy, replaces the default arguments of ``pybo_mod``
            with a one-element tuple holding the list of tags.
        o: output directory, or ``None`` to use a ``<name>_tok`` directory
            next to the input.

    The output directory is created when missing, whether it was given
    explicitly through ``o`` or derived from the input path. Each input file
    ``<stem>.txt`` is processed into ``<stem>_tok.txt`` in that directory.
    When ``input_dir`` is neither a directory nor a file, a message is
    printed and nothing is processed.
    """
    input_dir = Path(kwargs["input_dir"])
    dialect_name = kwargs["dialect_name"]
    dialect_path = kwargs["dialect_path"]
    rebuild = kwargs["rebuild_trie"]

    # load botok config
    if dialect_name:
        config = Config(dialect_name=dialect_name)
        save_config(config.dialect_pack_path)
    elif dialect_path:
        config = Config.from_path(dialect_path)
        save_config(config.dialect_pack_path)
    else:
        # No dialect requested: reuse the saved configuration if there is
        # one, otherwise fall back to a default Config and save it.
        pybo_config = load_config()
        if not pybo_config:
            config = Config()
            save_config(config.dialect_pack_path)
        else:
            config = Config.from_path(pybo_config["dialect_pack_path"])

    print(
        f"[INFO] Using `{config.dialect_pack_path.name}` dialect pack for tokenization ..."
    )

    wt = WordTokenizer(config=config, build_trie=rebuild)

    def pybo_tok(in_str):
        return wt.tokenize(in_str)

    # Select and Order the tags
    if kwargs["tags"]:
        pybo_mod.__defaults__ = (list(kwargs["tags"]), )

    # Work out which files to process and where the results go by default.
    if input_dir.is_dir():
        in_files = input_dir.glob("*.txt")
        default_output_dir = input_dir.parent / (input_dir.name + "_tok")
    elif input_dir.is_file():
        in_files = [input_dir]
        default_output_dir = input_dir.parent / (input_dir.stem + "_tok")
    else:
        print("[INFO] Invalid input directory or file!!!")
        return

    if kwargs["o"] is not None:
        output_dir = Path(kwargs["o"])
    else:
        output_dir = default_output_dir
    # Create the directory in every case: previously a user-supplied `o`
    # that did not exist yet was never created.
    output_dir.mkdir(parents=True, exist_ok=True)

    for f in in_files:
        out_file = output_dir / (f.stem + "_tok.txt")
        text = Text(f, out_file)
        text.custom_pipeline(pybo_prep, pybo_tok, pybo_mod, pybo_form)
Ejemplo n.º 2
0
def test_multiple_words_per_entry():
    """Looking up a single form must list both of its expected senses."""
    pack = Config.from_path("./tests/data/trie_dialect_pack")
    trie = Trie(BoSyl, "POS", pack.dictionary, pack.adjustments)

    found = trie.has_word(syls("ལྟར་"))

    # sense coming from the affixed verb
    verb_sense = {"lemma": "ལྟ་", "pos": "VERB", "freq": 123, "affixed": True}
    assert verb_sense in found["data"]["senses"]

    # sense coming from the unaffixed adverb
    adverb_sense = {"lemma": "ལྟར་", "pos": "ADV", "freq": 456, "affixed": False}
    assert adverb_sense in found["data"]["senses"]
Ejemplo n.º 3
0
def test_empty_config():
    """A config loaded from the empty dialect pack has falsy dictionary and adjustments."""
    empty_pack = Config.from_path("./tests/data/empty_dialect_pack")

    for section in ("dictionary", "adjustments"):
        assert not getattr(empty_pack, section)
Ejemplo n.º 4
0
from pathlib import Path

from botok import WordTokenizer, Text, Config

###########################################
# Demo 1: tokenize a sample string with a WordTokenizer built with no arguments.
in_str = "ལེ གས། བཀྲ་ཤིས་མཐའི་ ༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"
WT = WordTokenizer()
# NOTE: this result is not printed; `tokens` is rebound a few lines below.
tokens = WT.tokenize(in_str)

# Demo 2: run the same short string through Text twice, once with a Config
# built with no arguments and once with a Config loaded from the test
# dialect pack, then print both results for comparison.
in_str = "ལ་པོ་ལ་པོ་ལ་པོ་"
t = Text(in_str, tok_params={"config": Config()})
# `tokenize_words_raw_text` is read as an attribute (no call) — presumably a
# property on Text; confirm against botok.
tokens = t.tokenize_words_raw_text
tt = Text(
    in_str,
    tok_params={"config": Config.from_path("./tests/data/trie_dialect_pack")},
)
ttokens = tt.tokenize_words_raw_text
print(tokens)
print(ttokens)
###########################################

#
# ### Extract token-string / POS pairs ########
#
# tagged = ['"{}"/{}'.format(w.text, w.pos) for w in tokens]
# print(', '.join(tagged))
#
#
# ### Extract the cleaned version of the tokens
#
# cleaned = [w.text_cleaned for w in tokens if w.text_cleaned]
Ejemplo n.º 5
0
def test_createtrie():
    """Walk a Trie through add(), inflected insertion, data attachment and deactivation."""
    pack = Config.from_path("./tests/data/trie_dialect_pack")
    trie = Trie(BoSyl, "empty", pack.dictionary, pack.adjustments)

    def lookup(word):
        # every check below queries the trie with the syllables of a word
        return trie.has_word(syls(word))

    # add() works, but is not meant to be called directly: it stores the entry
    # without inflecting it, so the tokenizer won't behave as expected.
    # Only use it for words that can never be inflected, like case particles.
    trie.add(syls("གྲུབ་མཐའ་"), {"pos": "NOUN"})
    missing = {"exists": False, "data": {"_": {}}}
    assert lookup("གྲུབ་མཐའི་") == missing

    # inflect_n_modify_trie() is the way to add entries
    trie.inflect_n_modify_trie("གྲུབ་མཐའ་")
    genitive_affix = {"len": 2, "type": "gi", "aa": True}
    assert lookup("གྲུབ་མཐའི་") == {
        "exists": True,
        "data": {"_": {}, "affixation": genitive_affix},
    }

    trie.inflect_n_modify_trie("ཀ་ར་", skrt=True)
    # "skrt" and "senses" show up in the data because skrt was True
    with_skrt = {
        "_": {},
        "affixation": {"len": 1, "type": "la", "aa": False},
        "skrt": True,
        "senses": [{"lemma": "", "affixed": True}],
    }
    assert lookup("ཀ་རར་") == {"exists": True, "data": with_skrt}

    # 'freq' is hard-coded in Trie, just as 'lemma' and 'pos' are
    trie.inflect_n_add_data("གྲུབ་མཐའ་\t\t\t\t532")
    # freq comes back as an int
    with_freq = {
        "_": {},
        "affixation": {"len": 2, "type": "gi", "aa": True},
        "senses": [{"freq": 532, "affixed": True}],
    }
    assert lookup("གྲུབ་མཐའི་") == {"exists": True, "data": with_freq}

    # like add(), deactivate() is not meant to be used directly;
    # prefer trie.inflect_n_modify_trie("word", deactivate=True)
    trie.deactivate(syls("ཀ་ར་"))
    # 'ཀ་ར་' has been deactivated, so it must no longer exist
    still_exists = lookup("ཀ་ར་")["exists"]
    assert still_exists is False
Ejemplo n.º 6
0
def empty_wt():
    """Build a WordTokenizer configured from the empty dialect pack."""
    return WordTokenizer(
        config=Config.from_path("./tests/data/empty_dialect_pack")
    )