def tok(**kwargs):
    """Tokenize one file, or every ``*.txt`` file of a directory, with botok.

    Keys read from ``kwargs``:
        input_dir: path of a directory or of a single file to tokenize.
        dialect_name: dialect pack name; used first when truthy.
        dialect_path: dialect pack path; used when ``dialect_name`` is falsy.
        rebuild_trie: forwarded to ``WordTokenizer`` as ``build_trie``.
        tags: when truthy, installed as the default arguments of ``pybo_mod``.
        o: output directory; when ``None`` a sibling directory named after
            the input with a ``_tok`` suffix is used.

    Every processed input ``<stem>.<ext>`` is written to
    ``<output_dir>/<stem>_tok.txt``. If ``input_dir`` is neither a directory
    nor a file, a message is printed and nothing is written.
    """
    input_dir = Path(kwargs["input_dir"])
    dialect_name = kwargs["dialect_name"]
    dialect_path = kwargs["dialect_path"]
    rebuild = kwargs["rebuild_trie"]

    # Load the botok config. An explicit dialect name wins over an explicit
    # path; otherwise fall back to what load_config() returns.
    # NOTE(review): save_config()/load_config() presumably persist and read
    # back the last used dialect pack path -- confirm against their definitions.
    if dialect_name:
        config = Config(dialect_name=dialect_name)
        save_config(config.dialect_pack_path)
    elif dialect_path:
        config = Config.from_path(dialect_path)
        save_config(config.dialect_pack_path)
    else:
        pybo_config = load_config()
        if not pybo_config:
            config = Config()
            save_config(config.dialect_pack_path)
        else:
            dialect_pack_path = pybo_config["dialect_pack_path"]
            config = Config.from_path(dialect_pack_path)

    print(
        f"[INFO] Using `{config.dialect_pack_path.name}` dialect pack for tokenization ..."
    )
    wt = WordTokenizer(config=config, build_trie=rebuild)

    def pybo_tok(in_str):
        return wt.tokenize(in_str)

    # Select and order the tags by overriding the default arguments of
    # pybo_mod (assumes pybo_mod has exactly one defaulted parameter).
    if kwargs["tags"]:
        pybo_mod.__defaults__ = (list(kwargs["tags"]),)

    is_directory = input_dir.is_dir()
    if not is_directory and not input_dir.is_file():
        print("[INFO] Invalid input directory or file!!!")
        return

    if kwargs["o"] is not None:
        output_dir = Path(kwargs["o"])
    else:
        # A directory keeps its full name, a file drops its extension.
        base_name = input_dir.name if is_directory else input_dir.stem
        output_dir = input_dir.parent / (base_name + "_tok")

    # parents=True: a nested output path whose parents are missing used to
    # raise FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    sources = input_dir.glob("*.txt") if is_directory else [input_dir]
    for source in sources:
        out_file = output_dir / (source.stem + "_tok.txt")
        text = Text(source, out_file)
        text.custom_pipeline(pybo_prep, pybo_tok, pybo_mod, pybo_form)
def test_multiple_words_per_entry():
    """Check that one trie entry can carry several senses.

    Looks up "ལྟར་" in a "POS"-profile trie built from the trie dialect pack
    and expects both an affixed VERB sense and a non-affixed ADV sense.
    """
    pack = Config.from_path("./tests/data/trie_dialect_pack")
    trie = Trie(BoSyl, "POS", pack.dictionary, pack.adjustments)

    senses = trie.has_word(syls("ལྟར་"))["data"]["senses"]

    expected_senses = (
        {"lemma": "ལྟ་", "pos": "VERB", "freq": 123, "affixed": True},
        {"lemma": "ལྟར་", "pos": "ADV", "freq": 456, "affixed": False},
    )
    for sense in expected_senses:
        assert sense in senses
def test_empty_config():
    """A config loaded from the empty dialect pack has no dictionary and no adjustments."""
    empty_pack = Config.from_path("./tests/data/empty_dialect_pack")
    # `or` keeps the original order: adjustments is only inspected when the
    # dictionary is already falsy.
    assert not (empty_pack.dictionary or empty_pack.adjustments)
from pathlib import Path

from botok import WordTokenizer, Text, Config

# ---------------------------------------------------------------------------
# 1. Tokenize a mixed sample with a default WordTokenizer.
# ---------------------------------------------------------------------------
in_str = "ལེ གས། བཀྲ་ཤིས་མཐའི་ ༆ ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"
WT = WordTokenizer()
tokens = WT.tokenize(in_str)

# ---------------------------------------------------------------------------
# 2. Tokenize the same short string through Text, once with the default
#    config and once with the dialect pack from the test data, then print
#    both results.
# ---------------------------------------------------------------------------
in_str = "ལ་པོ་ལ་པོ་ལ་པོ་"

default_params = {"config": Config()}
t = Text(in_str, tok_params=default_params)
tokens = t.tokenize_words_raw_text

custom_params = {"config": Config.from_path("./tests/data/trie_dialect_pack")}
tt = Text(in_str, tok_params=custom_params)
ttokens = tt.tokenize_words_raw_text

print(tokens)
print(ttokens)

# ---------------------------------------------------------------------------
# Disabled examples, kept for reference.
#
# Extract token-string / POS pairs:
#     tagged = ['"{}"/{}'.format(w.text, w.pos) for w in tokens]
#     print(', '.join(tagged))
#
# Extract the cleaned version of the tokens:
#     cleaned = [w.text_cleaned for w in tokens if w.text_cleaned]
# ---------------------------------------------------------------------------
def test_createtrie():
    """Walk through the ways of populating and deactivating entries in a Trie.

    Starts from an "empty"-profile trie built from the trie dialect pack and
    checks, step by step, what ``has_word`` reports after ``add``,
    ``inflect_n_modify_trie``, ``inflect_n_add_data`` and ``deactivate``.
    """
    pack = Config.from_path("./tests/data/trie_dialect_pack")
    trie = Trie(BoSyl, "empty", pack.dictionary, pack.adjustments)

    def lookup(word):
        # Shorthand: syllabify the word and query the trie.
        return trie.has_word(syls(word))

    def found(**extra):
        # Expected has_word() result for an existing entry carrying `extra` data.
        data = {"_": {}}
        data.update(extra)
        return {"exists": True, "data": data}

    # The trie works as expected, but add() should never be used directly:
    # it does not inflect entries, so the tokenizer won't work as expected.
    # Only use it with words that can't ever be inflected, like case particles.
    trie.add(syls("གྲུབ་མཐའ་"), {"pos": "NOUN"})
    assert lookup("གྲུབ་མཐའི་") == {"exists": False, "data": {"_": {}}}

    # Use inflect_n_modify_trie() instead to add entries.
    trie.inflect_n_modify_trie("གྲུབ་མཐའ་")
    assert lookup("གྲུབ་མཐའི་") == found(
        affixation={"len": 2, "type": "gi", "aa": True},
    )

    trie.inflect_n_modify_trie("ཀ་ར་", skrt=True)
    assert lookup("ཀ་རར་") == found(
        affixation={"len": 1, "type": "la", "aa": False},
        skrt=True,
        # The senses entry arrives here because skrt was True.
        senses=[{"lemma": "", "affixed": True}],
    )

    # 'freq' is hard-coded in Trie, just as 'lemma' and 'pos' are.
    trie.inflect_n_add_data("གྲུབ་མཐའ་\t\t\t\t532")
    assert lookup("གྲུབ་མཐའི་") == found(
        affixation={"len": 2, "type": "gi", "aa": True},
        # freq is an int
        senses=[{"freq": 532, "affixed": True}],
    )

    # Just like add(), deactivate() is not meant to be used directly;
    # instead, use bt.inflect_n_modify_trie("word", deactivate=True).
    trie.deactivate(syls("ཀ་ར་"))
    # 'ཀ་ར་' has been deactivated, so it must no longer exist.
    assert lookup("ཀ་ར་")["exists"] is False
def empty_wt():
    """Build a WordTokenizer configured from the empty dialect pack test data."""
    return WordTokenizer(
        config=Config.from_path("./tests/data/empty_dialect_pack")
    )