def test_config():
    """Check the default config filename and the trie data paths of the "POS" profile.

    Three lookups through ``Config.get_tok_data_paths`` are exercised: the
    plain profile, the profile combined with a ``modifs`` folder, and the
    ``mode="custom"`` call that receives the folder instead of a profile name.
    """
    config = Config()

    # config.filename is a Path object; its default name is "botok.yaml"
    assert config.filename.name == "botok.yaml"

    # paths for trie content: each profile contains one or more sections
    main, custom = config.get_tok_data_paths("POS")
    assert list(main) == ["words", "words_non_inflected"]
    # each element is a Path object leading to a resource file
    assert isinstance(main["words"][0], Path)

    # custom files to overwrite the existing trie are supplied through `modifs`
    modif_path = "trie_data/"
    assert len(custom) == 0
    main, custom = config.get_tok_data_paths("POS", modifs=modif_path)
    assert sorted(("words", "words_skrt")) == sorted(custom)

    # the modification folder itself is expected to hold these four entries
    folder_entries = sorted(entry.name for entry in Path(modif_path).glob("*"))
    assert sorted(("adjustment", "remove", "words", "words_skrt")) == folder_entries

    # overwriting the main profile
    main, custom = config.get_tok_data_paths(modif_path, mode="custom")
    assert sorted(("words", "words_non_inflected", "words_skrt")) == sorted(main)
def tok(**kwargs):
    """Tokenize a text file, or every ``*.txt`` file of a directory, with botok.

    Keyword Args:
        input_dir: path of the directory or single file to tokenize.
        dialect_name: if truthy, the config is built with ``Config(dialect_name=...)``.
        dialect_path: otherwise, if truthy, the config is built with
            ``Config.from_path(...)``.
        rebuild_trie: forwarded to ``WordTokenizer`` as ``build_trie``.
        tags: if truthy, replaces ``pybo_mod.__defaults__`` with a one-element
            tuple holding ``list(tags)``.
        o: output directory, or ``None`` to derive ``<input>_tok`` next to the input.

    When neither dialect option is given, the config stored by a previous run
    (``load_config()``) is reused; if there is none, the default ``Config()``
    is used. Every freshly chosen dialect pack path is passed to ``save_config``.
    """
    source = Path(kwargs["input_dir"])
    dialect_name = kwargs["dialect_name"]
    dialect_path = kwargs["dialect_path"]
    rebuild = kwargs["rebuild_trie"]

    # load botok config; only a config read back from load_config() is not re-saved
    needs_saving = True
    if dialect_name:
        config = Config(dialect_name=dialect_name)
    elif dialect_path:
        config = Config.from_path(dialect_path)
    else:
        stored = load_config()
        if stored:
            config = Config.from_path(stored["dialect_pack_path"])
            needs_saving = False
        else:
            config = Config()
    if needs_saving:
        save_config(config.dialect_pack_path)

    print(
        f"[INFO] Using `{config.dialect_pack_path.name}` dialect pack for tokenization ..."
    )
    tokenizer = WordTokenizer(config=config, build_trie=rebuild)

    def pybo_tok(in_str):
        return tokenizer.tokenize(in_str)

    # Select and Order the tags
    if kwargs["tags"]:
        pybo_mod.__defaults__ = (list(kwargs["tags"]),)

    from_directory = source.is_dir()
    if not from_directory and not source.is_file():
        print("[INFO] Invalid input directory or file!!!")
        return

    requested_output = kwargs["o"]
    if requested_output is not None:
        output_dir = Path(requested_output)
    else:
        # a directory keeps its full name, a file contributes only its stem
        base_name = source.name if from_directory else source.stem
        output_dir = source.parent / (base_name + "_tok")
    output_dir.mkdir(exist_ok=True)

    # the glob is created only after the output directory exists, as before
    in_files = source.glob("*.txt") if from_directory else [source]
    for in_file in in_files:
        out_file = output_dir / (in_file.stem + "_tok.txt")
        Text(in_file, out_file).custom_pipeline(
            pybo_prep, pybo_tok, pybo_mod, pybo_form
        )
def test_reset(base_path):
    """``Config.reset()`` switches the dialect pack path back to ``general``."""
    pack_name = "kangyur"
    config = Config(dialect_name=pack_name)

    # before the reset the config points at the requested pack
    path_before_reset = config.dialect_pack_path
    assert path_before_reset == base_path / pack_name

    config.reset()

    # after the reset it points at the "general" pack
    path_after_reset = config.dialect_pack_path
    assert path_after_reset == base_path / "general"
def test_add_dialect_pack():
    """Adding a dialect pack must change both ``dictionary`` and ``adjustments``."""
    config = Config()

    # deep copies taken up front, so later in-place mutation cannot hide a change
    snapshots = {
        attr: copy.deepcopy(getattr(config, attr))
        for attr in ("dictionary", "adjustments")
    }

    config.add_dialect_pack(Path("./tests/data/trie_dialect_pack"))

    for attr, previous in snapshots.items():
        assert getattr(config, attr) != previous
def test_syl_tokenize():
    """Tokenize a mixed Tibetan/Latin string and compare the token texts."""
    instr = " མཐའི་རྒྱ་མཚོའི་གླིང་། ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་ཀཀ"
    expected = [
        " མཐའི་",
        "རྒྱ་མཚོའི་",
        "གླིང་",
        "། ",
        "ཤི་",
        "བཀྲ་ཤིས་ ",
        "tr ",
        "བདེ་་ལེ གས",
        "། ",
        "བཀྲ་ཤིས་",
        "བདེ་ལེགས་",
        "ཀཀ",
    ]

    # chunk the input and prepare its syllables for trie lookup
    chunks = TokChunks(instr)
    chunks.serve_syls_to_trie()

    # tokenizer backed by a trie built from the default config
    config = Config()
    tokenizer = Tokenize(
        Trie(BoSyl, config.profile, config.dictionary, config.adjustments)
    )

    texts = [token.text for token in tokenizer.tokenize(chunks)]

    # note kept from the original author:
    # current: [' མཐའི་', 'རྒྱ་མཚོའི་', '། ', 'གླིང་', 'བཀྲ་', 'ཤི་', 'tr ', 'ཤིས་ ', 'བདེ་་ལེ གས', '། ', 'བདེ་',
    # 'བཀྲ་ཤིས་', 'ཀཀ', 'ལེགས་']
    assert texts == expected
def test_multiple_words_per_entry():
    """One trie entry may carry several senses; both must be present."""
    config = Config.from_path("./tests/data/trie_dialect_pack")
    trie = Trie(BoSyl, "POS", config.dictionary, config.adjustments)

    res = trie.has_word(syls("ལྟར་"))

    wanted_senses = (
        {"lemma": "ལྟ་", "pos": "VERB", "freq": 123, "affixed": True},
        {"lemma": "ལྟར་", "pos": "ADV", "freq": 456, "affixed": False},
    )
    for sense in wanted_senses:
        assert sense in res["data"]["senses"]
def test_defaults(base_path):
    """A default ``Config`` uses the ``general`` pack and references ``.tsv`` data."""
    config = Config()

    # default dialect pack path
    assert config.dialect_pack_path == base_path / "general"
    assert config.dialect_pack_path.is_dir()

    # Trie data should be .tsv files
    for section in ("words", "rules"):
        assert section in config.dictionary
        assert all(path.suffix == ".tsv" for path in config.dictionary[section])

    # Segmentation adjustment: entries without any suffix are tolerated,
    # everything else has to be a .tsv file
    for section in ("remove", "rules", "words", "words_skrt"):
        assert section in config.adjustments
        assert all(
            path.suffix == ".tsv"
            for path in config.adjustments[section]
            if path.suffix
        )
def test_empty_config():
    """A config loaded from the empty dialect pack has no dictionary and no adjustments."""
    empty_pack = "./tests/data/empty_dialect_pack"
    config = Config.from_path(empty_pack)

    # both attributes must be falsy; `adjustments` is only read when `dictionary` is
    assert not (config.dictionary or config.adjustments)
def test_custome_dialect_pack(base_path):
    """Requesting the ``kangyur`` dialect yields an existing pack directory under ``base_path``."""
    pack_name = "kangyur"
    pack_path = Config(dialect_name=pack_name).dialect_pack_path

    assert pack_path == base_path / pack_name
    assert pack_path.is_dir()
from pathlib import Path
from botok import WordTokenizer, Text, Config

###########################################
# Usage sample: tokenize one string with a default WordTokenizer, then run a
# second, shorter string through Text twice -- once with the default Config
# and once with the Config loaded from the test dialect pack -- and print
# both results.

# sample input passed to WordTokenizer.tokenize()
in_str = "ལེ གས། བཀྲ་ཤིས་མཐའི་ ༆ ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"
WT = WordTokenizer()
tokens = WT.tokenize(in_str)

# from here on `in_str` and `tokens` are rebound: the result above is discarded
in_str = "ལ་པོ་ལ་པོ་ལ་པོ་"
t = Text(in_str, tok_params={"config": Config()})
tokens = t.tokenize_words_raw_text

# same input, but with the config read from the test dialect pack
tt = Text(
    in_str,
    tok_params={"config": Config.from_path("./tests/data/trie_dialect_pack")},
)
ttokens = tt.tokenize_words_raw_text

# show both outputs so they can be compared
print(tokens)
print(ttokens)
###########################################
# Disabled examples kept for reference:
#
# ### Extract token-string / POS pairs ########
#
# tagged = ['"{}"/{}'.format(w.text, w.pos) for w in tokens]
# print(', '.join(tagged))
#
#
# ### Extract the cleaned version of the tokens
#
# cleaned = [w.text_cleaned for w in tokens if w.text_cleaned]
def test_adj_config():
    """Check the file names returned by ``get_adj_data_paths`` for the "basic" profile."""
    config = Config()
    main, custom = config.get_adj_data_paths("basic", modifs="trie_data")

    # the file names of each group are concatenated, so a single file is expected in each
    main_names = "".join(path.name for path in main)
    assert "rdr_basis.tsv" == main_names

    custom_names = "".join(path.name for path in custom)
    assert "test.tsv" == custom_names
def test_createtrie():
    """Walk through adding, inflecting, annotating and deactivating trie entries."""
    config = Config.from_path("./tests/data/trie_dialect_pack")
    trie = Trie(BoSyl, "empty", config.dictionary, config.adjustments)

    def lookup(word):
        # syllabify the word, then query the trie
        return trie.has_word(syls(word))

    # the trie works as expected. but the add() method should never be used directly:
    # it does not inflect entries, so the tokenizer won't work as expected.
    # be careful only to use it with words that can't ever be inflected, like case particles.
    trie.add(syls("གྲུབ་མཐའ་"), {"pos": "NOUN"})
    not_found = {"exists": False, "data": {"_": {}}}
    assert lookup("གྲུབ་མཐའི་") == not_found

    # use inflect_n_modify_trie() instead, to add entries
    trie.inflect_n_modify_trie("གྲུབ་མཐའ་")
    gi_affixation = {"len": 2, "type": "gi", "aa": True}
    assert lookup("གྲུབ་མཐའི་") == {
        "exists": True,
        "data": {"_": {}, "affixation": gi_affixation},
    }

    trie.inflect_n_modify_trie("ཀ་ར་", skrt=True)
    assert lookup("ཀ་རར་") == {
        "exists": True,
        "data": {
            "_": {},
            "affixation": {"len": 1, "type": "la", "aa": False},
            "skrt": True,
            # arrives here because skrt was True
            "senses": [{"lemma": "", "affixed": True}],
        },
    }

    # 'freq' is hard-coded in Trie, just as 'lemma' and 'pos' are
    trie.inflect_n_add_data("གྲུབ་མཐའ་\t\t\t\t532")
    assert lookup("གྲུབ་མཐའི་") == {
        "exists": True,
        "data": {
            "_": {},
            "affixation": gi_affixation,
            # freq is an int
            "senses": [{"freq": 532, "affixed": True}],
        },
    }

    # just like add() was not meant to be used directly, deactivate() is not
    # instead, use bt.inflect_n_modify_trie("word", deactivate=True)
    trie.deactivate(syls("ཀ་ར་"))
    # since 'ཀ་ར་' has been deactivated
    assert lookup("ཀ་ར་")["exists"] is False
# coding: utf8 from collections import defaultdict from pathlib import Path from botok import BoSyl, Config, TokChunks, Trie config = Config() def syls(string): return TokChunks(string).get_syls() def test_createtrie(): profile = "empty" config = Config.from_path("./tests/data/trie_dialect_pack") bt = Trie(BoSyl, profile, config.dictionary, config.adjustments) # the trie works as expected. but the add() method should never be used directly: # it does not inflect entries, so the tokenizer won't work as expected. # be careful only to use it with words that can't ever be inflected, like case particles. bt.add(syls("གྲུབ་མཐའ་"), {"pos": "NOUN"}) assert bt.has_word(syls("གྲུབ་མཐའི་")) == { "exists": False, "data": { "_": {} } } # use inflect_n_modify_trie() instead, to add entries bt.inflect_n_modify_trie("གྲུབ་མཐའ་")
def empty_wt():
    """Build a word tokenizer configured from the empty test dialect pack."""
    empty_pack = "./tests/data/empty_dialect_pack"
    return WordTokenizer(config=Config.from_path(empty_pack))