def test_merge(merger):
    """Merge three generated dictionaries and verify the result holds
    exactly the union of their keys, in sorted order.

    Uses a full list comparison rather than zip(): zip() truncates at the
    shorter sequence, so a merged dictionary with missing or extra keys
    could previously pass undetected.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        file_1 = path.join(tmp_dir, 'test_merger_1.kv')
        file_2 = path.join(tmp_dir, 'test_merger_2.kv')
        file_3 = path.join(tmp_dir, 'test_merger_3.kv')
        merge_file = path.join(tmp_dir, 'merge.kv')

        generate_keyvi(keys_1, file_1)
        generate_keyvi(keys_2, file_2)
        generate_keyvi(keys_3, file_3)

        merger.Add(file_1)
        merger.Add(file_2)
        merger.Add(file_3)
        merger.Merge(merge_file)

        merged_dictionary = Dictionary(merge_file)

        # expected result: sorted union of all input key sets
        keys_ordered = sorted(set(keys_1) | set(keys_2) | set(keys_3))

        # BUG FIX: compare complete sequences so a length mismatch fails
        # instead of being silently truncated by zip()
        assert keys_ordered == list(merged_dictionary.GetAllKeys())
    finally:
        shutil.rmtree(tmp_dir)
def test_manifest_for_merger():
    """The merger's own manifest must override the inputs' manifests."""
    try:
        c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c.Add("abc", '{"a" : 2}')
        c.Compile()
        c.SetManifest('{"author": "Zapp Brannigan"}')
        c.WriteToFile('manifest_json_merge1.kv')
        del c

        c2 = JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c2.Add("abd", '{"a" : 3}')
        c2.Compile()
        c2.SetManifest('{"author": "Leela"}')
        c2.WriteToFile('manifest_json_merge2.kv')
        del c2

        merger = JsonDictionaryMerger({"memory_limit_mb": "10"})
        # BUG FIX: the compiled dictionaries were never added to the merger,
        # so the merged output did not actually contain the inputs whose
        # manifests are supposed to be overridden
        merger.Add('manifest_json_merge1.kv')
        merger.Add('manifest_json_merge2.kv')
        merger.SetManifest('{"author": "Fry"}')
        merger.Merge('manifest_json_merged.kv')

        d = Dictionary('manifest_json_merged.kv')
        m = json.loads(d.GetManifest())
        assert m['author'] == "Fry"
        del d
    finally:
        os.remove('manifest_json_merge1.kv')
        os.remove('manifest_json_merge2.kv')
        os.remove('manifest_json_merged.kv')
def test_merge(merger):
    """Merge three generated key/value dictionaries and verify the result
    holds exactly the union of their items, in sorted key order.

    Uses a full list comparison rather than zip(): zip() truncates at the
    shorter sequence, so a merged dictionary with missing or extra entries
    could previously pass undetected.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        file_1 = path.join(tmp_dir, 'test_merger_1.kv')
        file_2 = path.join(tmp_dir, 'test_merger_2.kv')
        file_3 = path.join(tmp_dir, 'test_merger_3.kv')
        merge_file = path.join(tmp_dir, 'merge.kv')

        generate_keyvi(key_values_1, file_1)
        generate_keyvi(key_values_2, file_2)
        generate_keyvi(key_values_3, file_3)

        merger.Add(file_1)
        merger.Add(file_2)
        merger.Add(file_3)
        merger.Merge(merge_file)

        merged_dictionary = Dictionary(merge_file)

        # later inputs win on duplicate keys, matching the update order
        key_values = {}
        key_values.update(key_values_1)
        key_values.update(key_values_2)
        key_values.update(key_values_3)

        # BUG FIX: compare complete item sequences so a length mismatch
        # fails instead of being silently truncated by zip()
        assert sorted(key_values.items()) == list(merged_dictionary.GetAllItems())
    finally:
        shutil.rmtree(tmp_dir)
def test_truncated_file_json():
    """Loading a dictionary from a truncated file must raise ValueError."""
    c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add('a', '{1:2}')
    c.Add('b', '{2:4}')
    c.Add('c', '{4:4}')
    c.Add('d', '{2:3}')
    c.Compile()

    good_file = os.path.join(tmp_dir, 'truncation_test.kv')
    half_file = os.path.join(tmp_dir, 'truncation_test1.kv')
    near_file = os.path.join(tmp_dir, 'truncation_test2.kv')
    c.WriteToFile(good_file)
    size = os.path.getsize(good_file)

    try:
        # context managers close every descriptor even on failure
        # (the original never closed fd_in at all)
        with open(good_file, 'rb') as fd_in:
            data = fd_in.read()
        with open(half_file, 'wb') as fd:
            fd.write(data[:size // 2])
        with open(near_file, 'wb') as fd2:
            # BUG FIX: the original kept reading from the middle of the
            # stream, so the second sample was the file's *tail*, not a
            # file truncated 2 bytes short as intended
            fd2.write(data[:size - 2])

        with pytest.raises(ValueError):
            Dictionary(half_file)
        with pytest.raises(ValueError):
            Dictionary(near_file)
    finally:
        # clean up even when an assertion above fails
        for file_name in (near_file, half_file, good_file):
            if os.path.exists(file_name):
                os.remove(file_name)
def dump(args):
    """Dump every item of a keyvi dictionary to a plain-text file.

    Each output line is either "key" or "key<TAB>value". When
    args.json_dumps is set, key and value are JSON-encoded first.
    """
    dictionary = Dictionary(args.input_file)
    with open(args.output_file, 'w') as out:
        for key, value in dictionary.GetAllItems():
            if args.json_dumps:
                key = json.dumps(key)
            if isinstance(key, bytes):
                # keys may come back as bytes; emit text
                key = key.decode()
            line = key
            if value:
                if args.json_dumps:
                    value = json.dumps(value)
                line += '\t{}'.format(value)
            out.write(line + '\n')
def test_manifest_after_compile():
    """A manifest set after Compile() must still be written to the file."""
    compiler = KeyOnlyDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("Leela")
    compiler.Add("Kif")
    compiler.Compile()
    compiler.SetManifest('{"author": "Zapp Brannigan"}')

    file_name = os.path.join(tempfile.gettempdir(), 'brannigan_manifest2.kv')
    try:
        compiler.WriteToFile(file_name)
        d = Dictionary(file_name)
        assert json.loads(d.GetManifest())['author'] == "Zapp Brannigan"
        # drop the dictionary before removing its backing file
        del d
    finally:
        os.remove(file_name)
def test_invalid_filemagic():
    """A file lacking the keyvi file magic must be rejected with ValueError."""
    broken_file = os.path.join(tmp_dir, 'broken_file')
    # context manager replaces the manual open/write/close triple
    with open(broken_file, 'w') as fd:
        fd.write('dead beef')
    try:
        with pytest.raises(ValueError):
            Dictionary(broken_file)
    finally:
        # removed the unused leftover `exception_caught = False` flag and
        # made cleanup run even if the assertion fails
        os.remove(broken_file)
def tmp_dictionary(compiler, file_name):
    """Compile *compiler* into a temp file, yield it as a Dictionary, clean up.

    Generator helper for pytest fixtures.

    BUG FIX: cleanup is wrapped in try/finally so the temp file is removed
    even when the consuming test raises (previously the code after `yield`
    never ran in that case, leaking the file).
    """
    fq_file_name = os.path.join(tempfile.gettempdir(), file_name)
    compiler.Compile()
    compiler.WriteToFile(fq_file_name)
    del compiler
    d = Dictionary(fq_file_name)
    try:
        yield d
    finally:
        # release the dictionary before deleting its backing file
        del d
        os.remove(fq_file_name)
from keyvi.dictionary import Dictionary

query = ""
d = Dictionary("your-own.kv")


def get_lookup_key(query):
    """Identity transform; hook for custom query-to-key mapping."""
    return query


while query != "exit":
    # BUG FIX: raw_input() exists only on Python 2; input() matches the
    # other example scripts in this repository
    query = input("Query:")
    for m in d.Get(get_lookup_key(query.strip())):
        print("{} {}".format(m.GetMatchedString(), m.GetValueAsString()))
from keyvi.dictionary import Dictionary
from keyvi.completion import PrefixCompletion

query = ""
d = Dictionary("prefix-completion.kv")
c = PrefixCompletion(d)


def get_lookup_key(query):
    """Identity transform; hook for custom query-to-key mapping."""
    return query


while query != "exit":
    # input() already returns str on Python 3; the str() wrapper was redundant
    query = input("Query:")
    # fuzzy prefix completions allowing up to 3 edits
    for m in c.GetFuzzyCompletions(get_lookup_key(query.strip()), 3):
        print("{} {}".format(m.GetMatchedString(), m.GetAttribute("weight")))
def stats(input_file):
    """Pretty-print a keyvi dictionary's statistics as sorted JSON."""
    statistics = Dictionary(input_file).GetStatistics()
    print(json.dumps(statistics, indent=4, sort_keys=True))
from keyvi.dictionary import Dictionary
from keyvi.completion import MultiWordCompletion

# separator between tokens of a multi-word completion key
MULTIWORD_QUERY_SEPARATOR = '\x1b'

query = ""
d = Dictionary("mw-completion.kv")
c = MultiWordCompletion(d)


def get_lookup_key(query):
    """Sort all tokens except the last (bag-of-words), so that word order
    in the typed part of the query does not matter."""
    tokens = query.split(" ")
    return " ".join(sorted(tokens[:-1]) + tokens[-1:])


while query != "exit":
    # input() already returns str on Python 3; the str() wrapper was redundant
    query = input("Query:")
    for m in c.GetCompletions(get_lookup_key(query.strip())):
        print("{} {}".format(m.GetMatchedString(), m.GetAttribute("weight")))
import sys
from keyvi.dictionary import Dictionary
from keyvi.util import FsaTransform

# normalize each stdin line through the FSA stored in normalization.kv
normalizer = FsaTransform(Dictionary("normalization.kv"))
for line in sys.stdin:
    print(normalizer.Normalize(line))
from keyvi.dictionary import Dictionary

query = ""
d = Dictionary("cities.kv")


def get_lookup_key(query):
    """Identity transform; hook for custom query-to-key mapping."""
    return query


while query != "exit":
    # BUG FIX: raw_input() exists only on Python 2; input() matches the
    # other example scripts in this repository
    query = input("Query:")
    for m in d.LookupText(get_lookup_key(query.strip())):
        print("{}".format(m.GetMatchedString()))
def test_non_existing_file():
    """Opening a dictionary from a missing file must raise ValueError."""
    file_name = os.path.join(tmp_dir, 'non_existing_file')
    # BUG FIX: the existence pre-check previously tested the bare relative
    # name 'non_existing_file' in the CWD, not the tmp_dir path actually
    # passed to Dictionary; also use idiomatic `not` instead of `== False`
    assert not os.path.exists(file_name)
    with pytest.raises(ValueError):
        Dictionary(file_name)
def mem():
    """Load the sample word file as a Dictionary and report the object's type."""
    dictionary = Dictionary("_temp/kv/g_word_utf8.txt")
    print(type(dictionary))