def test_manifest_for_merger(): try: c = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"}) c.Add("abc", '{"a" : 2}') c.Compile() c.SetManifest({"author": "Zapp Brannigan"}) c.WriteToFile('manifest_json_merge1.kv') del c c2 = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"}) c2.Add("abd", '{"a" : 3}') c2.Compile() c2.SetManifest({"author": "Leela"}) c2.WriteToFile('manifest_json_merge2.kv') del c2 merger = pykeyvi.JsonDictionaryMerger({"memory_limit_mb": "10"}) merger.SetManifest({"author": "Fry"}) merger.Merge('manifest_json_merged.kv') d = pykeyvi.Dictionary('manifest_json_merged.kv') m = d.GetManifest() assert m['author'] == "Fry" del d finally: os.remove('manifest_json_merge1.kv') os.remove('manifest_json_merge2.kv') os.remove('manifest_json_merged.kv')
def test_float_compaction():
    """Single-precision float storage must produce a smaller value store than double."""
    single = pykeyvi.JsonDictionaryCompiler({
        "memory_limit_mb": "10",
        'floating_point_precision': 'single'
    })
    double = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    # feed the identical list of floats into both compilers
    float_json = '[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]'
    single.Add('aa', float_json)
    double.Add('aa', float_json)
    with tmp_dictionary(single, 'json_single_precision_float.kv') as ds:
        with tmp_dictionary(double, 'json_double_precision_float.kv') as dd:
            # sanity: one entry each
            assert len(ds) == 1
            assert len(dd) == 1
            # the single-precision value store must be strictly smaller
            single_size = int(ds.GetStatistics()['Value Store']['size'])
            double_size = int(dd.GetStatistics()['Value Store']['size'])
            assert single_size < double_size
def generate_dictionary_compiler():
    """Return a JsonDictionaryCompiler pre-filled from the module-level key_values."""
    compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    for key, value in key_values:
        compiler.Add(key, json.dumps(value))
    return compiler
def test_near_greedy():
    """Greedy GetNear yields the non-greedy matches first, then extra ones."""
    entries = [
        ("zahnarzt:u0we9yykdyum", '["a" : 2]'),
        ("zahnarzt:u1h2fde2kct3", '["a" : 3]'),
        ("zahnarzt:u1huf1q5cnxn", '["a" : 4]'),
        ("zahnarzt:u0y2dvey61sw", '["a" : 5]'),
        ("zahnarzt:u1hvqmmj801r", '["a" : 6]'),
        ("zahnarzt:u0vvmknrwgmj", '["a" : 7]'),
        ("zahnarzt:u0ypv22fb9q3", '["a" : 8]'),
        ("zahnarzt:u1qcvvw0hxe1", '["a" : 9]'),
        ("zahnarzt:u1xjx6yfvfz2", '["a" : 10]'),
        ("zahnarzt:u1q0gkqsenhf", '["a" : 11]'),
    ]
    compiler = pykeyvi.JsonDictionaryCompiler()
    for key, value in entries:
        compiler.Add(key, value)
    with tmp_dictionary(compiler, 'near_greedy.kv') as d:
        def greedy_count(lookup_key, minimum_prefix):
            return len(list(d.GetNear(lookup_key, minimum_prefix, True)))

        assert greedy_count("zahnarzt:u1q0gkqsenhf", 12) == 2
        assert greedy_count("zahnarzt:u1h0gkqsenhf", 12) == 3
        assert greedy_count("zahnarzt:u1h0gkqsenhf", 13) == 0
        assert greedy_count("zahnarzt:u0h0gkqsenhf", 10) == 10

        greedy = [m.GetMatchedString()
                  for m in d.GetNear("zahnarzt:u0h0gkqsenhf", 10, True)]
        non_greedy = [m.GetMatchedString()
                      for m in d.GetNear("zahnarzt:u0h0gkqsenhf", 10, False)]
        # the greedy result must start with exactly the non-greedy result set
        assert greedy[:len(non_greedy)] == non_greedy
def compile(args):
    """Compile a keyvi dictionary of the requested type from a TSV input file.

    Expects on ``args``: ``compiler_params`` (iterable of key/value pairs),
    ``dict_type`` (one of json/string/int/completion/key-only),
    ``input_file`` (tab-separated key[\\t]value lines) and ``output_file``.
    Unparseable lines are reported and skipped.
    """
    params = {key: value for key, value in args.compiler_params}
    dict_type = args.dict_type

    # dispatch table instead of an if/elif chain
    factories = {
        'json': pykeyvi.JsonDictionaryCompiler,
        'string': pykeyvi.StringDictionaryCompiler,
        'int': pykeyvi.IntDictionaryCompiler,
        'completion': pykeyvi.CompletionDictionaryCompiler,
        'key-only': pykeyvi.KeyOnlyDictionaryCompiler,
    }
    factory = factories.get(dict_type)
    if factory is None:
        # argument parsing upstream restricts dict_type, so this is unreachable
        return 'Must never reach here'
    dictionary = factory(params)

    with open(args.input_file) as file_in:
        for line in file_in:
            line = line.rstrip('\n')
            try:
                splits = line.split('\t')
                if dict_type == 'key-only':
                    dictionary.Add(splits[0])
                elif dict_type in ('int', 'completion'):
                    dictionary.Add(splits[0], int(splits[1]))
                else:
                    dictionary.Add(splits[0], splits[1])
            # BUG fix: bare `except:` also swallowed KeyboardInterrupt /
            # SystemExit; catch only the parse failures we expect.
            except (IndexError, ValueError):
                print('Can not parse line: {}'.format(line))

    dictionary.Compile()
    dictionary.WriteToFile(args.output_file)
def test_truncated_file_json():
    """A dictionary file cut in half must fail to load with ValueError."""
    c = pykeyvi.JsonDictionaryCompiler()
    c.Add('a', '{1:2}')
    c.Add('b', '{2:4}')
    c.Add('c', '{4:4}')
    c.Add('d', '{2:3}')
    c.Compile()
    c.WriteToFile('truncation_test.kv')

    size = os.path.getsize('truncation_test.kv')
    # BUG fix: the .kv file is binary -- read/write in binary mode, truncate
    # at an integer offset (size / 2 is a float on Python 3), and close both
    # handles (the input handle leaked before).
    with open('truncation_test.kv', 'rb') as fd_in:
        with open('truncation_test1.kv', 'wb') as fd:
            fd.write(fd_in.read(size // 2))

    exception_caught = False
    try:
        d = pykeyvi.Dictionary('truncation_test1.kv')
    except ValueError:
        exception_caught = True
    assert exception_caught

    os.remove('truncation_test1.kv')
    os.remove('truncation_test.kv')
def test_truncated_file_json():
    """A dictionary file cut in half must fail to load with ValueError."""
    full_path = os.path.join(tmp_dir, 'truncation_test.kv')
    truncated_path = os.path.join(tmp_dir, 'truncation_test1.kv')

    compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    for key, value in [('a', '{1:2}'), ('b', '{2:4}'),
                       ('c', '{4:4}'), ('d', '{2:3}')]:
        compiler.Add(key, value)
    compiler.Compile()
    compiler.WriteToFile(full_path)

    # copy only the first half of the compiled file
    half = int(os.path.getsize(full_path) / 2)
    fd_in = open(full_path, 'rb')
    fd = open(truncated_path, 'wb')
    fd.write(fd_in.read(half))
    fd.close()

    exception_caught = False
    try:
        d = pykeyvi.Dictionary(truncated_path)
    except ValueError:
        exception_caught = True
    assert exception_caught

    os.remove(truncated_path)
    os.remove(full_path)
def generate_dictionary_compiler():
    """Build and return a JsonDictionaryCompiler filled from the module-level key_values."""
    compiler = pykeyvi.JsonDictionaryCompiler()
    for key, value in key_values:
        compiler.Add(key, json.dumps(value))
    return compiler
def test_near_less_precission():
    """GetNear with a lookup key longer than the stored keys."""
    compiler = pykeyvi.JsonDictionaryCompiler()
    for key, value in [("zahnarzt:u0we9", '["a" : 2]'),
                       ("zahnarzt:u1h2f", '["a" : 3]'),
                       ("zahnarzt:u1huf", '["a" : 4]')]:
        compiler.Add(key, value)
    with tmp_dictionary(compiler, 'near_less_precission.kv') as d:
        assert len(list(d.GetNear("zahnarzt:u1h0gkqsenhf", 12))) == 2
        assert len(list(d.GetNear("zahnarzt:u1h0gkqsenhf", 13))) == 0
def run_compile(tmpdir):
    """Compile with an explicit temporary_path and check scratch files appear in test_dir."""
    compiler = pykeyvi.JsonDictionaryCompiler({
        "memory_limit_mb": "10",
        "temporary_path": tmpdir
    })
    compiler.Add("abc", "{'a':2}")
    compiler.Compile()
    # compiling must have left temporary files in the configured directory
    assert len(os.listdir(test_dir)) > 0
def test_near_broken_input():
    """GetNear with lookup keys shorter than the minimum prefix length."""
    compiler = pykeyvi.JsonDictionaryCompiler()
    for key, value in [("zahnarzt:u0we9", '["a" : 2]'),
                       ("zahnarzt:u1h2f", '["a" : 3]'),
                       ("zahnarzt:u1huf", '["a" : 4]')]:
        compiler.Add(key, value)
    with tmp_dictionary(compiler, 'near_broken.kv') as d:
        assert len(list(d.GetNear("zahnarzt:u1h", 12))) == 2
        assert len(list(d.GetNear("zahnarzt:u", 13))) == 0
        assert len(list(d.GetNear("zahnarzt:u1", 12))) == 0
def test_get_value():
    """Match.GetValue parses the stored JSON back into Python objects."""
    compiler = pykeyvi.JsonDictionaryCompiler()
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    with tmp_dictionary(compiler, 'match_object_json.kv') as d:
        for key, expected in (("abc", {"a": 2}), ("abd", {"a": 3})):
            assert d[key].GetValue() == expected
def generate_keyvi(key_values, filename):
    """Compile a mapping of key -> JSON-serializable value into a keyvi file."""
    compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    for key, value in key_values.items():
        compiler.Add(key, json.dumps(value))
    compiler.Compile()
    compiler.WriteToFile(filename)
def test_get_value():
    """Match.GetValue round-trips the stored JSON values."""
    compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    with tmp_dictionary(compiler, 'match_object_json.kv') as d:
        for key, expected in (("abc", {"a": 2}), ("abd", {"a": 3})):
            match = d[key]
            assert decode_to_unicode(match.GetValue()) == decode_to_unicode(expected)
def test_raw_serialization():
    """A Match survives dumps()/loads() with its raw value intact."""
    compiler = pykeyvi.JsonDictionaryCompiler()
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    with tmp_dictionary(compiler, 'match_object_json.kv') as d:
        match = d["abc"]
        assert match.GetValueAsString() == '{"a":2}'
        # round-trip through the raw serialization
        blob = match.dumps()
        restored = pykeyvi.Match.loads(blob)
        assert restored.GetValueAsString() == '{"a":2}'
def test_raw_serialization():
    """A Match survives dumps()/loads() with its raw value intact."""
    compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    with tmp_dictionary(compiler, 'match_object_json.kv') as d:
        match = d["abc"]
        expected = decode_to_unicode('{"a":2}')
        assert decode_to_unicode(match.GetValueAsString()) == expected
        # round-trip through the raw serialization
        blob = match.dumps()
        restored = pykeyvi.Match.loads(blob)
        assert decode_to_unicode(restored.GetValueAsString()) == expected
def test_simple():
    """Add/__setitem__ plus lookup of the compact JSON representation."""
    compiler = pykeyvi.JsonDictionaryCompiler()
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    # __setitem__ behaves like Add (same key again, so no new entry)
    compiler["abd"] = '{"a" : 3}'
    with tmp_dictionary(compiler, 'simple_json.kv') as d:
        assert len(d) == 2
        for key, expected in (("abc", '{"a":2}'), ("abd", '{"a":3}')):
            assert d[key].GetValueAsString() == expected
def test_simple_snappy():
    """Snappy-compressed value store keeps values readable and reports its settings."""
    # old-style ctor: (memory limit in bytes, value-store parameters)
    compiler = pykeyvi.JsonDictionaryCompiler(
        50000000, {'compression': 'snappy', 'compression_threshold': '0'})
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    with tmp_dictionary(compiler, 'simple_json_snappy.kv') as d:
        assert len(d) == 2
        for key, expected in (("abc", '{"a":2}'), ("abd", '{"a":3}')):
            assert d[key].GetValueAsString() == expected
        value_store_stats = d.GetStatistics()['Value Store']
        assert value_store_stats['__compression'] == "snappy"
        assert value_store_stats['__compression_threshold'] == "0"
def test_simple():
    """Add/__setitem__ plus lookup of the compact JSON representation."""
    compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    # __setitem__ behaves like Add (same key again, so no new entry)
    compiler["abd"] = '{"a" : 3}'
    with tmp_dictionary(compiler, 'simple_json.kv') as d:
        assert len(d) == 2
        for key, expected in (("abc", '{"a":2}'), ("abd", '{"a":3}')):
            assert decode_to_unicode(d[key].GetValueAsString()) == \
                decode_to_unicode(expected)
def test_unicode_compile():
    """Keys given as utf-8 bytes and as unicode literals both compile and match."""
    c = pykeyvi.JsonDictionaryCompiler()
    c.Add("üöä", '{"y" : 2}')
    # BUG fix: str.decode('utf-8') does not exist on Python 3; a u'' literal
    # is equivalent on Python 2 (utf-8 source) and works on both versions.
    c.Add(u"üüüüüüabd", '{"a" : 3}')
    c.Add(u"ääääädäd", '{"b" : 33}')
    with tmp_dictionary(c, 'simple_json.kv') as d:
        assert len(d) == 3
        assert d["üöä"].GetValueAsString() == '{"y":2}'
        assert d[u"üöä"].GetValueAsString() == '{"y":2}'
        assert d["üüüüüüabd"].GetValueAsString() == '{"a":3}'
        assert d["ääääädäd"].GetValueAsString() == '{"b":33}'
def test_leak():
    """Repeated failed lookups must not grow process memory."""
    compiler = pykeyvi.JsonDictionaryCompiler()
    compiler.Add("something", '["a" : 2]')
    with tmp_dictionary(compiler, 'near_simple.kv') as d:
        gc.collect()
        baseline = memory_usage_ps()
        for iteration in range(0, 500000):
            assert not d.get('something_else')
            # collect periodically so transient garbage does not skew the reading
            if iteration % 100 == 0:
                gc.collect()
        # allow some head-room over the starting footprint
        assert memory_usage_ps() < baseline + 15000
def test_unicode():
    """Unicode keys work via __contains__, __getitem__ and get()."""
    c = pykeyvi.JsonDictionaryCompiler()
    c.Add("öäü", '{"a" : 2}')
    c.Add("abd", '{"a" : 3}')
    # use python syntax ala __setitem__
    c["abd"] = '{"a" : 3}'
    # BUG fix: str.decode('utf-8') does not exist on Python 3; a u'' literal
    # is equivalent on Python 2 (utf-8 source) and works on both versions.
    key = u"öäü"
    with tmp_dictionary(c, 'unicode_json.kv') as d:
        assert key in d
        assert d[key].GetValue() == {"a": 2}
        assert d.get(key).GetValue() == {"a": 2}
def test_zerobyte():
    """Keys containing NUL bytes are stored, retrieved and enumerated correctly."""
    compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    entries = [("\x00abc", '["a" : 2]'),
               ("abc\x00def", '["a" : 3]'),
               ("cd\x00", '["a" : 4]')]
    for key, value in entries:
        compiler.Add(key, value)
    with tmp_dictionary(compiler, 'zerobyte.kv') as d:
        for key, value in entries:
            assert decode_to_unicode(d[key].GetValue()) == decode_to_unicode(value)
        # every key must also show up in a full enumeration
        assert len([(k, v) for k, v in d.GetAllItems()]) == 3
def test_unicode():
    """Unicode keys work via __contains__, __getitem__ and get()."""
    compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("öäü", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    # __setitem__ behaves like Add
    compiler["abd"] = '{"a" : 3}'
    key = decode_to_unicode("öäü")
    expected = decode_to_unicode({"a": 2})
    with tmp_dictionary(compiler, 'unicode_json.kv') as d:
        assert key in d
        assert decode_to_unicode(d[key].GetValue()) == expected
        assert decode_to_unicode(d.get(key).GetValue()) == expected
def test_tmp_dir_defined():
    """Compile scratch files must land in the configured temporary_path."""
    def run_compile(tmpdir):
        # old-style ctor: (memory limit in bytes, parameters)
        compiler = pykeyvi.JsonDictionaryCompiler(1073741824,
                                                  {"temporary_path": tmpdir})
        compiler.Add("abc", "{'a':2}")
        compiler.Compile()
        assert len(os.listdir(test_dir)) != 0

    test_dir = os.path.join(tempfile.gettempdir(), "tmp_dir_test_defined")
    try:
        os.mkdir(test_dir)
        run_compile(test_dir)
    finally:
        # NOTE(review): presumably instantiating another compiler releases
        # scratch state before the directory is removed -- confirm
        pykeyvi.JsonDictionaryCompiler()
        shutil.rmtree(test_dir)
def test_tmp_dir():
    """No scratch files may appear in the working directory during compile."""
    cwd = os.getcwd()
    try:
        os.mkdir("tmp_dir_test")
        os.chdir(os.path.join(cwd, "tmp_dir_test"))
        compiler = pykeyvi.JsonDictionaryCompiler()
        compiler.Add("abc", "{'a':2}")
        # working directory must stay clean before, during and after compile
        assert os.listdir('.') == []
        compiler.Compile()
        assert os.listdir('.') == []
        del compiler
        assert os.listdir('.') == []
    finally:
        os.chdir(cwd)
        os.rmdir("tmp_dir_test")
def test_tmp_dir():
    """No scratch files may appear in the working directory during compile."""
    cwd = os.getcwd()
    os.chdir(tempfile.gettempdir())
    scratch_dir = os.path.join(tempfile.gettempdir(), "tmp_dir_test")
    try:
        os.mkdir("tmp_dir_test")
        os.chdir(scratch_dir)
        compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
        compiler.Add("abc", "{'a':2}")
        # working directory must stay clean before, during and after compile
        assert len(os.listdir('.')) == 0
        compiler.Compile()
        assert len(os.listdir('.')) == 0
        del compiler
        assert len(os.listdir('.')) == 0
    finally:
        os.chdir(cwd)
        os.rmdir(scratch_dir)
def test_unicode_lookup():
    """LookupText finds all dictionary keys inside a unicode text."""
    c = pykeyvi.JsonDictionaryCompiler()
    c.Add("Los Angeles", '{"country" : "USA"}')
    c.Add("Frankfurt am Main", '{"country" : "Germany"}')
    # BUG fix: str.decode('utf-8') does not exist on Python 3; a u'' literal
    # is equivalent on Python 2 (utf-8 source) and works on both versions.
    c.Add(u"Kirchheim bei München", '{"country" : "Germany"}')

    # unicode text to scan for dictionary keys
    text = u"From Los Angeles via Frankfurt am Main to Kirchheim bei München it should just work"

    with tmp_dictionary(c, 'unicode_json_lookup.kv') as d:
        assert "Kirchheim bei München" in d
        matched_strings = [x.GetMatchedString() for x in d.LookupText(text)]
        assert len(matched_strings) == 3
        assert "Kirchheim bei München" in matched_strings
        assert "Los Angeles" in matched_strings
        assert "Frankfurt am Main" in matched_strings
def test_unicode_compile():
    """Byte-string and unicode keys both compile and are retrievable."""
    compiler = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("üöä", '{"y" : 2}')
    compiler.Add(decode_to_unicode("üüüüüüabd"), '{"a" : 3}')
    compiler.Add(u"ääääädäd", '{"b" : 33}')
    with tmp_dictionary(compiler, 'simple_json.kv') as d:
        assert len(d) == 3
        checks = [("üöä", '{"y":2}'),
                  (u"üöä", '{"y":2}'),
                  ("üüüüüüabd", '{"a":3}'),
                  ("ääääädäd", '{"b":33}')]
        for key, expected in checks:
            assert decode_to_unicode(d[key].GetValueAsString()) == \
                decode_to_unicode(expected)
def compile_file(input, output, jobs, shards):
    """Shard a TSV input into JSON dictionary compilers and compile them on worker threads.

    input: a file or a directory of files (optionally gzipped) with
           tab-separated key/value lines.
    output: output filename; with shards > 1 each shard gets a "-<n>" suffix.
    jobs: number of compile worker threads to start.
    shards: number of output shards, chosen per key via JumpConsistentHashString.
    """
    skipped_keys = 0
    compilers = {shard: pykeyvi.JsonDictionaryCompiler()
                 for shard in range(shards)}

    if os.path.isdir(input):
        input_files = [os.path.join(input, name) for name in os.listdir(input)]
    else:
        input_files = [input]

    for input_file in input_files:
        opener = gzip.open if input_file.endswith(".gz") else open
        input_fd = opener(input_file)
        try:
            for line in input_fd:
                try:
                    parts = line.split("\t")
                    key = parts[0]
                    if key != remove_control_chars(key):
                        print("skip key: " + ":".join("{:02x}".format(ord(c)) for c in key) + " due to containing control characters")
                        skipped_keys += 1
                        # BUG fix: the key was reported as skipped but was
                        # still added to a compiler; actually skip it.
                        continue
                    value = parts[1]
                    shard = pykeyvi.JumpConsistentHashString(key, shards)
                    compilers[shard].Add(key, value)
                # BUG fix: bare `except:` also swallowed KeyboardInterrupt
                except Exception:
                    print("failed to add: " + line)
        finally:
            input_fd.close()
    print("Skipped keys " + str(skipped_keys))

    for _ in range(jobs):
        t = threading.Thread(target=compile_worker)
        t.daemon = True
        t.start()

    if shards == 1:
        # BUG fix: previously indexed compilers[i] with the leftover
        # thread-loop variable (jobs - 1) -- a KeyError whenever jobs > 1.
        compile_queue.put((compilers[0], output))
    else:
        for shard in range(shards):
            compile_queue.put((compilers[shard], output + "-" + str(shard)))
    compile_queue.join()