# Imports assumed by this section; in the original module they sit at the top
# of the file together with the keyvi bindings (JsonDictionaryCompiler,
# JsonDictionaryMerger, Dictionary, ...) and the shared test helpers and
# module-level paths (tmp_dictionary, memory_usage_ps, tmp_dir, root, ...).
import gc
import gzip
import json
import os
import shutil
import tempfile
import threading

import pytest


def test_unicode_compile():
    c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add("üöä", '{"y" : 2}')
    c.Add("üüüüüüabd", '{"a" : 3}')
    c.Add(u"ääääädäd", '{"b" : 33}')
    with tmp_dictionary(c, 'simple_json.kv') as d:
        assert len(d) == 3
        assert d["üöä"].GetValueAsString() == '{"y":2}'
        assert d[u"üöä"].GetValueAsString() == '{"y":2}'
        assert d["üüüüüüabd"].GetValueAsString() == '{"a":3}'
        assert d["ääääädäd"].GetValueAsString() == '{"b":33}'

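# The tests in this section rely on a tmp_dictionary() helper provided by the
# surrounding test module. A minimal sketch of what they assume it does
# (compile, write to a temp file, yield the loaded Dictionary, clean up);
# the real helper may differ:
import contextlib


@contextlib.contextmanager
def tmp_dictionary(compiler, file_name):
    fq_file_name = os.path.join(tempfile.gettempdir(), file_name)
    compiler.Compile()
    compiler.WriteToFile(fq_file_name)
    d = Dictionary(fq_file_name)
    try:
        yield d
    finally:
        del d  # release the file before deleting it
        os.remove(fq_file_name)
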
def test_unicode():
    c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add("öäü", '{"a" : 2}')
    c.Add("abd", '{"a" : 3}')
    # add via the __setitem__ syntax as well
    c["abd"] = '{"a" : 3}'
    # unicode string for the lookups below
    key = "öäü"
    with tmp_dictionary(c, 'unicode_json.kv') as d:
        assert key in d
        assert d[key].GetValue() == {"a": 2}
        assert d.get(key).GetValue() == {"a": 2}

def test_leak():
    c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add("something", '{"a" : 2}')
    with tmp_dictionary(c, 'near_simple.kv') as d:
        gc.collect()
        memory_usage_on_start = memory_usage_ps()
        for i in range(0, 500000):
            assert not d.get('something_else')
            if i % 100 == 0:
                gc.collect()
                memory_usage_now = memory_usage_ps()
                # repeated failed lookups must not grow memory beyond a
                # small tolerance over the starting RSS
                assert memory_usage_now < memory_usage_on_start + 15000

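# memory_usage_ps() comes from the test module; a sketch of the behaviour the
# leak test assumes (resident set size of this process in kilobytes, read via
# `ps`). The real helper may be implemented differently:
import subprocess


def memory_usage_ps():
    # `-o rss=` prints the RSS value without a header line
    out = subprocess.check_output(
        ['ps', '-p', str(os.getpid()), '-o', 'rss=']).decode().split()
    return float(out[0])
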
def test_simple_snappy():
    c = JsonDictionaryCompiler({
        "memory_limit_mb": "10",
        'compression': 'snappy',
        'compression_threshold': '0'
    })
    c.Add("abc", '{"a" : 2}')
    c.Add("abd", '{"a" : 3}')
    with tmp_dictionary(c, 'simple_json_snappy.kv') as d:
        assert len(d) == 2
        assert d["abc"].GetValueAsString() == '{"a":2}'
        assert d["abd"].GetValueAsString() == '{"a":3}'
        m = d.GetStatistics()['Value Store']
        assert m['__compression'] == "snappy"

def test_unicode_lookup():
    c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add("Los Angeles", '{"country" : "USA"}')
    c.Add("Frankfurt am Main", '{"country" : "Germany"}')
    c.Add("Kirchheim bei München", '{"country" : "Germany"}')
    # unicode string for the full-text lookup
    text = "From Los Angeles via Frankfurt am Main to Kirchheim bei München it should just work"
    with tmp_dictionary(c, 'unicode_json_lookup.kv') as d:
        assert "Kirchheim bei München" in d
        matched_strings = [x.GetMatchedString() for x in d.LookupText(text)]
        assert len(matched_strings) == 3
        assert u"Kirchheim bei München" in matched_strings
        assert u"Los Angeles" in matched_strings
        assert u"Frankfurt am Main" in matched_strings

def test_tmp_dir():
    cwd = os.getcwd()
    os.chdir(tempfile.gettempdir())
    try:
        os.mkdir("tmp_dir_test")
        os.chdir(os.path.join(tempfile.gettempdir(), "tmp_dir_test"))
        c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c.Add("abc", '{"a": 2}')
        # intermediate files must never leak into the working directory
        assert os.listdir('.') == []
        c.Compile()
        assert os.listdir('.') == []
        del c
        assert os.listdir('.') == []
    finally:
        os.chdir(cwd)
        os.rmdir(os.path.join(tempfile.gettempdir(), "tmp_dir_test"))

def test_float_compaction():
    cs = JsonDictionaryCompiler({
        "memory_limit_mb": "10",
        'floating_point_precision': 'single'
    })
    cd = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    # add the same list of floats to both compilers
    floats = ('[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, '
              '1.6369107966501981, 1.7736887965234107, 1.606682751740542, '
              '1.6186427703265525, 1.7939763843449683, 1.5973550162469434, '
              '1.6799721708726192, 1.8199786239525833, 1.7956178070065245, '
              '1.7269879953863045]')
    cs.Add('aa', floats)
    cd.Add('aa', floats)
    with tmp_dictionary(cs, 'json_single_precision_float.kv') as ds:
        with tmp_dictionary(cd, 'json_double_precision_float.kv') as dd:
            # basic sanity checks
            assert len(ds) == 1
            assert len(dd) == 1
            # the value store built with single-precision floats must be
            # smaller than the double-precision one
            stats_s = ds.GetStatistics()
            stats_d = dd.GetStatistics()
            assert int(stats_s['Value Store']['size']) < int(
                stats_d['Value Store']['size'])

def test_truncated_file_json():
    c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add('a', '{"1": 2}')
    c.Add('b', '{"2": 4}')
    c.Add('c', '{"4": 4}')
    c.Add('d', '{"2": 3}')
    c.Compile()
    file_name = os.path.join(tmp_dir, 'truncation_test.kv')
    c.WriteToFile(file_name)
    size = os.path.getsize(file_name)
    fd_in = open(file_name, 'rb')
    # first copy: only the first half of the file
    fd = open(os.path.join(tmp_dir, 'truncation_test1.kv'), 'wb')
    fd.write(fd_in.read(int(size / 2)))
    fd.close()
    # second copy: the whole file except the last 2 bytes; rewind first,
    # otherwise we would copy the tail instead of the beginning
    fd_in.seek(0)
    fd2 = open(os.path.join(tmp_dir, 'truncation_test2.kv'), 'wb')
    fd2.write(fd_in.read(size - 2))
    fd2.close()
    fd_in.close()
    with pytest.raises(ValueError):
        Dictionary(os.path.join(tmp_dir, 'truncation_test1.kv'))
    with pytest.raises(ValueError):
        Dictionary(os.path.join(tmp_dir, 'truncation_test2.kv'))
    os.remove(os.path.join(tmp_dir, 'truncation_test2.kv'))
    os.remove(os.path.join(tmp_dir, 'truncation_test1.kv'))
    os.remove(file_name)

def test_input_output_keys():
    compiler = JsonDictionaryCompiler({
        'compression_threshold': '32',
        'compression': 'zlib',
        "memory_limit_mb": "10"
    })
    input_keys_count = 0
    with open(os.path.join(
            root, 'var_length_short_calculation_test_data.tsv')) as f_in:
        for line in f_in:
            k, v = line.split('\t')
            key = json.loads(k)
            value = json.loads(v)
            compiler.Add(key, value)
            input_keys_count += 1
    output_keys_count = 0
    with tmp_dictionary(compiler, 'var_length_short_test.kv') as d:
        for _ in d.GetAllItems():
            output_keys_count += 1
    assert input_keys_count == output_keys_count

def compile_file(input, output, jobs, shards):
    skipped_keys = 0
    compilers = {}
    for i in range(0, shards):
        compilers[i] = JsonDictionaryCompiler()

    if os.path.isdir(input):
        input_files = [os.path.join(input, d) for d in os.listdir(input)]
    else:
        input_files = [input]

    for input_file in input_files:
        if input_file.endswith(".gz"):
            # open in text mode so lines are str, matching the plain case
            input_fd = gzip.open(input_file, "rt")
        else:
            input_fd = open(input_file)
        for line in input_fd:
            try:
                parts = line.split("\t")
                key = parts[0]
                if key != remove_control_chars(key):
                    print("skip key: " +
                          ":".join("{:02x}".format(ord(c)) for c in key) +
                          " due to containing control characters")
                    skipped_keys += 1
                    continue  # actually skip the key instead of adding it
                value = parts[1]
                shard = JumpConsistentHashString(key, shards)
                compilers[shard].Add(key, value)
            except Exception:
                print("failed to add: " + line)
        input_fd.close()
    print("Skipped keys " + str(skipped_keys))

    for i in range(jobs):
        t = threading.Thread(target=compile_worker)
        t.daemon = True
        t.start()

    if shards == 1:
        compile_queue.put((compilers[0], output))
    else:
        for i in range(0, shards):
            compile_queue.put((compilers[i], output + "-" + str(i)))
    compile_queue.join()

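# compile_queue, compile_worker() and remove_control_chars() are defined
# elsewhere in the original script. Minimal sketches of what compile_file()
# assumes: a shared queue of (compiler, output_file) tuples drained by worker
# threads, and a filter that strips Unicode control characters. The real
# implementations may differ.
import queue
import unicodedata

compile_queue = queue.Queue()


def compile_worker():
    while True:
        compiler, output_file = compile_queue.get()
        compiler.Compile()
        compiler.WriteToFile(output_file)
        compile_queue.task_done()


def remove_control_chars(s):
    # drop characters in the Unicode "C" (control/format/etc.) categories
    return "".join(c for c in s if unicodedata.category(c)[0] != "C")
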
def test_tmp_dir_defined():
    def run_compile(tmpdir):
        c = JsonDictionaryCompiler({
            "memory_limit_mb": "10",
            "temporary_path": tmpdir
        })
        c.Add("abc", '{"a": 2}')
        c.Compile()
        # the configured temporary path must actually be used
        assert os.listdir(tmpdir) != []

    test_dir = os.path.join(tempfile.gettempdir(), "tmp_dir_test_defined")
    try:
        os.mkdir(test_dir)
        run_compile(test_dir)
    finally:
        # collect garbage so the compiler releases its temporary files
        # before the directory is removed
        gc.collect()
        JsonDictionaryCompiler({"memory_limit_mb": "10"})
        shutil.rmtree(test_dir)

def test_compiler_empty_json():
    c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    with test_tools.tmp_dictionary(c, 'empty_json.kv') as d:
        assert len(d) == 0

def compile(args):
    params = {key: value for key, value in args.compiler_params}

    dict_type = args.dict_type
    if dict_type == 'json':
        dictionary = JsonDictionaryCompiler(params)
    elif dict_type == 'string':
        dictionary = StringDictionaryCompiler(params)
    elif dict_type == 'int':
        dictionary = IntDictionaryCompiler(params)
    elif dict_type == 'completion':
        dictionary = CompletionDictionaryCompiler(params)
    elif dict_type == 'key-only':
        dictionary = KeyOnlyDictionaryCompiler(params)
    else:
        raise ValueError('Unknown dictionary type: {}'.format(dict_type))

    with open(args.input_file) as file_in:
        for line in file_in:
            line = line.rstrip('\n')
            try:
                splits = line.split('\t')
                if dict_type == 'key-only':
                    dictionary.Add(splits[0])
                elif dict_type in ('int', 'completion'):
                    dictionary.Add(splits[0], int(splits[1]))
                else:
                    dictionary.Add(splits[0], splits[1])
            except Exception:
                print('Cannot parse line: {}'.format(line))

    dictionary.Compile()
    dictionary.WriteToFile(args.output_file)

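# compile() expects an argparse-style namespace with compiler_params,
# dict_type, input_file and output_file. A hypothetical wiring that would
# produce it; the actual CLI defines its options elsewhere, and the flag
# names here are assumptions:
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='compile a keyvi dictionary')
    parser.add_argument('input_file')
    parser.add_argument('output_file')
    parser.add_argument('--dict-type', dest='dict_type', default='json',
                        choices=['json', 'string', 'int',
                                 'completion', 'key-only'])
    parser.add_argument('--compiler-param', dest='compiler_params',
                        nargs=2, action='append', default=[],
                        metavar=('KEY', 'VALUE'))
    return parser.parse_args()
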
def test_manifest_for_merger():
    try:
        c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c.Add("abc", '{"a" : 2}')
        c.Compile()
        c.SetManifest('{"author": "Zapp Brannigan"}')
        c.WriteToFile('manifest_json_merge1.kv')
        del c

        c2 = JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c2.Add("abd", '{"a" : 3}')
        c2.Compile()
        c2.SetManifest('{"author": "Leela"}')
        c2.WriteToFile('manifest_json_merge2.kv')
        del c2

        merger = JsonDictionaryMerger({"memory_limit_mb": "10"})
        # feed both dictionaries to the merger
        merger.Add('manifest_json_merge1.kv')
        merger.Add('manifest_json_merge2.kv')
        merger.SetManifest('{"author": "Fry"}')
        merger.Merge('manifest_json_merged.kv')

        d = Dictionary('manifest_json_merged.kv')
        m = json.loads(d.GetManifest())
        # the merger's manifest wins over those of the inputs
        assert m['author'] == "Fry"
        del d
    finally:
        os.remove('manifest_json_merge1.kv')
        os.remove('manifest_json_merge2.kv')
        os.remove('manifest_json_merged.kv')