Example #1
0
def test_manifest_for_merger():
    """Check that a manifest set on a merger ends up in the merged file.

    Two single-key JSON dictionaries are compiled and written with their own
    manifests, then a merger carrying a third manifest writes the merged
    dictionary. The manifest read back from the merged file must be the one
    set on the merger.
    """
    created_files = (
        'manifest_json_merge1.kv',
        'manifest_json_merge2.kv',
        'manifest_json_merged.kv',
    )
    try:
        c = keyvi.JsonDictionaryCompiler({"memory_limit_mb":"10"})
        c.Add("abc", '{"a" : 2}')
        c.Compile()
        c.SetManifest({"author": "Zapp Brannigan"})
        c.WriteToFile('manifest_json_merge1.kv')
        del c

        c2 = keyvi.JsonDictionaryCompiler({"memory_limit_mb":"10"})
        c2.Add("abd", '{"a" : 3}')
        c2.Compile()
        c2.SetManifest({"author": "Leela"})
        c2.WriteToFile('manifest_json_merge2.kv')
        del c2

        # NOTE(review): neither input file is handed to the merger before
        # Merge() is called -- confirm whether that is intended.
        merger = keyvi.JsonDictionaryMerger({"memory_limit_mb":"10"})
        merger.SetManifest({"author": "Fry"})
        merger.Merge('manifest_json_merged.kv')

        d = keyvi.Dictionary('manifest_json_merged.kv')
        m = d.GetManifest()
        assert m['author'] == "Fry"
        del d

    finally:
        # Only remove what was actually created: when an earlier step fails,
        # some files do not exist and an unconditional os.remove() would
        # replace the real error with a "file not found" one.
        for path in created_files:
            if os.path.exists(path):
                os.remove(path)
Example #2
0
def test_float_compaction():
    """Compare value store sizes with and without single float precision.

    The same list of floats is stored under one key by a compiler created
    with ``floating_point_precision`` set to ``single`` and by a compiler
    created without that option; the former must report the smaller
    value store size.
    """
    float_list = '[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]'

    single_options = {
        "memory_limit_mb": "10",
        'floating_point_precision': 'single'
    }
    compiler_single = keyvi.JsonDictionaryCompiler(single_options)
    compiler_plain = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})

    # identical content goes into both compilers
    for compiler in (compiler_single, compiler_plain):
        compiler.Add('aa', float_list)

    with tmp_dictionary(compiler_single, 'json_single_precision_float.kv') as ds, \
            tmp_dictionary(compiler_plain, 'json_double_precision_float.kv') as dd:
        # sanity check: exactly one entry in each dictionary
        assert len(ds) == 1
        assert len(dd) == 1

        # the value store written with single precision has to be smaller
        size_single = int(ds.GetStatistics()['Value Store']['size'])
        size_plain = int(dd.GetStatistics()['Value Store']['size'])
        assert size_single < size_plain
Example #3
0
def compile(args):
    """Compile a tab-separated input file into a keyvi dictionary file.

    Each input line is split on tabs; the first column is the key and, for
    every type except ``key-only``, the second column is the value (converted
    with ``int`` for the ``int`` and ``completion`` types). Lines that cannot
    be added are reported on stdout and skipped.

    Args:
        args: object providing ``compiler_params`` (iterable of key/value
            pairs handed to the compiler), ``dict_type`` (``json``,
            ``string``, ``int``, ``completion`` or ``key-only``),
            ``input_file`` and ``output_file``.

    Returns:
        ``None`` once the dictionary has been written, or the string
        ``'Must never reach here'`` when ``dict_type`` is not recognised.
    """
    # dict_type -> name of the keyvi compiler class; resolved lazily so an
    # unknown type is rejected before any compiler is looked up.
    compiler_names = {
        'json': 'JsonDictionaryCompiler',
        'string': 'StringDictionaryCompiler',
        'int': 'IntDictionaryCompiler',
        'completion': 'CompletionDictionaryCompiler',
        'key-only': 'KeyOnlyDictionaryCompiler',
    }

    params = dict(args.compiler_params)

    dict_type = args.dict_type
    compiler_name = compiler_names.get(dict_type)
    if compiler_name is None:
        return 'Must never reach here'
    dictionary = getattr(keyvi, compiler_name)(params)

    takes_int_value = dict_type in ('int', 'completion')

    with open(args.input_file) as file_in:
        for line in file_in:
            line = line.rstrip('\n')
            try:
                splits = line.split('\t')
                if dict_type == 'key-only':
                    dictionary.Add(splits[0])
                elif takes_int_value:
                    dictionary.Add(splits[0], int(splits[1]))
                else:
                    dictionary.Add(splits[0], splits[1])
            except Exception:
                # Best effort: report the bad line and carry on. Catching
                # Exception (not a bare except) keeps Ctrl-C and SystemExit
                # working while the file is processed.
                print('Can not parse line: {}'.format(line))

    dictionary.Compile()
    dictionary.WriteToFile(args.output_file)
Example #4
0
def test_truncated_file_json():
    """Loading an incomplete dictionary file must raise ValueError.

    A four-key JSON dictionary is written to ``tmp_dir`` and two partial
    copies are derived from it; opening either copy as a dictionary has to
    fail with ValueError. All files are removed afterwards, also when the
    test fails.
    """
    c = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add('a', '{1:2}')
    c.Add('b', '{2:4}')
    c.Add('c', '{4:4}')
    c.Add('d', '{2:3}')
    c.Compile()

    full_path = os.path.join(tmp_dir, 'truncation_test.kv')
    partial1_path = os.path.join(tmp_dir, 'truncation_test1.kv')
    partial2_path = os.path.join(tmp_dir, 'truncation_test2.kv')

    try:
        c.WriteToFile(full_path)
        size = os.path.getsize(full_path)

        # Context managers make sure every handle, including the source
        # file, is closed before the dictionaries are opened.
        with open(full_path, 'rb') as fd_in:
            with open(partial1_path, 'wb') as fd:
                fd.write(fd_in.read(int(size / 2)))

            # NOTE(review): this read continues from the current offset of
            # the same handle, so the second file receives data from the
            # middle of the source onwards rather than its first size - 2
            # bytes -- confirm that this is the intended input.
            with open(partial2_path, 'wb') as fd2:
                fd2.write(fd_in.read(int(size - 2)))

        with pytest.raises(ValueError):
            keyvi.Dictionary(partial1_path)
        with pytest.raises(ValueError):
            keyvi.Dictionary(partial2_path)
    finally:
        for path in (partial2_path, partial1_path, full_path):
            if os.path.exists(path):
                os.remove(path)
Example #5
0
def test_truncated_file_json():
    """Loading a dictionary file cut in half must raise ValueError.

    A four-key JSON dictionary is written to ``tmp_dir``, its first half is
    copied to a second file, and opening that copy as a dictionary has to
    fail with ValueError. Both files are removed afterwards, also when the
    test fails.
    """
    c=keyvi.JsonDictionaryCompiler({"memory_limit_mb":"10"})
    c.Add('a', '{1:2}')
    c.Add('b', '{2:4}')
    c.Add('c', '{4:4}')
    c.Add('d', '{2:3}')
    c.Compile()

    full_path = os.path.join(tmp_dir, 'truncation_test.kv')
    truncated_path = os.path.join(tmp_dir, 'truncation_test1.kv')

    try:
        c.WriteToFile(full_path)
        size = os.path.getsize(full_path)

        # Both handles are closed by the context managers; the source handle
        # in particular must not stay open after the copy.
        with open(full_path, 'rb') as fd_in:
            with open(truncated_path, 'wb') as fd:
                fd.write(fd_in.read(int(size/2)))

        exception_caught = False
        try:
            keyvi.Dictionary(truncated_path)
        except ValueError:
            exception_caught = True

        assert exception_caught
    finally:
        for path in (truncated_path, full_path):
            if os.path.exists(path):
                os.remove(path)
Example #6
0
def test_near_greedy():
    """Check GetNear result counts with the third argument set to True.

    Also verifies that the matched strings returned with True start with
    exactly the matched strings returned with False for the same query.
    """
    entries = (
        ("zahnarzt:u0we9yykdyum", '["a" : 2]'),
        ("zahnarzt:u1h2fde2kct3", '["a" : 3]'),
        ("zahnarzt:u1huf1q5cnxn", '["a" : 4]'),
        ("zahnarzt:u0y2dvey61sw", '["a" : 5]'),
        ("zahnarzt:u1hvqmmj801r", '["a" : 6]'),
        ("zahnarzt:u0vvmknrwgmj", '["a" : 7]'),
        ("zahnarzt:u0ypv22fb9q3", '["a" : 8]'),
        ("zahnarzt:u1qcvvw0hxe1", '["a" : 9]'),
        ("zahnarzt:u1xjx6yfvfz2", '["a" : 10]'),
        ("zahnarzt:u1q0gkqsenhf", '["a" : 11]'),
    )
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    for key, value in entries:
        compiler.Add(key, value)

    # (query, second GetNear argument, expected number of matches)
    expectations = (
        ("zahnarzt:u1q0gkqsenhf", 12, 2),
        ("zahnarzt:u1h0gkqsenhf", 12, 3),
        ("zahnarzt:u1h0gkqsenhf", 13, 0),
        ("zahnarzt:u0h0gkqsenhf", 10, 10),
    )

    with tmp_dictionary(compiler, 'near_greedy.kv') as d:
        for query, n, expected_count in expectations:
            assert len(list(d.GetNear(query, n, True))) == expected_count

        query = "zahnarzt:u0h0gkqsenhf"
        greedy = [match.GetMatchedString() for match in d.GetNear(query, 10, True)]
        non_greedy = [match.GetMatchedString() for match in d.GetNear(query, 10, False)]
        assert greedy[:len(non_greedy)] == non_greedy
Example #7
0
def generate_dictionary_compiler():
    """Return a JSON dictionary compiler filled from ``key_values``.

    ``key_values`` is defined outside this function and is iterated as
    (key, value) pairs; each value is serialized with ``json.dumps`` before
    it is added. The compiler is returned without Compile() being called.
    """
    options = {"memory_limit_mb": "10"}
    compiler = keyvi.JsonDictionaryCompiler(options)

    for k, v in key_values:
        serialized = json.dumps(v)
        compiler.Add(k, serialized)

    return compiler
Example #8
0
def generate_keyvi(key_values, filename):
    """Compile a mapping into a JSON dictionary and write it to ``filename``.

    Args:
        key_values: mapping whose ``items()`` yield (key, value) pairs; each
            value is serialized with ``json.dumps``.
        filename: path handed to the compiler's WriteToFile().
    """
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb":"10"})

    for k, v in key_values.items():
        serialized = json.dumps(v)
        compiler.Add(k, serialized)

    compiler.Compile()
    compiler.WriteToFile(filename)
Example #9
0
 def run_compile(tmpdir):
     """Compile one entry with ``temporary_path`` set to ``tmpdir``.

     After Compile() the directory named by ``test_dir`` (a name taken from
     the enclosing scope) must not be empty.
     """
     options = {"memory_limit_mb": "10", "temporary_path": tmpdir}
     compiler = keyvi.JsonDictionaryCompiler(options)
     compiler.Add("abc", "{'a':2}")
     compiler.Compile()
     directory_content = os.listdir(test_dir)
     assert directory_content != []
Example #10
0
def test_near_less_precission():
    """Check GetNear counts when the query is longer than the stored keys."""
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    entries = (
        ("zahnarzt:u0we9", '["a" : 2]'),
        ("zahnarzt:u1h2f", '["a" : 3]'),
        ("zahnarzt:u1huf", '["a" : 4]'),
    )
    for key, value in entries:
        compiler.Add(key, value)

    query = "zahnarzt:u1h0gkqsenhf"
    with tmp_dictionary(compiler, 'near_less_precission.kv') as d:
        # (second GetNear argument, expected number of matches)
        for n, expected_count in ((12, 2), (13, 0)):
            assert len(list(d.GetNear(query, n))) == expected_count
Example #11
0
def test_get_value():
    """GetValue() on a looked-up match yields the stored JSON as an object."""
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')

    expected_values = (("abc", {"a": 2}), ("abd", {"a": 3}))
    with tmp_dictionary(compiler, 'match_object_json.kv') as d:
        for key, expected in expected_values:
            match = d[key]
            actual = decode_to_unicode(match.GetValue())
            assert actual == decode_to_unicode(expected)
Example #12
0
def test_near_broken_input():
    """Check GetNear counts for queries shorter than the stored keys."""
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    entries = (
        ("zahnarzt:u0we9", '["a" : 2]'),
        ("zahnarzt:u1h2f", '["a" : 3]'),
        ("zahnarzt:u1huf", '["a" : 4]'),
    )
    for key, value in entries:
        compiler.Add(key, value)

    # (query, second GetNear argument, expected number of matches)
    expectations = (
        ("zahnarzt:u1h", 12, 2),
        ("zahnarzt:u", 13, 0),
        ("zahnarzt:u1", 12, 0),
    )
    with tmp_dictionary(compiler, 'near_broken.kv') as d:
        for query, n, expected_count in expectations:
            assert len(list(d.GetNear(query, n))) == expected_count
Example #13
0
def test_simple():
    """Two keys added via Add() and item assignment are stored and readable."""
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb":"10"})
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    # the same key once more, this time through item assignment (__setitem__)
    compiler["abd"] = '{"a" : 3}'

    expected_strings = (("abc", '{"a":2}'), ("abd", '{"a":3}'))
    with tmp_dictionary(compiler, 'simple_json.kv') as d:
        assert len(d) == 2
        for key, expected in expected_strings:
            assert d[key].GetValueAsString() == expected
Example #14
0
def test_zerobyte():
    """Keys with a zero byte at the start, middle or end can be stored and read."""
    entries = (
        ("\x00abc", '["a" : 2]'),
        ("abc\x00def", '["a" : 3]'),
        ("cd\x00", '["a" : 4]'),
    )
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb":"10"})
    for key, value in entries:
        compiler.Add(key, value)

    with tmp_dictionary(compiler, 'zerobyte.kv') as d:
        for key, value in entries:
            assert decode_to_unicode(d[key].GetValue()) == decode_to_unicode(value)
        # every item unpacks into a (key, value) pair and there are three of them
        item_count = sum(1 for _key, _value in d.GetAllItems())
        assert item_count == 3
Example #15
0
def test_raw_serialization():
    """A match keeps its value across a dumps() / Match.loads() round trip."""
    c = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add("abc", '{"a" : 2}')
    c.Add("abd", '{"a" : 3}')
    with tmp_dictionary(c, 'match_object_json.kv') as d:
        m = d["abc"]
        assert m.GetValueAsString() == '{"a":2}'
        # Use a dedicated name for the serialized form so the dictionary
        # bound by the `with` statement is not rebound inside its own block.
        serialized = m.dumps()
        m2 = keyvi.Match.loads(serialized)
        assert m2.GetValueAsString() == '{"a":2}'
Example #16
0
def test_simple_snappy():
    """Values stay readable with snappy compression and the statistics report it."""
    options = {
        "memory_limit_mb":"10",
        'compression': 'snappy',
        'compression_threshold': '0',
    }
    compiler = keyvi.JsonDictionaryCompiler(options)
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')

    expected_strings = (("abc", '{"a":2}'), ("abd", '{"a":3}'))
    with tmp_dictionary(compiler, 'simple_json_snappy.kv') as d:
        assert len(d) == 2
        for key, expected in expected_strings:
            assert d[key].GetValueAsString() == expected

        # the value store statistics echo the compression settings
        value_store = d.GetStatistics()['Value Store']
        assert value_store['__compression'] == "snappy"
        assert value_store['__compression_threshold'] == "0"
Example #17
0
def test_unicode_compile():
    """Non-ASCII keys, with and without the ``u`` prefix, can be added and looked up."""
    entries = (
        ("üöä", '{"y" : 2}'),
        ("üüüüüüabd", '{"a" : 3}'),
        (u"ääääädäd", '{"b" : 33}'),
    )
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb":"10"})
    for key, value in entries:
        compiler.Add(key, value)

    lookups = (
        ("üöä", '{"y":2}'),
        (u"üöä", '{"y":2}'),
        ("üüüüüüabd", '{"a":3}'),
        ("ääääädäd", '{"b":33}'),
    )
    with tmp_dictionary(compiler, 'simple_json.kv') as d:
        assert len(d) == 3
        for key, expected in lookups:
            assert d[key].GetValueAsString() == expected
Example #18
0
def test_leak():
    """Repeated lookups of a missing key must not increase memory usage.

    Every 100 iterations the value of ``memory_usage_ps()`` is compared with
    the value taken before the loop; it has to stay below that start value
    plus 15000.
    """
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb":"10"})
    compiler.Add("something", '["a" : 2]')

    with tmp_dictionary(compiler, 'near_simple.kv') as d:
        gc.collect()
        baseline = memory_usage_ps()
        for iteration in range(0, 500000):
            assert not d.get('something_else')
            if iteration % 100 != 0:
                continue
            # sample memory only on every 100th lookup
            gc.collect()
            current = memory_usage_ps()
            assert current < baseline + 15000
Example #19
0
def test_unicode():
    """A non-ASCII key supports ``in``, item access and ``get``."""
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("öäü", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    # the same key once more, this time through item assignment (__setitem__)
    compiler["abd"] = '{"a" : 3}'

    # key used for all lookups below
    lookup_key = "öäü"
    expected_value = {"a": 2}
    with tmp_dictionary(compiler, 'unicode_json.kv') as d:
        assert lookup_key in d
        assert d[lookup_key].GetValue() == expected_value
        assert d.get(lookup_key).GetValue() == expected_value
Example #20
0
def compile_file(input, output, jobs, shards):
    """Distribute tab-separated key/value lines over shards and compile them.

    Every line of every input file is split on tabs into key (first column)
    and value (second column). The key selects a shard through
    ``keyvi.JumpConsistentHashString`` and the pair is added to that shard's
    JSON dictionary compiler. Afterwards ``jobs`` daemon threads running
    ``compile_worker`` are started and one (compiler, output name) tuple per
    shard is put on ``compile_queue``; the function returns once the queue
    has been joined.

    Args:
        input: path of a file, or of a directory whose entries are all used
            as input files; names ending in ``.gz`` are opened with gzip.
        output: output name; with more than one shard ``-<shard>`` is
            appended per shard.
        jobs: number of worker threads to start.
        shards: number of compilers / output files.
    """
    skipped_keys = 0

    compilers = {}
    for i in range(0, shards):
        compilers[i] = keyvi.JsonDictionaryCompiler()

    if os.path.isdir(input):
        input_files = [os.path.join(input, d) for d in os.listdir(input)]
    else:
        input_files = [input]

    for input_file in input_files:
        if input_file.endswith(".gz"):
            input_fd = gzip.open(input_file)
        else:
            input_fd = open(input_file)

        # try/finally guarantees the input handle is released even when
        # reading fails part-way through.
        try:
            for line in input_fd:
                try:
                    parts = line.split("\t")
                    key = parts[0]

                    if key != remove_control_chars(key):
                        print("skip key: " + ":".join(
                            "{:02x}".format(ord(c))
                            for c in key) + " due to containing control characters")
                        skipped_keys += 1
                        # Actually skip the key: it is reported and counted
                        # as skipped, so it must not be added below.
                        continue

                    value = parts[1]

                    shard = keyvi.JumpConsistentHashString(key, shards)
                    compilers[shard].Add(key, value)
                except Exception:
                    # Best effort per line; Exception (not a bare except)
                    # lets KeyboardInterrupt and SystemExit through.
                    print("failed to add: " + line)
        finally:
            input_fd.close()
        print("Skipped keys " + str(skipped_keys))

    for _ in range(jobs):
        t = threading.Thread(target=compile_worker)
        t.daemon = True
        t.start()

    if shards == 1:
        # The only compiler lives at index 0; a leftover loop index must not
        # be used here because it depends on the number of jobs.
        compile_queue.put((compilers[0], output))
    else:
        for i in range(0, shards):
            compile_queue.put((compilers[i], output + "-" + str(i)))

    compile_queue.join()
Example #21
0
def test_unicode_lookup():
    """LookupText finds all three stored names, including the non-ASCII one."""
    entries = (
        ("Los Angeles", '{"country" : "USA"}'),
        ("Frankfurt am Main", '{"country" : "Germany"}'),
        ("Kirchheim bei München", '{"country" : "Germany"}'),
    )
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    for key, value in entries:
        compiler.Add(key, value)

    # sentence that contains every stored key
    text = "From Los Angeles via Frankfurt am Main to Kirchheim bei München it should just work"
    expected_matches = (
        u"Kirchheim bei München",
        u"Los Angeles",
        u"Frankfurt am Main",
    )
    with tmp_dictionary(compiler, 'unicode_json_lookup.kv') as d:
        assert "Kirchheim bei München" in d
        matched_strings = [match.GetMatchedString() for match in d.LookupText(text)]
        assert len(matched_strings) == 3
        for expected in expected_matches:
            assert expected in matched_strings
Example #22
0
def test_tmp_dir():
    """Compiling without ``temporary_path`` leaves the working directory empty.

    The test runs inside a fresh ``tmp_dir_test`` directory below the system
    temp directory and checks that it is empty after Add(), after Compile()
    and after the compiler has been deleted.
    """
    def assert_cwd_is_empty():
        assert os.listdir('.') == []

    previous_cwd = os.getcwd()
    os.chdir(tempfile.gettempdir())
    try:
        os.mkdir("tmp_dir_test")
        os.chdir(os.path.join(tempfile.gettempdir(), "tmp_dir_test"))

        compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
        compiler.Add("abc", "{'a':2}")
        assert_cwd_is_empty()

        compiler.Compile()
        assert_cwd_is_empty()

        del compiler
        assert_cwd_is_empty()
    finally:
        # go back first, the directory cannot be removed while it is in use
        os.chdir(previous_cwd)
        os.rmdir(os.path.join(tempfile.gettempdir(), "tmp_dir_test"))
Example #23
0
def test_near():
    """Check GetNear result counts for several queries with two arguments."""
    entries = (
        ("zahnarzt:u0we9yykdyum", '["a" : 2]'),
        ("zahnarzt:u1h2fde2kct3", '["a" : 3]'),
        ("zahnarzt:u1huf1q5cnxn", '["a" : 4]'),
        ("zahnarzt:u0y2dvey61sw", '["a" : 5]'),
        ("zahnarzt:u1hvqmmj801r", '["a" : 6]'),
        ("zahnarzt:u0vvmknrwgmj", '["a" : 7]'),
        ("zahnarzt:u0ypv22fb9q3", '["a" : 8]'),
        ("zahnarzt:u1qcvvw0hxe1", '["a" : 9]'),
        ("zahnarzt:u1xjx6yfvfz2", '["a" : 10]'),
        ("zahnarzt:u1q0gkqsenhf", '["a" : 11]'),
    )
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    for key, value in entries:
        compiler.Add(key, value)

    # (query, second GetNear argument, expected number of matches)
    expectations = (
        ("zahnarzt:u1q0gkqsenhf", 12, 1),
        ("zahnarzt:u1h0gkqsenhf", 12, 3),
        ("zahnarzt:u1h0gkqsenhf", 13, 0),
        ("zahnarzt:u0h0gkqsenhf", 10, 4),
    )
    with tmp_dictionary(compiler, 'near_simple.kv') as d:
        for query, n, expected_count in expectations:
            assert len(list(d.GetNear(query, n))) == expected_count
Example #24
0
def test_tmp_dir_defined():
    """With ``temporary_path`` set, the given directory is not empty after Compile().

    The directory ``tmp_dir_test_defined`` below the system temp directory is
    created for the test and removed with its content afterwards.
    """
    test_dir = os.path.join(tempfile.gettempdir(), "tmp_dir_test_defined")

    def run_compile(tmpdir):
        # compile a single entry with the temporary path pointing at tmpdir
        options = {"memory_limit_mb": "10", "temporary_path": tmpdir}
        compiler = keyvi.JsonDictionaryCompiler(options)
        compiler.Add("abc", "{'a':2}")
        compiler.Compile()
        directory_content = os.listdir(test_dir)
        assert directory_content != []

    try:
        os.mkdir(test_dir)
        run_compile(test_dir)
    finally:
        # a further compiler is created and dropped right before the cleanup
        keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
        shutil.rmtree(test_dir)
Example #25
0
def test_input_output_keys():
    """The compiled dictionary holds as many items as lines were added.

    Each line of the test data file consists of two tab-separated JSON
    documents, the key and the value.
    """
    options = {'compression_threshold': '32', 'compression': 'zlib'}
    compiler = keyvi.JsonDictionaryCompiler(options)

    added = 0
    data_path = os.path.join(root, 'var_length_short_calculation_test_data.tsv')
    with open(data_path) as f_in:
        for line in f_in:
            raw_key, raw_value = line.split('\t')
            compiler.Add(json.loads(raw_key), json.loads(raw_value))
            added += 1

    with tmp_dictionary(compiler, 'var_length_short_test.kv') as d:
        stored = sum(1 for _ in d.GetAllItems())

    assert added == stored
Example #26
0
def test_simple_zlib():
    """Values stay readable with compression ``z`` and the statistics report ``zlib``."""
    options = {
        "memory_limit_mb": "10",
        'compression': 'z',
        'compression_threshold': '0'
    }
    compiler = keyvi.JsonDictionaryCompiler(options)
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')

    expected_strings = (("abc", '{"a":2}'), ("abd", '{"a":3}'))
    with tmp_dictionary(compiler, 'simple_json_z.kv') as d:
        assert len(d) == 2
        for key, expected in expected_strings:
            actual = decode_to_unicode(d[key].GetValueAsString())
            assert actual == decode_to_unicode(expected)

        # the value store statistics echo the compression settings
        value_store = d.GetStatistics()['Value Store']
        assert value_store['__compression'] == decode_to_unicode("zlib")
        assert value_store['__compression_threshold'] == decode_to_unicode("0")
Example #27
0
def test_near_score():
    """Check the scores of the matches GetNear returns with the third argument True.

    The first match must score 21, the next four 11 and all remaining ones 10.
    """
    entries = (
        ("zahnarzt:u0we9yykdyum", '["a" : 2]'),
        ("zahnarzt:u1h2fde2kct3", '["a" : 3]'),
        ("zahnarzt:u1huf1q5cnxn", '["a" : 4]'),
        ("zahnarzt:u0y2dvey61sw", '["a" : 5]'),
        ("zahnarzt:u1hvqmmj801r", '["a" : 6]'),
        ("zahnarzt:u0vvmknrwgmj", '["a" : 7]'),
        ("zahnarzt:u0ypv22fb9q3", '["a" : 8]'),
        ("zahnarzt:u1qcvvw0hxe1", '["a" : 9]'),
        ("zahnarzt:u1xjx6yfvfz2", '["a" : 10]'),
        ("zahnarzt:u1q0gkqsenhf", '["a" : 11]'),
        ("zahnarzt:u0h0gkqsenhf", '["a" : 11]'),
    )
    compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    for key, value in entries:
        compiler.Add(key, value)

    with tmp_dictionary(compiler, 'near_score.kv') as d:
        greedy = list(d.GetNear("zahnarzt:u0h0gkqsenhf", 10, True))
        assert greedy[0].GetScore() == 21
        # (slice of the result, score expected for every match in it)
        for matches, expected_score in ((greedy[1:5], 11), (greedy[5:], 10)):
            for match in matches:
                assert match.GetScore() == expected_score
Example #28
0
def test_compiler_empty_json():
    """A compiler without any added entry produces a dictionary of length 0."""
    empty_compiler = keyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    with test_tools.tmp_dictionary(empty_compiler, 'empty_json.kv') as d:
        entry_count = len(d)
        assert entry_count == 0