Esempio n. 1
0
def test_merge(merger):
    """Merge three key-only dictionaries and verify the merged key set.

    Three keyvi files are generated from ``keys_1``, ``keys_2`` and
    ``keys_3``, merged through *merger*, and the keys of the resulting
    dictionary are compared, in order, against the sorted union of the
    inputs.

    :param merger: merger object providing ``Add(file)`` and ``Merge(file)``.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        file_1 = path.join(tmp_dir, 'test_merger_1.kv')
        file_2 = path.join(tmp_dir, 'test_merger_2.kv')
        file_3 = path.join(tmp_dir, 'test_merger_3.kv')
        merge_file = path.join(tmp_dir, 'merge.kv')

        generate_keyvi(keys_1, file_1)
        generate_keyvi(keys_2, file_2)
        generate_keyvi(keys_3, file_3)

        merger.Add(file_1)
        merger.Add(file_2)
        merger.Add(file_3)
        merger.Merge(merge_file)

        merged_dictionary = pykeyvi.Dictionary(merge_file)

        keys = set()
        keys.update(keys_1)
        keys.update(keys_2)
        keys.update(keys_3)

        keys_ordered = sorted(keys)
        merged_keys = list(merged_dictionary.GetAllKeys())

        # zip() stops at the shorter sequence, so without this check a merged
        # dictionary with missing (or extra) keys would pass unnoticed.
        assert len(merged_keys) == len(keys_ordered)

        for base_key, keyvi_key in zip(keys_ordered, merged_keys):
            assert decode_to_unicode(base_key) == decode_to_unicode(keyvi_key)

    finally:
        # Always discard the scratch directory, even when an assertion fails.
        shutil.rmtree(tmp_dir)
Esempio n. 2
0
def tmp_dictionary(compiler, file_name):
    """Compile *compiler* to *file_name* and yield the loaded dictionary.

    This is a generator with a single ``yield``; presumably it is wrapped
    with ``contextlib.contextmanager`` outside this view (TODO confirm).
    The file is removed once the consumer is done with the dictionary.

    :param compiler: compiler object providing ``Compile()`` and
        ``WriteToFile(file_name)``.
    :param file_name: path the compiled dictionary is written to.
    """
    compiler.Compile()
    compiler.WriteToFile(file_name)
    d = None
    try:
        d = pykeyvi.Dictionary(file_name)
        yield d
    finally:
        # Run the cleanup even if the consumer raised (e.g. a failed assert),
        # otherwise the file would be left behind.
        del d
        os.remove(file_name)
Esempio n. 3
0
def test_truncated_file_json():
    """Loading a dictionary file cut to half its size must raise ValueError."""
    c = pykeyvi.JsonDictionaryCompiler()
    c.Add('a', '{1:2}')
    c.Add('b', '{2:4}')
    c.Add('c', '{4:4}')
    c.Add('d', '{2:3}')
    c.Compile()

    try:
        c.WriteToFile('truncation_test.kv')
        size = os.path.getsize('truncation_test.kv')

        # The dictionary is binary data: copy it in binary mode and use
        # integer division (``size / 2`` is a float on Python 3, which
        # ``read`` rejects).
        with open('truncation_test.kv', 'rb') as fd_in:
            with open('truncation_test1.kv', 'wb') as fd:
                fd.write(fd_in.read(size // 2))

        exception_caught = False
        try:
            pykeyvi.Dictionary('truncation_test1.kv')
        except ValueError:
            exception_caught = True

        assert exception_caught
    finally:
        # Remove whatever was created, even when the test fails midway.
        for file_name in ('truncation_test1.kv', 'truncation_test.kv'):
            if os.path.exists(file_name):
                os.remove(file_name)
Esempio n. 4
0
def test_truncated_file_json():
    """Loading a dictionary file cut to half its size must raise ValueError.

    Files are created inside ``tmp_dir``, which is expected to be defined at
    module level outside this block.
    """
    c = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add('a', '{1:2}')
    c.Add('b', '{2:4}')
    c.Add('c', '{4:4}')
    c.Add('d', '{2:3}')
    c.Compile()

    full_file = os.path.join(tmp_dir, 'truncation_test.kv')
    truncated_file = os.path.join(tmp_dir, 'truncation_test1.kv')

    try:
        c.WriteToFile(full_file)
        size = os.path.getsize(full_file)

        # Context managers close both handles; the original leaked the
        # read handle.
        with open(full_file, 'rb') as fd_in:
            with open(truncated_file, 'wb') as fd:
                fd.write(fd_in.read(int(size / 2)))

        exception_caught = False
        try:
            pykeyvi.Dictionary(truncated_file)
        except ValueError:
            exception_caught = True

        assert exception_caught
    finally:
        # Remove whatever was created, even when the test fails midway.
        for file_name in (truncated_file, full_file):
            if os.path.exists(file_name):
                os.remove(file_name)
Esempio n. 5
0
def test_manifest_for_merger():
    """A manifest set on the merger must end up in the merged dictionary.

    Two JSON dictionaries with their own manifests are written, then a
    merger with a third manifest writes the merged file, whose manifest is
    expected to be the merger's.
    """
    created_files = (
        'manifest_json_merge1.kv',
        'manifest_json_merge2.kv',
        'manifest_json_merged.kv',
    )
    try:
        c = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c.Add("abc", '{"a" : 2}')
        c.Compile()
        c.SetManifest({"author": "Zapp Brannigan"})
        c.WriteToFile('manifest_json_merge1.kv')
        del c

        c2 = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c2.Add("abd", '{"a" : 3}')
        c2.Compile()
        c2.SetManifest({"author": "Leela"})
        c2.WriteToFile('manifest_json_merge2.kv')
        del c2

        # NOTE(review): the two files written above are never passed to
        # merger.Add(); confirm whether merging without inputs is intended.
        merger = pykeyvi.JsonDictionaryMerger({"memory_limit_mb": "10"})
        merger.SetManifest({"author": "Fry"})
        merger.Merge('manifest_json_merged.kv')

        d = pykeyvi.Dictionary('manifest_json_merged.kv')
        m = d.GetManifest()
        assert m['author'] == "Fry"
        del d

    finally:
        # Only remove files that were actually created, so an early failure
        # is not masked by a missing-file error and later files still get
        # cleaned up.
        for file_name in created_files:
            if os.path.exists(file_name):
                os.remove(file_name)
Esempio n. 6
0
def test_merge(merger):
    """Merge three key/value dictionaries and verify the merged items.

    Three keyvi files are generated from ``key_values_1``, ``key_values_2``
    and ``key_values_3``, merged through *merger*, and the items of the
    resulting dictionary are compared, in order, against the key-sorted
    union of the inputs (later inputs win on duplicate keys).

    :param merger: merger object providing ``Add(file)`` and ``Merge(file)``.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        file_1 = path.join(tmp_dir, 'test_merger_1.kv')
        file_2 = path.join(tmp_dir, 'test_merger_2.kv')
        file_3 = path.join(tmp_dir, 'test_merger_3.kv')
        merge_file = path.join(tmp_dir, 'merge.kv')

        generate_keyvi(key_values_1, file_1)
        generate_keyvi(key_values_2, file_2)
        generate_keyvi(key_values_3, file_3)

        merger.Add(file_1)
        merger.Add(file_2)
        merger.Add(file_3)
        merger.Merge(merge_file)

        merged_dictionary = pykeyvi.Dictionary(merge_file)

        key_values = {}
        key_values.update(key_values_1)
        key_values.update(key_values_2)
        key_values.update(key_values_3)

        expected_items = sorted(key_values.items())
        merged_items = list(merged_dictionary.GetAllItems())

        # zip() stops at the shorter sequence, so without this check a merged
        # dictionary with missing (or extra) items would pass unnoticed.
        assert len(merged_items) == len(expected_items)

        for (base_key, base_value), (keyvi_key, keyvi_value) in zip(
                expected_items, merged_items):
            assert decode_to_unicode(base_key) == decode_to_unicode(keyvi_key)
            assert base_value == keyvi_value

    finally:
        # Always discard the scratch directory, even when an assertion fails.
        shutil.rmtree(tmp_dir)
Esempio n. 7
0
def tmp_dictionary(compiler, file_name):
    """Compile *compiler* into the system temp dir and yield the dictionary.

    This is a generator with a single ``yield``; presumably it is wrapped
    with ``contextlib.contextmanager`` outside this view (TODO confirm).
    The file is removed once the consumer is done with the dictionary.

    :param compiler: compiler object providing ``Compile()`` and
        ``WriteToFile(file_name)``.
    :param file_name: file name (joined onto ``tempfile.gettempdir()``).
    """
    tmp_dir = tempfile.gettempdir()
    fq_file_name = os.path.join(tmp_dir, file_name)
    compiler.Compile()
    compiler.WriteToFile(fq_file_name)
    del compiler
    d = None
    try:
        d = pykeyvi.Dictionary(fq_file_name)
        yield d
    finally:
        # Run the cleanup even if the consumer raised (e.g. a failed assert),
        # otherwise the file would be left behind in the temp directory.
        del d
        os.remove(fq_file_name)
Esempio n. 8
0
def test_invalid_filemagic():
    """Loading a file that is not a keyvi dictionary must raise ValueError."""
    try:
        # The context manager closes the handle even if the write fails.
        with open('broken_file', 'w') as fd:
            fd.write('dead beef')

        exception_caught = False
        try:
            pykeyvi.Dictionary('broken_file')
        except ValueError:
            exception_caught = True

        assert exception_caught
    finally:
        # Remove the fixture even when the assertion fails.
        if os.path.exists('broken_file'):
            os.remove('broken_file')
Esempio n. 9
0
def dump(args):
    """Write all items of a keyvi dictionary to a text file, one per line.

    Each line holds the key, followed by a tab and the value when the value
    is truthy. With ``args.json_dumps`` set, key and value are serialized
    with ``json.dumps`` before being written.

    :param args: object with ``input_file``, ``output_file`` and
        ``json_dumps`` attributes.
    """
    source = pykeyvi.Dictionary(args.input_file)
    with open(args.output_file, 'w') as sink:
        for item_key, item_value in source.GetAllItems():
            if args.json_dumps:
                item_key = json.dumps(item_key)
            # Keys may come back as bytes; the output file is text.
            if isinstance(item_key, bytes):
                item_key = item_key.decode()
            sink.write(item_key)

            if item_value:
                rendered = (json.dumps(item_value) if args.json_dumps
                            else item_value)
                sink.write('\t{}'.format(rendered))

            sink.write('\n')
Esempio n. 10
0
def test_manifest_after_compile():
    """A manifest set after Compile() must still be stored in the file."""
    compiler = pykeyvi.KeyOnlyDictionaryCompiler()
    for key in ("Leela", "Kif"):
        compiler.Add(key)
    compiler.Compile()
    # The manifest is set after compilation on purpose.
    compiler.SetManifest({"author": "Zapp Brannigan"})

    file_name = 'brannigan_manifest2.kv'
    try:
        compiler.WriteToFile(file_name)
        dictionary = pykeyvi.Dictionary(file_name)
        manifest = dictionary.GetManifest()
        assert manifest['author'] == "Zapp Brannigan"
        del dictionary
    finally:
        os.remove(file_name)
Esempio n. 11
0
def test_manifest_after_compile():
    """A manifest set after Compile() must still be stored in the file."""
    compiler = pykeyvi.KeyOnlyDictionaryCompiler({"memory_limit_mb": "10"})
    for key in ("Leela", "Kif"):
        compiler.Add(key)
    compiler.Compile()
    # The manifest is set after compilation on purpose.
    compiler.SetManifest({"author": "Zapp Brannigan"})

    file_name = os.path.join(tempfile.gettempdir(), 'brannigan_manifest2.kv')
    try:
        compiler.WriteToFile(file_name)
        dictionary = pykeyvi.Dictionary(file_name)
        manifest = dictionary.GetManifest()
        assert manifest['author'] == "Zapp Brannigan"
        del dictionary
    finally:
        os.remove(file_name)
Esempio n. 12
0
__author__ = 'ankit'
import pykeyvi
import os

# Directory tree scanned for keyvi files whose name starts with
# "query_ucrawl" (case-insensitive).
keyviquerydatadir = ""
# Maps a query with its first token removed to the number of times it was
# seen across all scanned dictionaries.
high_freq_queries = {}

for root, dirs, files in os.walk(keyviquerydatadir):
    for file_name in files:
        if not file_name.lower().startswith('query_ucrawl'):
            continue
        keyvifilePath = os.path.join(root, file_name)
        d = pykeyvi.Dictionary(keyvifilePath)
        for query in d.GetAllKeys():
            query_parts = query.split(" ")
            # Only keys with at least three space-separated tokens count.
            if len(query_parts) >= 3:
                # A list slice is unhashable and cannot be a dict key, so
                # re-join the remaining tokens into a string.
                q = " ".join(query_parts[1:])
                high_freq_queries[q] = high_freq_queries.get(q, 0) + 1



Esempio n. 13
0
import sys
import pykeyvi

# Load the normalization dictionary and wrap it in an FSA transformer.
d = pykeyvi.Dictionary("normalization.keyvi")
n = pykeyvi.FsaTransform(d)

# Normalize standard input line by line. The call form of print works on
# both Python 2 and Python 3 for a single argument.
for line in sys.stdin:
    print(n.Normalize(line))
Esempio n. 14
0
import msgpack
import zlib
import snappy
import pykeyvi


def decode_value(value):
    """Decode a cliqztionary value.

    A value longer than one byte that starts with a space is treated as
    zlib-compressed (the space is a marker and is stripped before
    decompression). The resulting payload is unpacked with msgpack.

    :param value: raw value, or ``None``.
    :returns: the unpacked object, or ``None`` for ``None``/empty input.
    """
    if value is None or len(value) == 0:
        return None
    # Compare a one-element slice against a bytes literal: on Python 3,
    # indexing bytes yields an int, so ``value[0] == ' '`` is never true and
    # compressed values were passed to msgpack undecompressed.
    if len(value) > 1 and value[:1] == b' ':
        value = zlib.decompress(value[1:])
    return msgpack.loads(value)


# Print the decoded values of the first 20 items of the dictionary.
count = 0
d = pykeyvi.Dictionary('pagemodels.kv-0')
items = d.GetAllItems()
for key, value in items:
    if count == 20:
        break
    # The call form of print works on both Python 2 and Python 3.
    print(decode_value(value))
    count += 1
Esempio n. 15
0
def stats(input_file):
    """Print the statistics of the dictionary at *input_file* as JSON.

    The output is indented by four spaces with keys sorted.
    """
    statistics = pykeyvi.Dictionary(input_file).GetStatistics()
    rendered = json.dumps(statistics, indent=4, sort_keys=True)
    print(rendered)
Esempio n. 16
0
import pykeyvi

query = ""

d = pykeyvi.Dictionary("your-own.keyvi")


def get_lookup_key(query):
    """Return the dictionary lookup key for *query* (currently unchanged)."""
    return query


# ``raw_input`` only exists on Python 2; Python 3 renamed it to ``input``.
try:
    read_query = raw_input
except NameError:
    read_query = input

# Interactive loop: look up each entered query until "exit" or end of input.
while True:
    try:
        query = read_query("Query:")
    except EOFError:
        # End of input (e.g. Ctrl-D) ends the session without a traceback.
        break
    # "exit" is a command, not a query: stop before looking it up.
    if query == "exit":
        break
    for m in d.Get(get_lookup_key(query.strip())):
        print("{} {}".format(m.GetMatchedString(), m.GetValueAsString()))
Esempio n. 17
0
# Interactive scratch session: exploratory statements, not a runnable module.

# Load a KenLM language model and score a sample sentence.
import kenlm
model = kenlm.Model("/run/shm/lm_word.bin")
model.perplexity("formatter python files")

from rust_qpick import Qpick
import time
import os
import pykeyvi
# Ask the Rust side for full backtraces.
os.environ['RUST_BACKTRACE']='full'
qpick = Qpick('/root/dragan/index/')
# NOTE(review): presumably maps ids to queries, judging by the name and the
# lookup below; confirm.
i2q = pykeyvi.Dictionary('/run/shm/i2q/test_merge.kv')
# Time one query; the trailing expression only shows the elapsed seconds
# when run in a REPL.
s = time.time(); res = list(qpick.get('clear tcp connections linux command line', 10)); time.time() -s

# Alternative index opened for a shard range (10 to 15).
qp = Qpick(dir_path="/home/dnc/workspace/cliqz/qpick/index/", start_shard=10, end_shard=15)

# Pair the second element of each result tuple with the i2q value looked up
# by the first element.
qs = [(d, i2q.get(str(id)).GetValue()) for (id, d) in res]

---
supervisorctl start nmslib-10000
...
supervisorctl start nmslib-10009
---
from rust_qpick import Qpick
import time
import os
import pykeyvi
os.environ['RUST_BACKTRACE']='1'
qpick = Qpick('/root/dragan/index/')
i2q = pykeyvi.Dictionary('/run/shm/i2q/test_merge.kv')
s = time.time(); res = list(qpick.nget(
    [
Esempio n. 18
0
import pykeyvi

query = ""

d = pykeyvi.Dictionary("cities.keyvi")


def get_lookup_key(query):
    """Return the dictionary lookup key for *query* (currently unchanged)."""
    return query


# ``raw_input`` only exists on Python 2; Python 3 renamed it to ``input``.
try:
    read_query = raw_input
except NameError:
    read_query = input

# Interactive loop: run a text lookup for each entered query until "exit"
# or end of input.
while True:
    try:
        query = read_query("Query:")
    except EOFError:
        # End of input (e.g. Ctrl-D) ends the session without a traceback.
        break
    # "exit" is a command, not a query: stop before looking it up.
    if query == "exit":
        break
    for m in d.LookupText(get_lookup_key(query.strip())):
        print("{}".format(m.GetMatchedString()))
Esempio n. 19
0
import pykeyvi

query = ""

d = pykeyvi.Dictionary("prefix-completion.keyvi")
c = pykeyvi.PrefixCompletion(d)


def get_lookup_key(query):
    """Return the completion lookup key for *query* (currently unchanged)."""
    return query


# ``raw_input`` only exists on Python 2; Python 3 renamed it to ``input``.
try:
    read_query = raw_input
except NameError:
    read_query = input

# Interactive loop: print completions with their "weight" attribute for each
# entered query until "exit" or end of input.
while True:
    try:
        query = read_query("Query:")
    except EOFError:
        # End of input (e.g. Ctrl-D) ends the session without a traceback.
        break
    # "exit" is a command, not a query: stop before looking it up.
    if query == "exit":
        break
    for m in c.GetCompletions(get_lookup_key(query.strip())):
        print("{} {}".format(m.GetMatchedString(), m.GetAttribute("weight")))
Esempio n. 20
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from gensim.models.doc2vec import Doc2Vec
import pykeyvi

# Input locations: both files are resolved relative to output_data_path.
docvecs_process_input_keyvi_index_file = "docvecs_urlid_url.kv"
output_data_path = "/raid/ankit/doc2vec/out_s_p_1M"
doc2vec_trained_model = 'pages_with_spaces.doc2vec'
# Passed to infer_vector() as alpha, min_alpha and steps respectively.
_alpha, _min_alpha, _passes = (0.020, 0.001, 20)

# The call form of print works on both Python 2 and Python 3 for a single
# argument.
print("Loading keyvi dictionaries ...")
keyvi_dict = pykeyvi.Dictionary("{}/{}".format(output_data_path, docvecs_process_input_keyvi_index_file))
print("Finished Loading key-vi Dictionary.")

print("Loading Doc2Vec Model ... ")
model = Doc2Vec.load("{}/{}".format(output_data_path, doc2vec_trained_model))
print("Model Loaded Successfully!")


def get_similar_urls(sample_query, nearest_num):
    """Print the documents most similar to *sample_query*.

    The query is lower-cased and whitespace-tokenized, a vector is inferred
    for it with the module-level Doc2Vec ``model``, and for each of the
    ``nearest_num`` most similar document vectors one tab-separated line
    ``url_id, url, distance`` is printed. The url is looked up in the
    module-level ``keyvi_dict`` and stays empty when there is no match.

    :param sample_query: query text.
    :param nearest_num: number of neighbours to retrieve (``topn``).
    """
    tokens = sample_query.lower().split()
    # note: may want to use many more steps than default
    dv = model.infer_vector(tokens, alpha=_alpha, min_alpha=_min_alpha, steps=_passes)
    sims = model.docvecs.most_similar(positive=[dv], topn=nearest_num)
    for url_id, distance in sims:
        url = ""
        # Keeps the value of the last match, if any.
        for m in keyvi_dict.Get(str(url_id)):
            url = m.GetValueAsString()
        # The call form of print works on both Python 2 and Python 3.
        print("{}\t{}\t{}".format(url_id, url, distance))

def main():
    """Print the banner of the retrieval interface."""
    # The call form of print works on both Python 2 and Python 3.
    print("\nSimilar URLS for Queries - Doc2Vec Retrieval Interface [All URL's]")
import pykeyvi

# ESC character; not referenced by the code visible in this snippet.
MULTIWORD_QUERY_SEPARATOR = '\x1b'

# Last query entered; the loop below runs until it equals "exit".
query = ""

# Dictionary backing the multi-word completer used by the query loop.
d = pykeyvi.Dictionary("mw-completion.keyvi")
c = pykeyvi.MultiWordCompletion(d)


def get_lookup_key(query):
    """Build the bag-of-words lookup key for *query*.

    The query is split on single spaces; every token except the last is
    sorted, the last token stays in final position, and the tokens are
    joined with single spaces again.
    """
    tokens = query.split(" ")
    leading, trailing = tokens[:-1], tokens[-1:]
    leading.sort()
    return " ".join(leading + trailing)


# ``raw_input`` only exists on Python 2; Python 3 renamed it to ``input``.
try:
    read_query = raw_input
except NameError:
    read_query = input

# Interactive loop: print completions with their "weight" attribute for each
# entered query until "exit" or end of input.
while True:
    try:
        query = read_query("Query:")
    except EOFError:
        # End of input (e.g. Ctrl-D) ends the session without a traceback.
        break
    # "exit" is a command, not a query: stop before looking it up.
    if query == "exit":
        break
    for m in c.GetCompletions(get_lookup_key(query.strip())):
        print("{} {}".format(m.GetMatchedString(), m.GetAttribute("weight")))