def test_merge(merger):
    tmp_dir = tempfile.mkdtemp()
    try:
        file_1 = path.join(tmp_dir, 'test_merger_1.kv')
        file_2 = path.join(tmp_dir, 'test_merger_2.kv')
        file_3 = path.join(tmp_dir, 'test_merger_3.kv')
        merge_file = path.join(tmp_dir, 'merge.kv')

        generate_keyvi(keys_1, file_1)
        generate_keyvi(keys_2, file_2)
        generate_keyvi(keys_3, file_3)

        merger.Add(file_1)
        merger.Add(file_2)
        merger.Add(file_3)
        merger.Merge(merge_file)

        merged_dictionary = pykeyvi.Dictionary(merge_file)

        keys = set()
        keys.update(keys_1)
        keys.update(keys_2)
        keys.update(keys_3)
        keys_ordered = sorted(keys)

        for base_key, keyvi_key in zip(keys_ordered, merged_dictionary.GetAllKeys()):
            assert decode_to_unicode(base_key) == decode_to_unicode(keyvi_key)
    finally:
        shutil.rmtree(tmp_dir)
# test helper; intended to be used as a context manager
# (requires: from contextlib import contextmanager)
@contextmanager
def tmp_dictionary(compiler, file_name):
    compiler.Compile()
    compiler.WriteToFile(file_name)
    d = pykeyvi.Dictionary(file_name)
    yield d
    del d
    os.remove(file_name)
def test_truncated_file_json():
    c = pykeyvi.JsonDictionaryCompiler()
    c.Add('a', '{1:2}')
    c.Add('b', '{2:4}')
    c.Add('c', '{4:4}')
    c.Add('d', '{2:3}')
    c.Compile()
    c.WriteToFile('truncation_test.kv')

    size = os.path.getsize('truncation_test.kv')
    # copy only the first half of the file to simulate truncation;
    # open in binary mode and use integer division so this also runs on Python 3
    fd_in = open('truncation_test.kv', 'rb')
    fd = open('truncation_test1.kv', 'wb')
    fd.write(fd_in.read(size // 2))
    fd.close()
    fd_in.close()

    exception_caught = False
    try:
        d = pykeyvi.Dictionary('truncation_test1.kv')
    except ValueError:
        exception_caught = True

    assert exception_caught
    os.remove('truncation_test1.kv')
    os.remove('truncation_test.kv')
def test_truncated_file_json():
    # tmp_dir is expected to be defined at module level
    # (e.g. tempfile.gettempdir())
    c = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add('a', '{1:2}')
    c.Add('b', '{2:4}')
    c.Add('c', '{4:4}')
    c.Add('d', '{2:3}')
    c.Compile()
    c.WriteToFile(os.path.join(tmp_dir, 'truncation_test.kv'))

    size = os.path.getsize(os.path.join(tmp_dir, 'truncation_test.kv'))
    fd_in = open(os.path.join(tmp_dir, 'truncation_test.kv'), 'rb')
    fd = open(os.path.join(tmp_dir, 'truncation_test1.kv'), 'wb')
    fd.write(fd_in.read(int(size / 2)))
    fd.close()
    fd_in.close()

    exception_caught = False
    try:
        d = pykeyvi.Dictionary(os.path.join(tmp_dir, 'truncation_test1.kv'))
    except ValueError:
        exception_caught = True

    assert exception_caught
    os.remove(os.path.join(tmp_dir, 'truncation_test1.kv'))
    os.remove(os.path.join(tmp_dir, 'truncation_test.kv'))
def test_manifest_for_merger():
    try:
        c = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c.Add("abc", '{"a" : 2}')
        c.Compile()
        c.SetManifest({"author": "Zapp Brannigan"})
        c.WriteToFile('manifest_json_merge1.kv')
        del c

        c2 = pykeyvi.JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c2.Add("abd", '{"a" : 3}')
        c2.Compile()
        c2.SetManifest({"author": "Leela"})
        c2.WriteToFile('manifest_json_merge2.kv')
        del c2

        merger = pykeyvi.JsonDictionaryMerger({"memory_limit_mb": "10"})
        # add the two compiled dictionaries so the merge actually combines them
        merger.Add('manifest_json_merge1.kv')
        merger.Add('manifest_json_merge2.kv')
        merger.SetManifest({"author": "Fry"})
        merger.Merge('manifest_json_merged.kv')

        d = pykeyvi.Dictionary('manifest_json_merged.kv')
        m = d.GetManifest()
        assert m['author'] == "Fry"
        del d
    finally:
        os.remove('manifest_json_merge1.kv')
        os.remove('manifest_json_merge2.kv')
        os.remove('manifest_json_merged.kv')
def test_merge(merger):
    tmp_dir = tempfile.mkdtemp()
    try:
        file_1 = path.join(tmp_dir, 'test_merger_1.kv')
        file_2 = path.join(tmp_dir, 'test_merger_2.kv')
        file_3 = path.join(tmp_dir, 'test_merger_3.kv')
        merge_file = path.join(tmp_dir, 'merge.kv')

        generate_keyvi(key_values_1, file_1)
        generate_keyvi(key_values_2, file_2)
        generate_keyvi(key_values_3, file_3)

        merger.Add(file_1)
        merger.Add(file_2)
        merger.Add(file_3)
        merger.Merge(merge_file)

        merged_dictionary = pykeyvi.Dictionary(merge_file)

        key_values = {}
        key_values.update(key_values_1)
        key_values.update(key_values_2)
        key_values.update(key_values_3)
        key_values_ordered = collections.OrderedDict(sorted(key_values.items()))

        for (base_key, base_value), (keyvi_key, keyvi_value) in zip(
                key_values_ordered.items(), merged_dictionary.GetAllItems()):
            assert decode_to_unicode(base_key) == decode_to_unicode(keyvi_key)
            assert base_value == keyvi_value
    finally:
        shutil.rmtree(tmp_dir)
# test helper; intended to be used as a context manager
# (requires: from contextlib import contextmanager)
@contextmanager
def tmp_dictionary(compiler, file_name):
    tmp_dir = tempfile.gettempdir()
    fq_file_name = os.path.join(tmp_dir, file_name)
    compiler.Compile()
    compiler.WriteToFile(fq_file_name)
    del compiler
    d = pykeyvi.Dictionary(fq_file_name)
    yield d
    del d
    os.remove(fq_file_name)
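# A minimal usage sketch for the helper above; it assumes the contextmanager
# decorator shown and a KeyOnlyDictionaryCompiler as used in the manifest
# tests below:
compiler = pykeyvi.KeyOnlyDictionaryCompiler({"memory_limit_mb": "10"})
compiler.Add("Fry")
with tmp_dictionary(compiler, 'usage_example.kv') as d:
    # the compiled file exists only for the lifetime of the with-block
    assert len(list(d.GetAllKeys())) == 1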
def test_invalid_filemagic():
    # write something that is not a keyvi file
    fd = open('broken_file', 'w')
    fd.write('dead beef')
    fd.close()

    exception_caught = False
    try:
        d = pykeyvi.Dictionary('broken_file')
    except ValueError:
        exception_caught = True

    assert exception_caught
    os.remove('broken_file')
def dump(args):
    dictionary = pykeyvi.Dictionary(args.input_file)
    with open(args.output_file, 'w') as file_out:
        for key, value in dictionary.GetAllItems():
            if args.json_dumps:
                key = json.dumps(key)
            if isinstance(key, bytes):
                key = key.decode()
            file_out.write(key)
            if value:
                if args.json_dumps:
                    value = json.dumps(value)
                file_out.write('\t{}'.format(value))
            file_out.write('\n')
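# Hypothetical argparse wiring for dump() above; the argument names are chosen
# to match the attributes the function reads, not any actual keyvi CLI:
import argparse

parser = argparse.ArgumentParser(description='dump a keyvi dictionary to TSV')
parser.add_argument('input_file')
parser.add_argument('output_file')
parser.add_argument('--json-dumps', dest='json_dumps', action='store_true',
                    help='JSON-encode keys and values before writing')
dump(parser.parse_args())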
def test_manifest_after_compile():
    c = pykeyvi.KeyOnlyDictionaryCompiler()
    c.Add("Leela")
    c.Add("Kif")
    c.Compile()
    c.SetManifest({"author": "Zapp Brannigan"})
    file_name = 'brannigan_manifest2.kv'
    try:
        c.WriteToFile(file_name)
        d = pykeyvi.Dictionary(file_name)
        m = d.GetManifest()
        assert m['author'] == "Zapp Brannigan"
        del d
    finally:
        os.remove(file_name)
def test_manifest_after_compile():
    c = pykeyvi.KeyOnlyDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add("Leela")
    c.Add("Kif")
    c.Compile()
    c.SetManifest({"author": "Zapp Brannigan"})
    file_name = os.path.join(tempfile.gettempdir(), 'brannigan_manifest2.kv')
    try:
        c.WriteToFile(file_name)
        d = pykeyvi.Dictionary(file_name)
        m = d.GetManifest()
        assert m['author'] == "Zapp Brannigan"
        del d
    finally:
        os.remove(file_name)
__author__ = 'ankit'

import os

import pykeyvi

keyviquerydatadir = ""
high_freq_queries = {}

for root, dirs, files in os.walk(keyviquerydatadir):
    path = root.split('/')
    for file in files:
        if file.lower().startswith('query_ucrawl'):
            keyvifilePath = os.path.join(root, str(file))
            d = pykeyvi.Dictionary(keyvifilePath)
            allkeys = d.GetAllKeys()
            for query in allkeys:
                query_parts = query.split(" ")
                if len(query_parts) >= 3:
                    # print(query_parts[1:])
                    # join the tail of the query into a string so it can serve
                    # as a dict key (a list is not hashable), and count with
                    # get() instead of Python 2's has_key()
                    q = " ".join(query_parts[1:])
                    high_freq_queries[q] = high_freq_queries.get(q, 0) + 1
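# The tallying above can also be written with collections.Counter; a sketch of
# the per-dictionary inner loop under the same assumptions:
from collections import Counter

high_freq_queries = Counter()
for query in d.GetAllKeys():
    query_parts = query.split(" ")
    if len(query_parts) >= 3:
        high_freq_queries[" ".join(query_parts[1:])] += 1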
import sys

import pykeyvi

d = pykeyvi.Dictionary("normalization.keyvi")
n = pykeyvi.FsaTransform(d)

for line in sys.stdin:
    print(n.Normalize(line))
import msgpack
import snappy
import zlib

import pykeyvi


def decode_value(value):
    """Decodes a cliqztionary value."""
    if value is None or len(value) == 0:
        return None
    elif len(value) > 1 and value[0] == ' ':
        # a leading space marks a zlib-compressed payload
        value = zlib.decompress(value[1:])
    return msgpack.loads(value)


# print the first 20 decoded values
count = 0
d = pykeyvi.Dictionary('pagemodels.kv-0')
items = d.GetAllItems()
for key, value in items:
    if count == 20:
        break
    print(decode_value(value))
    count += 1
def stats(input_file):
    print(json.dumps(pykeyvi.Dictionary(input_file).GetStatistics(),
                     indent=4, sort_keys=True))
import pykeyvi

query = ""
d = pykeyvi.Dictionary("your-own.keyvi")


def get_lookup_key(query):
    return query


while query != "exit":
    query = input("Query:")
    for m in d.Get(get_lookup_key(query.strip())):
        print("{} {}".format(m.GetMatchedString(), m.GetValueAsString()))
import kenlm

model = kenlm.Model("/run/shm/lm_word.bin")
model.perplexity("formatter python files")

from rust_qpick import Qpick
import time
import os
import pykeyvi

os.environ['RUST_BACKTRACE'] = 'full'
qpick = Qpick('/root/dragan/index/')
i2q = pykeyvi.Dictionary('/run/shm/i2q/test_merge.kv')

s = time.time(); res = list(qpick.get('clear tcp connections linux command line', 10)); time.time() - s

qp = Qpick(dir_path="/home/dnc/workspace/cliqz/qpick/index/", start_shard=10, end_shard=15)
qs = [(d, i2q.get(str(id)).GetValue()) for (id, d) in res]

---
supervisorctl start nmslib-10000
...
supervisorctl start nmslib-10009
---

from rust_qpick import Qpick
import time
import os
import pykeyvi

os.environ['RUST_BACKTRACE'] = '1'
qpick = Qpick('/root/dragan/index/')
i2q = pykeyvi.Dictionary('/run/shm/i2q/test_merge.kv')

s = time.time(); res = list(qpick.nget([
import pykeyvi

query = ""
d = pykeyvi.Dictionary("cities.keyvi")


def get_lookup_key(query):
    return query


while query != "exit":
    query = input("Query:")
    for m in d.LookupText(get_lookup_key(query.strip())):
        print("{}".format(m.GetMatchedString()))
import pykeyvi

query = ""
d = pykeyvi.Dictionary("prefix-completion.keyvi")
c = pykeyvi.PrefixCompletion(d)


def get_lookup_key(query):
    return query


while query != "exit":
    query = input("Query:")
    for m in c.GetCompletions(get_lookup_key(query.strip())):
        print("{} {}".format(m.GetMatchedString(), m.GetAttribute("weight")))
#!/usr/bin/python
# -*- coding: UTF-8 -*-

from gensim.models.doc2vec import Doc2Vec
import pykeyvi

docvecs_process_input_keyvi_index_file = "docvecs_urlid_url.kv"
output_data_path = "/raid/ankit/doc2vec/out_s_p_1M"
doc2vec_trained_model = 'pages_with_spaces.doc2vec'
_alpha, _min_alpha, _passes = (0.020, 0.001, 20)

print("Loading keyvi dictionaries ...")
keyvi_dict = pykeyvi.Dictionary("{}/{}".format(output_data_path, docvecs_process_input_keyvi_index_file))
print("Finished Loading key-vi Dictionary.")

print("Loading Doc2Vec Model ... ")
model = Doc2Vec.load("{}/{}".format(output_data_path, doc2vec_trained_model))
print("Model Loaded Successfully!")


def get_similar_urls(sample_query, nearest_num):
    tokens = sample_query.lower().split()
    # note: may want to use many more steps than the default
    dv = model.infer_vector(tokens, alpha=_alpha, min_alpha=_min_alpha, steps=_passes)
    sims = model.docvecs.most_similar(positive=[dv], topn=nearest_num)
    for url_id, distance in sims:
        url = ""
        for m in keyvi_dict.Get(str(url_id)):
            url = m.GetValueAsString()
        print("{}\t{}\t{}".format(url_id, url, distance))


def main():
    print("\nSimilar URLS for Queries - Doc2Vec Retrieval Interface [All URL's]")
import pykeyvi

MULTIWORD_QUERY_SEPARATOR = '\x1b'

query = ""
d = pykeyvi.Dictionary("mw-completion.keyvi")
c = pykeyvi.MultiWordCompletion(d)


def get_lookup_key(query):
    # bag-of-words: sort all tokens except the last, partially typed one
    l = query.split(" ")
    l_bow = " ".join(sorted(l[:-1]) + l[-1:])
    return l_bow


while query != "exit":
    query = input("Query:")
    for m in c.GetCompletions(get_lookup_key(query.strip())):
        print("{} {}".format(m.GetMatchedString(), m.GetAttribute("weight")))