def query():
    '''Run the timing numbers for each of the provided seeds for query all'''
    # `make_seeds`, `Client`, `GeneralException`, `args` and `kwargs` are
    # presumably defined elsewhere in the surrounding benchmark script
    seeds = make_seeds()
    client = Client(args.backend, args.name, args.num_blocks, args.num_bits,
                    **kwargs)
    for i in range(1000):
        if i % 25 == 0:
            print 'Querying batch %i' % i
        # We want to get a relatively good mix each time we insert data
        hashes = [(start + i * interval) for start, interval in seeds]
        try:
            results = client.find_all(hashes)
        except GeneralException as exc:
            print '---> Client exception: %s' % repr(exc)
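query() leans on names that are not defined in the snippet itself (make_seeds, Client, GeneralException, args, kwargs). The only property it needs from make_seeds is that it returns (start, interval) pairs, so batch i queries the hashes start + i * interval. Below is a minimal, hypothetical stand-in for such a helper; the defaults and ranges are made up for illustration, not taken from the original script.

import random


def make_seeds(count=100, num_bits=64):
    '''Hypothetical stand-in: return (start, interval) integer pairs.

    query() above only relies on being able to compute
    start + i * interval for each batch i.
    '''
    max_hash = (1 << num_bits) - 1
    return [(random.randint(0, max_hash), random.randint(1, 1000))
            for _ in range(count)]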
def find_duplicates(bithash_corpus, table_name, ntables, nmatchbits):
    try:
        redis_client = Client('redis', table_name, ntables, nmatchbits)
        redis_client.delete()
    except ConnectionError:
        print 'Connection error. Is redis running? Execute:\n>> redis-server'
        return -1
    # Insert the first 20k documents
    redis_client.insert(bithash_corpus[:20000])
    # Find duplicates of the remaining ~2k documents
    dup_lists = redis_client.find_all(bithash_corpus[20000:])
    # Keep (corpus index, matching hashes) pairs for every queried document
    # that matched at least one stored hash
    idxs_and_simhashes = [(20000 + idx, dups)
                          for idx, dups in enumerate(dup_lists)
                          if len(dups) > 0]
    return idxs_and_simhashes
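find_duplicates expects a list of pre-computed simhash values (integers) plus the Redis table parameters, and returns either -1 on a connection failure or a list of (corpus index, matching hashes) pairs. A hypothetical driver is sketched below, assuming a local redis-server is running; the dummy corpus, table name, and block/bit counts are made up for illustration.

import random

# Dummy corpus of random 64-bit values standing in for real document simhashes
bithash_corpus = [random.getrandbits(64) for _ in range(22000)]

duplicates = find_duplicates(bithash_corpus, 'dedup_bench', 6, 3)
if duplicates == -1:
    print 'Could not reach redis'
else:
    print 'Found %i documents with at least one match' % len(duplicates)
    for idx, matches in duplicates[:10]:
        print 'Document %i matched %i stored hashes' % (idx, len(matches))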