Example #1
0
def query():
    '''Issue batched ``find_all`` queries derived from the generated seeds.

    Builds a client from the module-level ``args`` / ``kwargs`` and runs 1000
    query batches, printing a progress line every 25 batches. A client
    exception on a batch is reported and the loop moves on to the next batch.

    NOTE(review): nothing is timed inside this function; any timing is
    presumably collected by the caller or the client -- confirm.
    '''
    seeds = make_seeds()
    client = Client(args.backend, args.name, args.num_blocks, args.num_bits,
        **kwargs)
    for i in range(1000):
        if i % 25 == 0:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print('Querying batch %i' % i)
        # Each seed is a (start, interval) pair; stepping every seed by its
        # own interval gives a different mix of hashes on every batch.
        hashes = [(start + i * interval) for start, interval in seeds]
        try:
            # Only the call itself matters here; the matches are not used.
            client.find_all(hashes)
        except GeneralException as exc:
            print('---> Client exception: %s' % repr(exc))
Example #2
0
def find_duplicates(bithash_corpus, table_name, ntables, nmatchbits,
                    ninserted=20000):
    '''Index the head of a corpus in redis and look up matches for the tail.

    The first ``ninserted`` entries of ``bithash_corpus`` are inserted into
    the table, then every remaining entry is queried against it.

    Args:
        bithash_corpus: sliceable sequence of hashes.
        table_name: passed through to ``Client``.
        ntables: passed through to ``Client``.
        nmatchbits: passed through to ``Client``.
        ninserted: number of leading entries to insert before querying the
            rest (defaults to the previously hard-coded 20000).

    Returns:
        A list of ``(corpus_index, matches)`` tuples, one for every queried
        entry whose match list is non-empty, or ``-1`` if the connection to
        redis failed.
    '''
    try:
        redis_client = Client('redis', table_name, ntables, nmatchbits)
        # Presumably clears any existing table so the run starts empty --
        # confirm against the Client implementation.
        redis_client.delete()
    except ConnectionError:
        print('Connection error. Is redis running? Execute:\n>> redis-server')
        return -1

    # Insert the leading documents.
    redis_client.insert(bithash_corpus[:ninserted])

    # Query everything that was not inserted. The slice starts exactly where
    # the inserted slice ends, so no document is skipped in between.
    dup_lists = redis_client.find_all(bithash_corpus[ninserted:])

    # Translate positions within the queried tail back to corpus indices.
    return [(ninserted + offset, dups)
            for offset, dups in enumerate(dup_lists) if len(dups) > 0]