def benchmark_ground_truth(threshold, index_data, query_data):
    """Time a brute-force scan of every query set against every indexed set.

    Each query set is compared to each indexed set with ``_compute_jaccard``;
    pairs whose score is at least ``threshold`` are kept.

    Args:
        threshold: Minimum score (inclusive) for a match to be reported.
        index_data: Object exposing parallel ``keys`` and ``sets`` sequences.
        query_data: Object exposing a ``sets`` sequence of query sets.

    Returns:
        A ``(times, results)`` tuple with one entry per query. ``times[i]`` is
        the elapsed time in seconds of the scan for query ``i``;
        ``results[i]`` is a list of ``[key, score]`` pairs sorted by
        descending score.
    """
    times = []
    results = []
    for q in query_data.sets:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for timing short intervals.
        start = time.perf_counter()
        result = []
        for key, a in zip(index_data.keys, index_data.sets):
            j = _compute_jaccard(q, a)
            if j >= threshold:
                result.append([key, j])
        duration = time.perf_counter() - start
        # Sorting is done after the timer stops so it is not measured.
        results.append(sorted(result, key=lambda x: x[1], reverse=True))
        times.append(duration)
    return times, results
def benchmark_linearscan(num_perm, threshold, index_data, query_data):
    """Time a linear scan that filters candidates by MinHash similarity.

    For each query, every indexed MinHash is compared with the query MinHash
    via ``q.jaccard(m)``; keys whose estimate is at least ``threshold`` are
    kept. Only this filtering step is timed. The kept keys are then re-scored
    with ``_compute_jaccard`` on the underlying sets.

    Args:
        num_perm: Key used to select the MinHash collection from the
            ``minhashes`` mapping of both data objects.
        threshold: Minimum estimated similarity (inclusive) for a candidate.
        index_data: Object exposing ``keys``, ``sets`` (indexable by key) and
            ``minhashes[num_perm]``, with ``keys`` and ``minhashes[num_perm]``
            iterated in parallel.
        query_data: Object exposing parallel ``sets`` and
            ``minhashes[num_perm]`` sequences.

    Returns:
        A ``(times, results)`` tuple with one entry per query. ``times[i]`` is
        the elapsed time in seconds of the filtering scan; ``results[i]`` is a
        list of ``[key, score]`` pairs sorted by descending score.
    """
    times = []
    results = []
    for qs, q in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for timing short intervals.
        start = time.perf_counter()
        result = [
            key
            for key, m in zip(index_data.keys, index_data.minhashes[num_perm])
            if q.jaccard(m) >= threshold
        ]
        duration = time.perf_counter() - start
        times.append(duration)
        # Re-scoring on the raw sets happens outside the timed region.
        scored = [[key, _compute_jaccard(qs, index_data.sets[key])]
                  for key in result]
        results.append(sorted(scored, key=lambda x: x[1], reverse=True))
    return times, results
# Example #3
# 0
def benchmark_lsh(num_perm, threshold, index_data, query_data):
    """Build a ``MinHashLSH`` index and time one query per query MinHash.

    Every ``(key, minhash)`` pair from ``index_data`` is inserted into a
    ``MinHashLSH(threshold, num_perm)`` instance. Each query MinHash is then
    passed to ``lsh.query``; only that call is timed. The returned keys are
    re-scored with ``_compute_jaccard`` on the underlying sets.

    Args:
        num_perm: Passed to ``MinHashLSH`` and used as the key into the
            ``minhashes`` mapping of both data objects.
        threshold: Passed to ``MinHashLSH`` as its first argument.
        index_data: Object exposing ``keys``, ``sets`` (indexable by key) and
            ``minhashes[num_perm]``, with ``keys`` and ``minhashes[num_perm]``
            iterated in parallel.
        query_data: Object exposing parallel ``sets`` and
            ``minhashes[num_perm]`` sequences.

    Returns:
        A ``(times, results)`` tuple with one entry per query. ``times[i]`` is
        the elapsed time in seconds of the ``lsh.query`` call; ``results[i]``
        is a list of ``[key, score]`` pairs sorted by descending score.
    """
    print("Building LSH index")
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for timing short intervals.
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        # Re-scoring on the raw sets happens outside the timed region.
        scored = [[key, _compute_jaccard(qs, index_data.sets[key])]
                  for key in result]
        results.append(sorted(scored, key=lambda x: x[1], reverse=True))
    return times, results

def benchmark_lsh(num_perm, threshold, index_data, query_data):
    """Build a ``MinHashLSH`` index and time one query per query MinHash.

    Every ``(key, minhash)`` pair from ``index_data`` is inserted into a
    ``MinHashLSH(threshold, num_perm)`` instance. Each query MinHash is then
    passed to ``lsh.query``; only that call is timed. The returned keys are
    re-scored with ``_compute_jaccard`` on the underlying sets.

    Args:
        num_perm: Passed to ``MinHashLSH`` and used as the key into the
            ``minhashes`` mapping of both data objects.
        threshold: Passed to ``MinHashLSH`` as its first argument.
        index_data: Object exposing ``keys``, ``sets`` (indexable by key) and
            ``minhashes[num_perm]``, with ``keys`` and ``minhashes[num_perm]``
            iterated in parallel.
        query_data: Object exposing parallel ``sets`` and
            ``minhashes[num_perm]`` sequences.

    Returns:
        A ``(times, results)`` tuple with one entry per query. ``times[i]`` is
        the elapsed time in seconds of the ``lsh.query`` call; ``results[i]``
        is a list of ``[key, score]`` pairs sorted by descending score.
    """
    print("Building LSH index")
    lsh = MinHashLSH(threshold, num_perm)
    # The original line read ``z:xip(...)``, which is a syntax error; the
    # intended call is the builtin zip().
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for timing short intervals.
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        # Re-scoring on the raw sets happens outside the timed region.
        scored = [[key, _compute_jaccard(qs, index_data.sets[key])]
                  for key in result]
        results.append(sorted(scored, key=lambda x: x[1], reverse=True))
    return times, results


def benchmark_linearscan(num_perm, threshold, index_data, query_data):
    times = []
    results = []
    for qs, q in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.clock()
        result = []
        for key, m in zip(index_data.keys, index_data.minhashes[num_perm]):
            j = q.jaccard(m)
            if j >= threshold:
                result.append(key)