def benchmark_ground_truth(threshold, index_data, query_data):
    """Time an exhaustive exact-similarity search for every query set.

    Each query set is scored against every indexed set with
    ``_compute_jaccard``; pairs scoring at least ``threshold`` are kept.

    Args:
        threshold: Minimum score (inclusive) for an indexed set to be kept.
        index_data: Object exposing ``keys`` and ``sets``, iterated in
            lockstep so that each key is paired with one set.
        query_data: Object exposing ``sets``, the query sets to run.

    Returns:
        A ``(times, results)`` tuple with one entry per query:
        ``times`` holds the elapsed seconds spent scoring, and ``results``
        holds lists of ``[key, score]`` pairs sorted by descending score.
    """
    times = []
    results = []
    for q in query_data.sets:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring short durations.
        start = time.perf_counter()
        result = []
        for key, a in zip(index_data.keys, index_data.sets):
            j = _compute_jaccard(q, a)
            if j >= threshold:
                result.append([key, j])
        duration = time.perf_counter() - start
        # Ranking happens after the clock is stopped, so sorting cost is
        # deliberately excluded from the reported time.
        results.append(sorted(result, key=lambda x: x[1], reverse=True))
        times.append(duration)
    return times, results
def benchmark_linearscan(num_perm, threshold, index_data, query_data):
    """Time a brute-force scan over the stored minhashes for every query.

    For each query, every indexed minhash is compared via the query
    minhash's ``jaccard`` method and keys scoring at least ``threshold``
    are collected. The collected keys are then rescored with
    ``_compute_jaccard`` on the underlying sets and ranked.

    Args:
        num_perm: Key used to select the minhash collection from
            ``index_data.minhashes`` and ``query_data.minhashes``.
        threshold: Minimum ``jaccard`` value (inclusive) for a key to be
            kept as a candidate.
        index_data: Object exposing ``keys``, ``sets`` and ``minhashes``.
            ``sets`` is looked up by key here
            (NOTE(review): presumably keys are positions into ``sets`` --
            confirm against the data loader).
        query_data: Object exposing ``sets`` and ``minhashes``, iterated in
            lockstep.

    Returns:
        A ``(times, results)`` tuple with one entry per query:
        ``times`` holds the elapsed seconds of the scan, and ``results``
        holds lists of ``[key, score]`` pairs sorted by descending score,
        where ``score`` comes from ``_compute_jaccard``.
    """
    times = []
    results = []
    for qs, q in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring short durations.
        start = time.perf_counter()
        result = []
        for key, m in zip(index_data.keys, index_data.minhashes[num_perm]):
            j = q.jaccard(m)
            if j >= threshold:
                result.append(key)
        duration = time.perf_counter() - start
        times.append(duration)
        # Rescoring and ranking run outside the timed region.
        results.append(
            sorted(
                [[key, _compute_jaccard(qs, index_data.sets[key])]
                 for key in result],
                key=lambda x: x[1],
                reverse=True,
            )
        )
    return times, results
def benchmark_lsh(num_perm, threshold, index_data, query_data):
    """Build a ``MinHashLSH`` index and time each query against it.

    Every ``(key, minhash)`` pair from the index data is inserted into a
    ``MinHashLSH(threshold, num_perm)`` instance. Each query minhash is
    then looked up; only the ``lsh.query`` call is timed. The keys it
    returns are rescored with ``_compute_jaccard`` on the underlying sets
    and ranked. No threshold filter is applied after rescoring.

    Args:
        num_perm: Passed to ``MinHashLSH`` and used to select the minhash
            collection from ``index_data.minhashes`` and
            ``query_data.minhashes``.
        threshold: Passed to ``MinHashLSH``.
        index_data: Object exposing ``keys``, ``sets`` and ``minhashes``.
            ``sets`` is looked up by key here
            (NOTE(review): presumably keys are positions into ``sets`` --
            confirm against the data loader).
        query_data: Object exposing ``sets`` and ``minhashes``, iterated in
            lockstep.

    Returns:
        A ``(times, results)`` tuple with one entry per query:
        ``times`` holds the elapsed seconds of the index lookup, and
        ``results`` holds lists of ``[key, score]`` pairs sorted by
        descending score.
    """
    print("Building LSH index")
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring short durations.
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        # Rescoring and ranking run outside the timed region.
        results.append(
            sorted(
                [[key, _compute_jaccard(qs, index_data.sets[key])]
                 for key in result],
                key=lambda x: x[1],
                reverse=True,
            )
        )
    return times, results
def benchmark_lsh(num_perm, threshold, index_data, query_data):
    """Build a ``MinHashLSH`` index and time each query against it.

    Every ``(key, minhash)`` pair from the index data is inserted into a
    ``MinHashLSH(threshold, num_perm)`` instance. Each query minhash is
    then looked up; only the ``lsh.query`` call is timed. The keys it
    returns are rescored with ``_compute_jaccard`` on the underlying sets
    and ranked. No threshold filter is applied after rescoring.

    Args:
        num_perm: Passed to ``MinHashLSH`` and used to select the minhash
            collection from ``index_data.minhashes`` and
            ``query_data.minhashes``.
        threshold: Passed to ``MinHashLSH``.
        index_data: Object exposing ``keys``, ``sets`` and ``minhashes``.
        query_data: Object exposing ``sets`` and ``minhashes``, iterated in
            lockstep.

    Returns:
        A ``(times, results)`` tuple with one entry per query:
        ``times`` holds the elapsed seconds of the index lookup, and
        ``results`` holds lists of ``[key, score]`` pairs sorted by
        descending score.
    """
    print("Building LSH index")
    lsh = MinHashLSH(threshold, num_perm)
    # The source had a corrupted token here ("z:xip"); the builtin zip() is
    # what pairs each key with its minhash.
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring short durations.
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        # Rescoring and ranking run outside the timed region.
        results.append(
            sorted(
                [[key, _compute_jaccard(qs, index_data.sets[key])]
                 for key in result],
                key=lambda x: x[1],
                reverse=True,
            )
        )
    return times, results


# NOTE(review): the definition below is cut off in the reviewed source -- it
# stops inside the inner loop, before any duration is recorded or anything is
# returned. Its visible statements are reproduced unchanged (including the
# time.clock() call, which no longer exists on Python 3.8+); the remainder
# must be restored before this function is usable.
def benchmark_linearscan(num_perm, threshold, index_data, query_data):
    times = []
    results = []
    for qs, q in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.clock()
        result = []
        for key, m in zip(index_data.keys, index_data.minhashes[num_perm]):
            j = q.jaccard(m)
            if j >= threshold:
                result.append(key)