def generate_minhashes(input_file, bbox, memory, n, k):
    """Build or load the minhash signatures for the suite in ``input_file``.

    With ``memory`` true the suite is read through ``loadTestSuite`` and each
    test case is minhashed in-process.  Otherwise the signatures are cached
    on disk: the first call writes them with ``storeSignatures`` and records
    the elapsed time in a ``*_sigtime.txt`` side file; later calls read the
    recorded time back and load the signatures with ``loadSignatures``.

    Args:
        input_file: path of the test-suite file.  ".txt" is replaced by
            ".sig" to name the signature cache.
        bbox: forwarded to ``loadTestSuite`` / ``storeSignatures``.
        memory: if true, keep signatures in memory and touch no cache file.
        n: number of hash functions to create.
        k: forwarded to ``loadTestSuite`` / ``storeSignatures``.

    Returns:
        ``(hashes, tcs_minhashes, mh_time, ptime_start)``.  ``mh_time`` is
        the signature-generation time in seconds.  ``ptime_start`` is a
        ``time.perf_counter()`` reading, so callers must subtract it from
        another ``perf_counter()`` reading to get an elapsed time.
    """
    # ``range`` / ``perf_counter`` replace ``xrange`` / ``time.clock``, which
    # do not exist on current Python 3 and are what the sibling functions in
    # this file already use.
    hashes = [lsh.hashFamily(i) for i in range(n)]

    if memory:
        test_suite = loadTestSuite(input_file, bbox=bbox, k=k)
        # generate minhashes signatures
        mh_t = time.perf_counter()
        tcs_minhashes = {
            tc[0]: lsh.tcMinhashing(tc, hashes) for tc in test_suite.items()
        }
        mh_time = time.perf_counter() - mh_t
        ptime_start = time.perf_counter()
    else:
        # loading input file and generating minhashes signatures
        sigfile = input_file.replace(".txt", ".sig")
        # NOTE(review): split(".")[0] keeps only the text before the first
        # dot of the whole path, not just of the file name.
        sigtimefile = "{}_sigtime.txt".format(input_file.split(".")[0])
        if not os.path.exists(sigfile):
            mh_t = time.perf_counter()
            storeSignatures(input_file, sigfile, hashes, bbox, k)
            mh_time = time.perf_counter() - mh_t
            with open(sigtimefile, "w") as fout:
                fout.write(repr(mh_time))
        else:
            with open(sigtimefile, "r") as fin:
                # The file holds repr(float) written above; parse it as a
                # number instead of eval()-ing whatever is on disk.
                mh_time = float(fin.read().strip())
        ptime_start = time.perf_counter()
        tcs_minhashes, _ = loadSignatures(sigfile)

    return hashes, tcs_minhashes, mh_time, ptime_start
def fast_(input_file, selsize, r, b, bbox=False, k=5, memory=False, B=0):
    """Prioritize the test suite in ``input_file`` with minhash signatures + LSH.

    INPUT
    (str)input_file: path of input file
    (fun)selsize: called with the number of candidates, returns how many of
                  them to sample in one iteration
    (int)r: number of rows
    (int)b: number of bands
    (bool)bbox: True if BB prioritization
    (int)k: k-shingle size (for BB prioritization)
    (bool)memory: if True keep signature in memory and do not store them to file
    (int)B: budget, i.e. number of test cases to select; 0 means all of them

    OUTPUT
    (float)mh_time: time spent generating the signatures (or the value
                    recorded when the cached signatures were written)
    (float)ptime: time spent in the prioritization itself
    (list)P: prioritized test suite
    """
    n = r * b  # number of hash functions

    hashes = [lsh.hashFamily(i) for i in range(n)]

    if memory:
        test_suite = loadTestSuite(input_file, bbox=bbox, k=k)
        # generate minhashes signatures
        mh_t = time.perf_counter()
        tcs_minhashes = {
            tc[0]: lsh.tcMinhashing(tc, hashes) for tc in test_suite.items()
        }
        mh_time = time.perf_counter() - mh_t
    else:
        # loading input file and generating minhashes signatures
        sigfile = input_file.replace(".txt", ".sig")
        sigtimefile = "{}_sigtime.txt".format(input_file.split(".")[0])
        if not os.path.exists(sigfile):
            mh_t = time.perf_counter()
            storeSignatures(input_file, sigfile, hashes, bbox, k)
            mh_time = time.perf_counter() - mh_t
            with open(sigtimefile, "w") as fout:
                fout.write(repr(mh_time))
        else:
            with open(sigtimefile, "r") as fin:
                # The file holds repr(float) written above; parse it as a
                # number instead of eval()-ing whatever is on disk.
                mh_time = float(fin.read().strip())
        tcs_minhashes, _ = loadSignatures(sigfile)

    # Started after both branches so that memory=True has a start time too.
    ptime_start = time.perf_counter()

    tcs = set(tcs_minhashes)

    # budget B modification
    if B == 0:
        B = len(tcs)

    BASE = 0.5
    SIZE = int(len(tcs) * BASE) + 1

    bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)

    # Slot 0 is a placeholder; the return slice starts at index 1.
    prioritized_tcs = [0]

    # First TC
    selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
    first_tc = random.choice(list(tcs_minhashes.keys()))
    first_sig = tcs_minhashes[first_tc]
    for i in range(n):
        if first_sig[i] < selected_tcs_minhash[i]:
            selected_tcs_minhash[i] = first_sig[i]
    prioritized_tcs.append(first_tc)
    tcs.discard(first_tc)
    del tcs_minhashes[first_tc]

    iteration, total = 0, float(len(tcs_minhashes))
    # The budget is part of the loop condition so that it is also honoured
    # when the first pick alone already fills it (B == 1).
    while tcs_minhashes and len(prioritized_tcs) < B + 1:
        iteration += 1
        if iteration % 100 == 0:
            sys.stdout.write(" Progress: {}%\r".format(
                round(100 * iteration / total, 2)))
            sys.stdout.flush()

        # Rebuild the buckets each time the remaining suite halves.
        if len(tcs_minhashes) < SIZE:
            bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)
            SIZE = int(SIZE * BASE) + 1

        sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                     b, r, n)
        filtered_sim_cand = sim_cand.difference(prioritized_tcs)
        candidates = tcs - filtered_sim_cand

        if not candidates:
            # Every remaining test collides with the accumulated signature:
            # reset the signature and look again.
            selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
            sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                         b, r, n)
            filtered_sim_cand = sim_cand.difference(prioritized_tcs)
            candidates = tcs - filtered_sim_cand
            if not candidates:
                candidates = tcs_minhashes.keys()

        to_sel = min(selsize(len(candidates)), len(candidates))
        selected_tc_set = random.sample(tuple(candidates), to_sel)

        for selected_tc in selected_tc_set:
            sig = tcs_minhashes[selected_tc]
            # Fold the pick into the accumulated signature (element-wise min).
            for i in range(n):
                if sig[i] < selected_tcs_minhash[i]:
                    selected_tcs_minhash[i] = sig[i]

            prioritized_tcs.append(selected_tc)

            # select budget B
            if len(prioritized_tcs) >= B + 1:
                break

            tcs.discard(selected_tc)
            del tcs_minhashes[selected_tc]

    ptime = time.perf_counter() - ptime_start

    # Count the input lines with the file closed deterministically.
    with open(input_file) as fin:
        max_ts_size = sum(1 for _ in fin)
    # NOTE(review): the slice end is max_ts_size, so at most
    # max_ts_size - 1 ids are returned -- confirm this cap is intended.
    return mh_time, ptime, prioritized_tcs[1:max_ts_size]
def _drop_covered_tests(C, cov, tcs, tcs_minhashes):
    """Subtract ``cov`` from every entry of ``C`` and drop exhausted tests."""
    for tc in C:
        C[tc] = C[tc] - cov
        if tc in tcs and not C[tc]:
            tcs.discard(tc)
            del tcs_minhashes[tc]


def fast_(input_file, wBoxFile, selsize, r, b, bbox=False, k=5, memory=False):
    """Select tests with minhash + LSH until the loaded coverage is reached.

    INPUT
    (str)input_file: path of input file
    (str)wBoxFile: path handed to ``loadCoverage``; the result is used as a
                   mapping from test id to a set-like coverage value
    (fun)selsize: called with the number of candidates, returns how many of
                  them to sample in one iteration
    (int)r: number of rows
    (int)b: number of bands
    (bool)bbox: forwarded to ``loadTestSuite`` / ``storeSignatures``
    (int)k: forwarded to ``loadTestSuite`` / ``storeSignatures``
    (bool)memory: if True keep signature in memory and do not store them to file

    OUTPUT
    (float)mh_time: time spent generating the signatures (or the value
                    recorded when the cached signatures were written)
    (float)coverage load time: time spent inside ``loadCoverage``
    (float)ptime: time spent in the selection itself
    (list)P: selected test ids in selection order
    """
    n = r * b  # number of hash functions

    # ``time.clock`` was removed in Python 3.8; ``perf_counter`` is what the
    # other functions in this file already use.
    tC0 = time.perf_counter()
    C = loadCoverage(wBoxFile)
    tC1 = time.perf_counter()

    # Union of all coverage values; needs no ``reduce`` and accepts empty C.
    maxCov = set().union(*C.values())

    hashes = [lsh.hashFamily(i) for i in range(n)]

    if memory:
        test_suite = loadTestSuite(input_file, bbox=bbox, k=k)
        # generate minhashes signatures
        mh_t = time.perf_counter()
        tcs_minhashes = {
            tc[0]: lsh.tcMinhashing(tc, hashes) for tc in test_suite.items()
        }
        mh_time = time.perf_counter() - mh_t
        ptime_start = time.perf_counter()
    else:
        # loading input file and generating minhashes signatures
        sigfile = input_file.replace(".txt", ".sig")
        sigtimefile = "{}_sigtime.txt".format(input_file.split(".")[0])
        if not os.path.exists(sigfile):
            mh_t = time.perf_counter()
            storeSignatures(input_file, sigfile, hashes, bbox, k)
            mh_time = time.perf_counter() - mh_t
            with open(sigtimefile, "w") as fout:
                fout.write(repr(mh_time))
        else:
            with open(sigtimefile, "r") as fin:
                # The file holds repr(float) written above; parse it as a
                # number instead of eval()-ing whatever is on disk.
                mh_time = float(fin.read().strip())
        ptime_start = time.perf_counter()
        tcs_minhashes, _ = loadSignatures(sigfile)

    tcs = set(tcs_minhashes)

    BASE = 0.5
    SIZE = int(len(tcs) * BASE) + 1

    bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)

    # Slot 0 is a placeholder; the return slice starts at index 1.
    prioritized_tcs = [0]

    # First TC
    selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
    first_tc = random.choice(list(tcs_minhashes.keys()))
    first_sig = tcs_minhashes[first_tc]
    for i in range(n):
        if first_sig[i] < selected_tcs_minhash[i]:
            selected_tcs_minhash[i] = first_sig[i]
    prioritized_tcs.append(first_tc)
    cov = C[first_tc]
    _drop_covered_tests(C, cov, tcs, tcs_minhashes)

    iteration, total = 0, float(len(tcs_minhashes))
    # Also stop when no test is left: with nothing to sample the loop could
    # otherwise never reach maxCov and never end.
    while cov != maxCov and tcs_minhashes:
        iteration += 1
        if iteration % 100 == 0:
            sys.stdout.write(" Progress: {}%\r".format(
                round(100 * iteration / total, 2)))
            sys.stdout.flush()

        # Rebuild the buckets each time the remaining suite halves.
        if len(tcs_minhashes) < SIZE:
            bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)
            SIZE = int(SIZE * BASE) + 1

        sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                     b, r, n)
        filtered_sim_cand = sim_cand.difference(prioritized_tcs)
        candidates = tcs - filtered_sim_cand

        if not candidates:
            # Every remaining test collides with the accumulated signature:
            # reset the signature and look again.
            selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
            sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                         b, r, n)
            filtered_sim_cand = sim_cand.difference(prioritized_tcs)
            candidates = tcs - filtered_sim_cand
            if not candidates:
                candidates = tcs_minhashes.keys()

        to_sel = min(selsize(len(candidates)), len(candidates))
        selected_tc_set = random.sample(tuple(candidates), to_sel)

        for selected_tc in selected_tc_set:
            if selected_tc not in tcs_minhashes:
                # An earlier pick of this batch already covered everything
                # this test adds, so it was pruned; skip it instead of
                # failing on the missing signature.
                continue

            sig = tcs_minhashes[selected_tc]
            # Fold the pick into the accumulated signature (element-wise min).
            for i in range(n):
                if sig[i] < selected_tcs_minhash[i]:
                    selected_tcs_minhash[i] = sig[i]

            prioritized_tcs.append(selected_tc)
            cov = cov | C[selected_tc]
            _drop_covered_tests(C, cov, tcs, tcs_minhashes)

    ptime = time.perf_counter() - ptime_start

    # Count the input lines with the file closed deterministically.
    with open(input_file) as fin:
        max_ts_size = sum(1 for _ in fin)
    # NOTE(review): the slice end is max_ts_size, so at most
    # max_ts_size - 1 ids are returned -- confirm this cap is intended.
    return mh_time, tC1 - tC0, ptime, prioritized_tcs[1:max_ts_size]
def fast_pw(input_file, r, b, bbox=False, k=5, memory=False, B=0):
    """Prioritize ``input_file`` picking, each round, the farthest candidate.

    Each round the LSH candidates of the accumulated signature are excluded,
    and among the remaining tests the one with the largest
    ``lsh.jDistanceEstimate`` from the accumulated signature is selected.

    INPUT
    (str)input_file: path of input file
    (int)r: number of rows
    (int)b: number of bands
    (bool)bbox: forwarded to ``loadTestSuite`` / ``storeSignatures``
    (int)k: forwarded to ``loadTestSuite`` / ``storeSignatures``
    (bool)memory: if True keep signature in memory and do not store them to file
    (int)B: budget, i.e. number of test cases to select; 0 means all of them

    OUTPUT
    (float)mh_time: time spent generating the signatures (or the value
                    recorded when the cached signatures were written)
    (float)ptime: time spent in the prioritization itself
    (list)P: prioritized test suite
    """
    n = r * b  # number of hash functions

    hashes = [lsh.hashFamily(i) for i in range(n)]

    if memory:
        test_suite = loadTestSuite(input_file, bbox=bbox, k=k)
        # generate minhashes signatures
        mh_t = time.perf_counter()
        tcs_minhashes = {
            tc[0]: lsh.tcMinhashing(tc, hashes) for tc in test_suite.items()
        }
        mh_time = time.perf_counter() - mh_t
        ptime_start = time.perf_counter()
    else:
        # loading input file and generating minhashes signatures
        sigfile = input_file.replace(".txt", ".sig")
        sigtimefile = "{}_sigtime.txt".format(input_file.split(".")[0])
        if not os.path.exists(sigfile):
            mh_t = time.perf_counter()
            storeSignatures(input_file, sigfile, hashes, bbox, k)
            mh_time = time.perf_counter() - mh_t
            with open(sigtimefile, "w") as fout:
                fout.write(repr(mh_time))
        else:
            with open(sigtimefile, "r") as fin:
                # The file holds repr(float) written above; parse it as a
                # number instead of eval()-ing whatever is on disk.
                mh_time = float(fin.read().strip())
        ptime_start = time.perf_counter()
        tcs_minhashes, _ = loadSignatures(sigfile)

    tcs = set(tcs_minhashes)

    # budget B modification
    if B == 0:
        B = len(tcs)

    BASE = 0.5
    SIZE = int(len(tcs) * BASE) + 1

    bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)

    # Slot 0 is a placeholder; the return slice starts at index 1.
    prioritized_tcs = [0]

    # First TC
    selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
    first_tc = random.choice(list(tcs_minhashes.keys()))
    first_sig = tcs_minhashes[first_tc]
    for i in range(n):
        if first_sig[i] < selected_tcs_minhash[i]:
            selected_tcs_minhash[i] = first_sig[i]
    prioritized_tcs.append(first_tc)
    tcs.discard(first_tc)
    del tcs_minhashes[first_tc]

    iteration, total = 0, float(len(tcs_minhashes))
    # The budget is part of the loop condition so that it is also honoured
    # when the first pick alone already fills it (B == 1).
    while tcs_minhashes and len(prioritized_tcs) < B + 1:
        iteration += 1
        if iteration % 100 == 0:
            sys.stdout.write(" Progress: {}%\r".format(
                round(100 * iteration / total, 2)))
            sys.stdout.flush()

        # Rebuild the buckets each time the remaining suite halves.
        if len(tcs_minhashes) < SIZE:
            bucket = lsh.LSHBucket(tcs_minhashes.items(), b, r, n)
            SIZE = int(SIZE * BASE) + 1

        sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                     b, r, n)
        filtered_sim_cand = sim_cand.difference(prioritized_tcs)
        candidates = tcs - filtered_sim_cand

        if not candidates:
            # Every remaining test collides with the accumulated signature:
            # reset the signature and look again.
            selected_tcs_minhash = lsh.tcMinhashing((0, set()), hashes)
            sim_cand = lsh.LSHCandidates(bucket, (0, selected_tcs_minhash),
                                         b, r, n)
            filtered_sim_cand = sim_cand.difference(prioritized_tcs)
            candidates = tcs - filtered_sim_cand
            if not candidates:
                candidates = tcs_minhashes.keys()

        # Start from a random candidate, then keep the first one (in
        # tcs_minhashes order) with the strictly largest estimated distance.
        selected_tc, max_dist = random.choice(tuple(candidates)), -1
        for candidate, candidate_sig in tcs_minhashes.items():
            if candidate in candidates:
                dist = lsh.jDistanceEstimate(
                    selected_tcs_minhash, candidate_sig)
                if dist > max_dist:
                    selected_tc, max_dist = candidate, dist

        sig = tcs_minhashes[selected_tc]
        # Fold the pick into the accumulated signature (element-wise min).
        for i in range(n):
            if sig[i] < selected_tcs_minhash[i]:
                selected_tcs_minhash[i] = sig[i]

        prioritized_tcs.append(selected_tc)

        # select budget B
        if len(prioritized_tcs) >= B + 1:
            break

        tcs.discard(selected_tc)
        del tcs_minhashes[selected_tc]

    ptime = time.perf_counter() - ptime_start

    # Count the input lines with the file closed deterministically.
    with open(input_file) as fin:
        max_ts_size = sum(1 for _ in fin)
    # NOTE(review): the slice end is max_ts_size, so at most
    # max_ts_size - 1 ids are returned -- confirm this cap is intended.
    return mh_time, ptime, prioritized_tcs[1:max_ts_size]