def benchmark_lshensemble(threshold, num_perm, num_part, m, index_data, query_data):
    print("Building LSH Ensemble index")
    (minhashes, indexed_sets, keys) = index_data
    lsh = MinHashLSHEnsemble(threshold=threshold, num_perm=num_perm,
                             num_part=num_part, m=m)
    lsh.index((key, minhash, len(s))
              for key, minhash, s in zip(keys, minhashes[num_perm], indexed_sets))
    print("Querying")
    (minhashes, sets, keys) = query_data
    probe_times = []
    process_times = []
    results = []
    for qs, minhash in zip(sets, minhashes[num_perm]):
        # Record probing time
        start = time.perf_counter()
        result = list(lsh.query(minhash, len(qs)))
        probe_times.append(time.perf_counter() - start)
        # Record post-processing time.
        start = time.perf_counter()
        [_compute_containment(qs, indexed_sets[key]) for key in result]
        process_times.append(time.perf_counter() - start)
        results.append(result)
        sys.stdout.write("\rQueried {} sets".format(len(results)))
    sys.stdout.write("\n")
    return results, probe_times, process_times
def load_LSH(lib_profiles, mode=MODE.SCALABLE, repackage=False, processes=None):
    """Load library profiles into an LSH object.

    Args:
        lib_profiles (list): The list of library profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The detection mode,
            either MODE.ACCURATE or MODE.SCALABLE. See the paper for more details.
        repackage (bool, optional): Defaults to False. Whether LibID should consider
            class repackaging. Enable this only if class repackaging is known to be applied.
        processes (int, optional): Defaults to None. The number of processes to use.
            If None, the number returned by cpu_count() is used.
    """
    global LSH, LIB_RELATIONSHIP_GRAPHS

    weights = (0.5, 0.5) if repackage else (0.1, 0.9)
    LSH = MinHashLSHEnsemble(threshold=LSH_THRESHOLD, num_perm=LSH_PERM_NUM,
                             num_part=32, weights=weights)

    (minhash_list, LIB_RELATIONSHIP_GRAPHS) = profiler.parallel_load_libs_profile(
        lib_profiles=lib_profiles,
        mode=mode,
        repackage=repackage,
        processes=processes)

    LOGGER.info("Start indexing LSH (this could take a while) ...")

    start_time = time.time()
    LSH.index(minhash_list)
    end_time = time.time()

    LOGGER.info("LSH indexed. Duration: %fs", end_time - start_time)
def cluster_obs_group(self, candidates):
    ensemble = MinHashLSHEnsemble(threshold=0.95, num_perm=128)
    ensemble.index((c,) + self.hashes[c]
                   for c in candidates
                   if self.hashes[c][1] > 0)

    clusters = []
    while candidates:
        rep = candidates.pop()
        clus = [rep]
        h, l = self.hashes[rep]
        if l == 0:
            # An empty representative will cause division by
            # zero in ensemble.query(); instead, group it with
            # all the other empty candidates, and no others.
            for other in list(candidates):
                if self.hashes[other][1] == 0:
                    clus.append(other)
                    candidates.discard(other)
        else:
            for other in ensemble.query(h, l):
                if other in candidates:
                    clus.append(other)
                    candidates.discard(other)

        clusters.append(clus)
    return clusters
def benchmark_lshensemble(threshold, num_perm, num_part, l, index_data, query_data):
    print("Building LSH Ensemble index")
    lsh = MinHashLSHEnsemble(threshold=threshold, num_perm=num_perm,
                             num_part=num_part, l=l)
    lsh.index((key, minhash, len(s))
              for key, minhash, s in
              zip(index_data.keys, index_data.minhashes[num_perm], index_data.sets))
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; use perf_counter() instead.
        start = time.perf_counter()
        result = list(lsh.query(minhash, len(qs)))
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(
            sorted([[key, _compute_containment(qs, index_data.sets[key])]
                    for key in result],
                   key=lambda x: x[1], reverse=True))
    return times, results
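Both benchmark functions above call a _compute_containment helper that is not shown. A minimal sketch follows, under the assumption that it returns the containment of the query set in a candidate indexed set, which is the quantity MinHash LSH Ensemble thresholds on:

def _compute_containment(query_set, indexed_set):
    # Containment of the query in the candidate: |Q ∩ X| / |Q|.
    if len(query_set) == 0:
        return 0.0
    intersection = len(set(query_set) & set(indexed_set))
    return intersection / len(query_set)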
def exec_comparison_heuristic(self, source_func: BDFunction,
                              target_func: BDFunction) -> bool:
    # Populate the attribute values
    source_hash = self.loaded_attributes['FunctionMinHashLSH'].extract_attribute(
        source_func)
    target_hash = self.loaded_attributes['FunctionMinHashLSH'].extract_attribute(
        target_func)

    # Create an LSH Ensemble index with threshold and number of partition
    # settings.
    lshensemble = MinHashLSHEnsemble(
        threshold=Configuration.MINHASH_LSH_ENSEMBLE_THRESHOLD,
        num_perm=Configuration.MINHASH_PERMUTATIONS,
        num_part=Configuration.MINHASH_LSH_ENSEMBLE_PARTITIONS)

    # index() expects (key, minhash, set_size) tuples. The underlying set sizes
    # are not tracked here, so the MinHash length (its number of permutations)
    # is used purely as a placeholder size.
    lshensemble.index([
        ("source_function", source_hash, len(source_hash)),
        ("target_function", target_hash, len(target_hash)),
    ])

    if source_hash == target_hash:
        return True
    return False
# Create MinHash objects
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in set1:
    m1.update(d.encode('utf8'))
for d in set2:
    m2.update(d.encode('utf8'))
for d in set3:
    m3.update(d.encode('utf8'))

# Create an LSH Ensemble index with a threshold
lshensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128)

# Index takes an iterable of (key, minhash, size)
lshensemble.index([("m2", m2, len(set2)), ("m3", m3, len(set3))])

# Check for membership using the key
print("m2" in lshensemble)
print("m3" in lshensemble)

# Using m1 as the query, get a result iterator
print("Sets with containment > 0.8:")
for key in lshensemble.query(m1, len(set1)):
    print(key)

from datasketch import HyperLogLog, HyperLogLogPlusPlus

data1 = ['hyperloglog', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'cardinality', 'of', 'dataset', 'dataset', 'a']
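The example above stops right after importing HyperLogLog and defining data1. A minimal sketch of how the cardinality estimate is then typically obtained with datasketch (not part of the original snippet):

h = HyperLogLog()
for d in data1:
    h.update(d.encode('utf8'))
# count() returns an estimate of the number of distinct items seen so far
print("Estimated cardinality of data1:", h.count())
print("Actual number of distinct items:", len(set(data1)))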
class Sto:
    def __init__(self,
                 value_format: str = 'h',
                 threshold: float = 0.8,
                 num_perm: int = 128,
                 num_part: int = 32,
                 tokenizer: Tokenizer = Tokenizer('zh')):
        self.value_format = value_format
        self.threshold = threshold
        self.num_perm = num_perm
        self.num_part = num_part
        self.tokenizer = tokenizer
        self.lsh = MinHashLSHEnsemble(threshold=self.threshold,
                                      num_perm=self.num_perm)
        self.record_dawg = dawg.RecordDAWG(self.value_format)

    def __check_get_store_path(self, data_path: Path):
        pnlp.check_dir(data_path)
        lsh_path = Path(data_path) / "lsh.pickle"
        dawg_path = Path(data_path) / "values.dawg"
        return lsh_path, dawg_path

    def load(self, data_path: Path):
        lsh_path, dawg_path = self.__check_get_store_path(data_path)
        if lsh_path.exists():
            self.lsh = load_pickle(lsh_path)
        else:
            raise ValueError("lsh pickle {} does not exist.".format(lsh_path))
        if dawg_path.exists():
            self.record_dawg.load(str(dawg_path))
        else:
            raise ValueError("dawg file {} does not exist.".format(dawg_path))

    def store(self, data_path: Path):
        lsh_path, dawg_path = self.__check_get_store_path(data_path)
        dump_pickle(lsh_path, self.lsh)
        self.record_dawg.save(dawg_path)

    def __check_value_format(self, val: tuple):
        if len(val) != len(self.value_format):
            raise ValueError(
                "value format {} does not match the value {}".format(
                    self.value_format, val))

    def add(self, text_list: List[str], value_list: List[tuple]):
        len_text = len(text_list)
        len_value = len(value_list)
        assert len_text == len_value
        data = {}
        entries = []
        for i, text in enumerate(text_list):
            entry = self.text_to_lsh_entry(text)
            key = entry[0]
            if key in data:
                continue
            value = value_list[i]
            self.__check_value_format(value)
            data[key] = value
            entries.append(entry)
        self.lsh.index(entries)
        self.record_dawg = dawg.RecordDAWG(self.value_format, data.items())

    def query(self, text: str):
        key, mh, length = self.text_to_lsh_entry(text)
        if key in self.record_dawg:
            return self.record_dawg.get(key)[0]
        for sim_key in self.lsh.query(mh, length):
            return self.record_dawg.get(sim_key)[0]
        else:
            return

    def text_to_lsh_entry(self, text: str):
        words = self.tokenizer(text)
        bigrams = list(ngrams(words, 1, 2))
        wset = set(bigrams)
        mh = MinHash(num_perm=self.num_perm)
        for w in wset:
            mh.update(w.encode('utf8'))
        unicode_hash = hashlib.sha1(text.encode("utf8")).hexdigest()
        return (unicode_hash, mh, len(wset))

    def __getitem__(self, key: str):
        return self.query(key)

    def __setitem__(self, key: str, value: tuple):
        raise NotImplementedError

    def __contains__(self, key: str):
        if self.query(key):
            return True
        return False

    def __len__(self):
        return len(self.record_dawg.keys())
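A brief, hypothetical usage sketch of the Sto store above; it assumes the same dawg, pnlp and Tokenizer dependencies as the class itself, and the texts, values and path are made up for illustration:

store = Sto(value_format='h')
# Index two texts with one-element value tuples matching the 'h' record format.
store.add(["这个程序代码太乱", "这个程序代码不规范"], [(1,), (2,)])
# Exact SHA-1 key hit or, failing that, the first LSH Ensemble match; may be None.
print(store.query("这个程序代码太乱"))
# Persist the LSH index (pickle) and the value DAWG to disk.
store.store(Path("./sto_data"))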
def main():
    ss = int(sys.argv[1])
    filelist = sys.argv[2]
    GB = 1024 * 1024 * 1024
    t0 = time()
    if '--profile' in sys.argv or '--timing' in sys.argv:
        # save time for profiling test
        types, labels = defaultdict(set), defaultdict(set)
    else:
        print('reading typeslabels.pkl ..', file=sys.stderr)
        f = open('typeslabels.pkl', 'rb')
        types, labels = pickle.load(f)
        f.close()
    t1 = time()
    print('labels in dbpedia and wikidata:', len(types), file=sys.stderr)
    print('labels read in %.1f seconds or %.1f minutes' % (t1 - t0, (t1 - t0) / 60),
          file=sys.stderr)
    # sys.getsizeof does not work correctly, and pympler is WAY too slow
    # print('sizeof types %.3f GB sizeof labels %.3f GB' % (sys.getsizeof(types)/GB, sys.getsizeof(labels)/GB,), file=sys.stderr)
    files = [f.strip() for f in open(filelist).readlines()]
    # process all files instead of sample?
    if ss == 0:
        ss = len(files)
    nrows = np.zeros(ss)
    ncols = np.zeros(ss)
    cols = {}
    svals = 0
    ne = 0
    rels = {}
    titles = {}
    typcnt = defaultdict(int)
    usecnt = defaultdict(int)
    vcnt = defaultdict(int)
    typsets = {}
    tabs = {}
    nval = {}
    stats = {}
    hdr = 0
    rtyp = defaultdict(int)  # consistent row types
    # pat = re.compile("^[0-9\.,/Q_-]*$")
    print('reading', ss, 'out of total', len(files), 'files from', filelist,
          file=sys.stderr)
    # random.seed(4713)
    sam = random.sample(files, ss)
    # sqlite default autocommit, even when scripted
    # we need one big transaction
    # otherwise autocommit --> each statement a separate transaction, VERY slow
    fout.write("begin transaction;\n")
    fout.write('DROP TABLE IF EXISTS val;\n')
    fout.write('DROP TABLE IF EXISTS tab;\n')
    fout.write('DROP TABLE IF EXISTS col;\n')
    fout.write('DROP TABLE IF EXISTS err;\n')
    fout.write('DROP TABLE IF EXISTS sel;\n')
    fout.write('DROP TABLE IF EXISTS sub;\n')
    fout.write('DROP TABLE IF EXISTS vcnt;\n')
    fout.write('DROP TABLE IF EXISTS lsh;\n')
    fout.write('DROP TABLE IF EXISTS cnts;\n')
    fout.write('CREATE TABLE val (desc varchar, nvals int, wtyp int, frac float);\n')
    fout.write('CREATE TABLE vcnt (val varchar, nv int, nt int, typs varchar);\n')
    fout.write('CREATE TABLE tab (id int, rows int, cols int, head int, fn varchar, src varchar);\n')
    fout.write('CREATE TABLE col (tab int, col int, typ varchar(255), frac float, cov float);\n')
    fout.write('CREATE TABLE err (typ varchar, msg varchar, fn varchar, col int, src varchar);\n')
    fout.write('CREATE TABLE sel (tab int, col int, nval int, ndist int, sel int);\n')
    fout.write('CREATE TABLE sub (l int, j int, k int, i int, comtyp int, fracsubset float, nchild int, nparent int, fnchild varchar, fnparent varchar);\n')
    fout.write('CREATE TABLE lsh (k int, i int, l int, j int);\n')
    fout.write('CREATE TABLE cnts (desc varchar, cnt int);\n')
    # file for writing types of columns
    tsfile = open('typesets.csv', 'w', encoding='utf8', errors='ignore')
    tsfile.write('filename,column,fraction,type\n')
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" % ('CSV Files:', ss))
    ne = 0
    nallcols = 0  # number of all columns
    # go through the files in the sample
    for k in range(ss):
        fn = sam[k]
        src = 'obd'  # fn[6:9]
        # if src in ['kag', 'obd']: ftyp = 'csv'
        # else: ftyp = 'json'
        res = readf(fn, 'csv')  # ftyp)
        if res == -1:
            ne += 1
            continue  # error reading file
        cols, h, nrows, ncols = res
        nallcols += len(cols)
        # kaggle filenames contain ' !!!
fout.write("insert into tab values (%d, %d, %d, %d, '%s', '%s');\n" % (k, nrows, ncols, h, cleanfn(fn), src)) nval[k] = { } # lengths of columns as lists, after removing ignored elements tabs[k] = {} # sets of col values, after removing ignored typsets[k] = {} stats[k] = {} ign = ('null', 'true', 'false', 't', 'f', 'yes', 'no', 'y', 'n', 'none', 'na', 'n/a', 'nan', 'n.a.', 'male', 'female', 'm', 'f', 'e') for i in range(len(cols)): # remove numeric and null-like lst = [ x.strip() for x in cols[i] if len(x.strip()) > 0 and not isnumber(x.strip()) and not x.strip().lower() in ign ] if len(lst) == 0: err('col', 'all num or null', fn, i) continue # log value counts for x in lst: vcnt[x] += 1 # only need list length and set for finding reference cols, not whole list: save lots of memory nval[k][i] = len(lst) tabs[k][i] = set(lst) # selectivity sel = len(tabs[k][i]) / len(lst) fout.write( "insert into sel (tab, col, nval, ndist, sel) values(%d, %d, %d, %d, %f);\n" % (k, i, len(lst), len(tabs[k][i]), sel)) # None types tsets = [types[x] for x in lst] ts = set.union(*tsets) typsets[k][i] = set() # all stats useless, performance worse if '--stats' in sys.argv: stats[k][i] = {} lens = [len(x) for x in slst] stats[k][i]['lmin'] = min(lens) stats[k][i]['lmax'] = max(lens) stats[k][i]['case'] = getcase(slst) stats[k][i]['alph'] = getalph(slst) for t in ts: # for more than one known value of type t: fraction of column values that are of type t if len(labels[t]) < 2: continue f = sum([int(t in s) for s in tsets]) / len(lst) # type coverage: fraction of col values of type t in relation to all known values of this type tset = set([x for x in lst if t in types[x]]) # col vals of type t cov = len(tset) / len(labels[t]) fout.write( "insert into col (tab, col, typ, frac, cov) values (%d, %d, '%s', %f, %f);\n" % (k, i, t.replace("'", "").replace('"', ''), f, cov)) if f >= 0.5: typsets[k][i].add(t) if t != 'www.w3.org/2002/07/owl#Thing': # column i in file fn has type t for at least fraction f of elements (set) tsfile.write("%s,%d,%.3f,%s\n" % (fn, i, f, t)) typsets[k][i].discard('www.w3.org/2002/07/owl#Thing' ) # messes up results. 
its always a thing t2 = time() print('files read in %.1f seconds or %.1f minutes' % (t2 - t1, (t2 - t1) / 60), file=sys.stderr) # some statistics on columns and values (in the sets): number columns, number of values, length of values nc = sum([len(tabs[k]) for k in tabs]) nv = sum([len(tabs[k][i]) for k in tabs for i in tabs[k]]) lv = sum([len(s) for k in tabs for i in tabs[k] for s in tabs[k][i]]) print( 'number of tables: %d cols: %d avg col size (set): %.1f avg item len: %.1f' % (len(tabs), nc, nv / nc, lv / nv)) fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" % ('Import Errors:', ne)) fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" % ('All Tables:', len(tabs))) fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" % ('All Columns:', nallcols)) # print("brk:", brk) # no gain # print('sizeof tabs %.3f GB sizeof typsets %.3f GB' % (sys.getsizeof(tabs)/GB, sys.getsizeof(typsets)/GB,), file=sys.stderr) # value and type counts, most frequent only for k in sorted(vcnt, key=vcnt.get, reverse=True)[:100]: fout.write( "insert into vcnt (val, nv, nt, typs) values ('%s', %d, %d, '%s');\n" % (k.replace("'", ""), vcnt[k], len(types[k]), ', '.join( list(types[k])[:5]))) # overall statistics on values and types nv, wt = sum([vcnt[k] for k in vcnt ]), sum([vcnt[k] * int(len(types[k]) > 0) for k in vcnt]) fout.write( "insert into val (desc, nvals, wtyp, frac) values ('%s', %d, %d, %f);\n" % ('Values', nv, wt, wt / nv)) uv, wt = len(vcnt), sum([int(len(types[k]) > 0) for k in vcnt]) fout.write( "insert into val (desc, nvals, wtyp, frac) values ('%s', %d, %d, %f);\n" % ('Unique', uv, wt, wt / uv)) # if '--refs' in sys.argv: print("finding reference columns..", file=sys.stderr) num_cpus = 8 # psutil.cpu_count(logical=False) # weird problems with that print("cpus:", num_cpus) print("init ray..", file=sys.stderr) # Starting Ray with .. GiB memory available for workers and up to .. GiB for objects. # ray.init(memory=<bytes>, object_store_memory=<bytes>). ray.init(num_cpus=num_cpus, memory=9 * 1024 * 1024 * 1024, object_store_memory=45 * 1024 * 1024 * 1024) print("put data into shared mem..", file=sys.stderr) tabs_id = ray.put(tabs) nval_id = ray.put(nval) stats_id = ray.put(stats) typsets_id = ray.put(typsets) sam_id = ray.put(sam) print("start parallel..", file=sys.stderr) # split task by assigning lists of keys in tabs to check. this will block until all are ready. 
    sql = ray.get([
        refcols.remote(tabs_id, nval_id, stats_id, typsets_id, sam_id, tx)
        for tx in np.array_split(list(tabs.keys()), num_cpus)
    ])
    print("parallel section done.", file=sys.stderr)
    # write to file here, NOT in individual tasks
    for s in sql:
        fout.write(s)
    t3 = time()
    print('references done in %.1f seconds or %.1f minutes' % (t3 - t2, (t3 - t2) / 60),
          file=sys.stderr)
    # if '--lsh' in sys.argv:
    # locality sensitive hashing for minHashes of column value sets
    print('build hashes..', file=sys.stderr)
    # lsh = MinHashLSH(threshold=0.75, num_perm=128)
    lsh = MinHashLSHEnsemble(threshold=0.9, num_perm=128, num_part=32)
    mh = {}
    for k in tabs:
        mh[k] = {}
        for i in tabs[k]:
            mh[k][i] = MinHash(num_perm=128)
            for d in tabs[k][i]:
                mh[k][i].update(d.encode('utf8'))
            # lsh.insert((k,i), mh[k][i])
    lsh.index([((k, i), mh[k][i], len(tabs[k][i])) for k in tabs for i in tabs[k]])
    t4 = time()
    print('hashes built in %.1f seconds or %.1f minutes' % (t4 - t3, (t4 - t3) / 60),
          file=sys.stderr)
    # similar cols according to lsh
    print("query lsh..")
    for k in tabs:
        for i in tabs[k]:
            # for l, j in lsh.query(mh[k][i]):
            for l, j in lsh.query(mh[k][i], len(tabs[k][i])):
                if l == k:
                    continue  # same table
                # l,j is the parent!
                # min size for ref tab
                if len(tabs[l][j]) < 10:
                    continue
                # selectivity must be 1
                if len(tabs[l][j]) != nval[l][j]:
                    continue
                if len(tabs[k][i]) < 10 and len(tabs[l][j]) < 10:
                    print("TABS", k, i, tabs[k][i])
                    print("SIMT", l, j, tabs[l][j])
                fout.write("insert into lsh (k, i, l, j) values (%d, %d, %d, %d);\n" %
                           (l, j, k, i))
    # else:
    #     t4 = time()
    #     # need content for query?
    #     # fout.write("insert into lsh (k, i, l, j) values (%d, %d, %d, %d);\n" % (0, 0, 0, 0))
    t5 = time()
    print('hash queries done in %.1f seconds or %.1f minutes' % (t5 - t4, (t5 - t4) / 60),
          file=sys.stderr)
    # fout.write("commit;\n")
    fout.close()
    tsfile.close()
    print('total run time %.1f seconds or %.1f minutes' % (t5 - t0, (t5 - t0) / 60),
          file=sys.stderr)
    sys.exit(0)
from datasketch import MinHash, MinHashLSH, MinHashLSHEnsemble

data1 = ['这个', '程序', '代码', '太乱', '那个', '代码', '规范']
data2 = ['这个', '程序', '代码', '不', '规范', '那个', '更', '规范']
data3 = ['这个', '程序', '代码', '不', '规范', '那个', '规范', '些']

# Create MinHash objects
m1 = MinHash()
m2 = MinHash()
m3 = MinHash()
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))

# Create the LSH Ensemble
lshensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128)

# Index takes an iterable of (key, minhash, size)
lshensemble.index([("m2", m2, len(data2)), ("m3", m3, len(data3))])

# Check whether m2 and m3 are in the lshensemble index
print("m2" in lshensemble)
print("m3" in lshensemble)

# Query for sets whose similarity (containment) with m1 exceeds 0.8
print("Sets with similarity to m1 greater than 0.8:")
for key in lshensemble.query(m1, len(data1)):
    print(key)