Code Example #1
def benchmark_lshensemble(threshold, num_perm, num_part, m, index_data,
                          query_data):
    print("Building LSH Ensemble index")
    (minhashes, indexed_sets, keys) = index_data
    lsh = MinHashLSHEnsemble(threshold=threshold,
                             num_perm=num_perm,
                             num_part=num_part,
                             m=m)
    lsh.index((key, minhash, len(s))
            for key, minhash, s in \
                    zip(keys, minhashes[num_perm], indexed_sets))
    print("Querying")
    (minhashes, sets, keys) = query_data
    probe_times = []
    process_times = []
    results = []
    for qs, minhash in zip(sets, minhashes[num_perm]):
        # Record probing time
        start = time.perf_counter()
        result = list(lsh.query(minhash, len(qs)))
        probe_times.append(time.perf_counter() - start)
        # Record post processing time.
        start = time.perf_counter()
        [_compute_containment(qs, indexed_sets[key]) for key in result]
        process_times.append(time.perf_counter() - start)
        results.append(result)
        sys.stdout.write("\rQueried {} sets".format(len(results)))
    sys.stdout.write("\n")
    return results, probe_times, process_times
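
The _compute_containment helper used in the post-processing step is not shown in this snippet; a minimal sketch, assuming the indexed and query sets behave like plain Python collections and that containment means the fraction of the query covered by the candidate:

def _compute_containment(query_set, candidate_set):
    # Containment of the query in the candidate: |Q ∩ X| / |Q|
    q = set(query_set)
    if not q:
        return 0.0
    return len(q & set(candidate_set)) / len(q)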
Code Example #2
File: LibID.py  Project: ucam-cl-dtg/LibID
def load_LSH(lib_profiles,
             mode=MODE.SCALABLE,
             repackage=False,
             processes=None):
    """Load library profiles to an LSH object.
    
    Args:
        lib_profiles (list): The list of library profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The detection mode, either MODE.ACCURATE or MODE.SCALABLE. See the paper for more details.
        repackage (bool, optional): Defaults to False. Whether LibID should consider class repackaging. This should only be enabled if class repackaging is known to be applied.
        processes (int, optional): Defaults to None. The number of processes to use. If None, the number returned by cpu_count() is used.
    """

    global LSH, LIB_RELATIONSHIP_GRAPHS

    weights = (0.5, 0.5) if repackage else (0.1, 0.9)
    LSH = MinHashLSHEnsemble(threshold=LSH_THRESHOLD,
                             num_perm=LSH_PERM_NUM,
                             num_part=32,
                             weights=weights)

    (minhash_list,
     LIB_RELATIONSHIP_GRAPHS) = profiler.parallel_load_libs_profile(
         lib_profiles=lib_profiles,
         mode=mode,
         repackage=repackage,
         processes=processes)

    LOGGER.info("Start indexing LSH (this could take a while) ...")

    start_time = time.time()
    LSH.index(minhash_list)
    end_time = time.time()

    LOGGER.info("LSH indexed. Duration: %fs", end_time - start_time)
Code Example #3
    def cluster_obs_group(self, candidates):
        ensemble = MinHashLSHEnsemble(threshold=0.95, num_perm=128)
        ensemble.index((c,) + self.hashes[c]
                       for c in candidates
                       if self.hashes[c][1] > 0)

        clusters = []
        while candidates:
            rep = candidates.pop()
            clus = [rep]
            h, l = self.hashes[rep]
            if l == 0:
                # An empty representative will cause division by
                # zero in ensemble.query(); instead, group it with
                # all the other empty candidates, and no others.
                for other in list(candidates):
                    if self.hashes[other][1] == 0:
                        clus.append(other)
                        candidates.discard(other)
            else:
                for other in ensemble.query(h, l):
                    if other in candidates:
                        clus.append(other)
                        candidates.discard(other)
            clusters.append(clus)
        return clusters
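
The empty-set guard above can also be expressed as a small wrapper around the query; safe_query below is a hypothetical helper, not part of the original class:

def safe_query(ensemble, minhash, size):
    # MinHashLSHEnsemble.query() divides by the query set size, so an empty
    # query (size == 0) must be handled by the caller, as the clustering
    # loop above does by grouping empty candidates together.
    if size == 0:
        return []
    return list(ensemble.query(minhash, size))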
Code Example #4
def benchmark_lshensemble(threshold, num_perm, num_part, m, index_data,
                          query_data):
    print("Building LSH Ensemble index")
    lsh = MinHashLSHEnsemble(threshold=threshold,
                             num_perm=num_perm,
                             num_part=num_part,
                             m=m)
    lsh.index((key, minhash, len(s))
              for key, minhash, s in
              zip(index_data.keys, index_data.minhashes[num_perm],
                  index_data.sets))
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.perf_counter()
        result = list(lsh.query(minhash, len(qs)))
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(
            sorted([[key, _compute_containment(qs, index_data.sets[key])]
                    for key in result],
                   key=lambda x: x[1],
                   reverse=True))
    return times, results
Code Example #5
    def exec_comparison_heuristic(self, source_func: BDFunction, target_func: BDFunction) \
            -> bool:

        # Populate the attribute values
        source_hash = self.loaded_attributes[
            'FunctionMinHashLSH'].extract_attribute(source_func)

        target_hash = self.loaded_attributes[
            'FunctionMinHashLSH'].extract_attribute(target_func)

        # Create an LSH Ensemble index with threshold and number of partition
        # settings.
        lshensemble = MinHashLSHEnsemble(
            threshold=Configuration.MINHASH_LSH_ENSEMBLE_THRESHOLD,
            num_perm=Configuration.MINHASH_PERMUTATIONS,
            num_part=Configuration.MINHASH_LSH_ENSEMBLE_PARTITIONS)

        lshensemble.index([("source_function", source_hash, len()),
                           ("m3", m3, len(set3))])

        if source_hash == target_hash:
            return True

        return False
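
The ensemble built above is never actually queried; a hypothetical query-based variant of the check (assuming the extracted attributes are datasketch MinHash objects) could look like:

def minhash_containment_match(lshensemble, source_hash):
    # Treat a containment hit on the indexed target as a match instead of
    # requiring exact MinHash equality.
    size = max(1, int(source_hash.count()))
    return "target_function" in set(lshensemble.query(source_hash, size))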
Code Example #6
from datasketch import MinHash, MinHashLSHEnsemble

# Example token sets (placeholders; any three sets of strings will do)
set1 = set(["cat", "dog", "fish", "cow"])
set2 = set(["cat", "dog", "fish", "cow", "pig", "elephant", "lion", "tiger",
            "wolf", "bird", "human"])
set3 = set(["cat", "dog", "car", "van", "train", "plane", "ship", "submarine",
            "piano", "cello", "viola", "basketball", "snowboard"])

# Create MinHash objects
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in set1:
    m1.update(d.encode('utf8'))
for d in set2:
    m2.update(d.encode('utf8'))
for d in set3:
    m3.update(d.encode('utf8'))

# Create an LSH Ensemble index with a threshold
lshensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128)

# Index takes an iterable of (key, minhash, size)
lshensemble.index([("m2", m2, len(set2)), ("m3", m3, len(set3))])

# Check for membership using the key
print("m2" in lshensemble)
print("m3" in lshensemble)

# Using m1 as the query, get a result iterator
print("Sets with containment > 0.8:")
for key in lshensemble.query(m1, len(set1)):
    print(key)

from datasketch import HyperLogLog, HyperLogLogPlusPlus

data1 = [
    'hyperloglog', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
    'estimating', 'the', 'cardinality', 'of', 'dataset', 'dataset', 'a'
]
Code Example #7
File: __init__.py  Project: hscspring/sto
class Sto:
    def __init__(self,
                 value_format: str = 'h',
                 threshold: float = 0.8,
                 num_perm: int = 128,
                 num_part: int = 32,
                 tokenizer: Tokenizer = Tokenizer('zh')):
        self.value_format = value_format
        self.threshold = threshold
        self.num_perm = num_perm
        self.num_part = num_part
        self.tokenizer = tokenizer
        self.lsh = MinHashLSHEnsemble(threshold=self.threshold,
                                      num_perm=self.num_perm)
        self.record_dawg = dawg.RecordDAWG(self.value_format)

    def __check_get_store_path(self, data_path: Path):
        pnlp.check_dir(data_path)
        lsh_path = Path(data_path) / "lsh.pickle"
        dawg_path = Path(data_path) / "values.dawg"
        return lsh_path, dawg_path

    def load(self, data_path: Path):
        lsh_path, dawg_path = self.__check_get_store_path(data_path)
        if lsh_path.exists():
            self.lsh = load_pickle(lsh_path)
        else:
            raise ValueError("lsh pickle: {} not exist.".format(lsh_path))
        if dawg_path.exists():
            self.record_dawg.load(str(dawg_path))
        else:
            raise ValueError("dawg file: {} not exist.".format(dawg_path))

    def store(self, data_path: Path):
        lsh_path, dawg_path = self.__check_get_store_path(data_path)
        dump_pickle(lsh_path, self.lsh)
        self.record_dawg.save(dawg_path)

    def __check_value_format(self, val: tuple):
        if len(val) != len(self.value_format):
            raise ValueError(
                "value format {} does not match the value {}".format(
                    self.value_format, val))

    def add(self, text_list: List[str], value_list: List[tuple]):
        len_text = len(text_list)
        len_value = len(value_list)
        assert len_text == len_value
        data = {}
        entries = []
        for i, text in enumerate(text_list):
            entry = self.text_to_lsh_entry(text)
            key = entry[0]
            if key in data:
                continue
            value = value_list[i]
            self.__check_value_format(value)
            data[key] = value
            entries.append(entry)
        self.lsh.index(entries)
        self.record_dawg = dawg.RecordDAWG(self.value_format, data.items())

    def query(self, text: str):
        key, mh, length = self.text_to_lsh_entry(text)
        if key in self.record_dawg:
            return self.record_dawg.get(key)[0]

        for sim_key in self.lsh.query(mh, length):
            return self.record_dawg.get(sim_key)[0]
        return None

    def text_to_lsh_entry(self, text: str):
        words = self.tokenizer(text)
        bigrams = list(ngrams(words, 1, 2))
        wset = set(bigrams)
        mh = MinHash(num_perm=self.num_perm)
        for w in wset:
            mh.update(w.encode('utf8'))
        unicode_hash = hashlib.sha1(text.encode("utf8")).hexdigest()
        return (unicode_hash, mh, len(wset))

    def __getitem__(self, key: str):
        return self.query(key)

    def __setitem__(self, key: str, value: tuple):
        raise NotImplementedError

    def __contains__(self, key: str):
        if self.query(key):
            return True
        return False

    def __len__(self):
        return len(self.record_dawg.keys())
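
A minimal usage sketch for the Sto class above; the texts and value tuples are made up, and it assumes the default value_format 'h' plus the dawg, pnlp and Tokenizer dependencies used by the class:

sto = Sto()
sto.add(["这个程序代码太乱", "这个程序代码不规范"], [(1,), (2,)])
print(sto["这个程序代码太乱"])          # exact hit via the record DAWG
print(sto.query("这个程序代码有点乱"))  # LSH Ensemble lookup; None if below threshold
sto.store("sto_data")                    # writes lsh.pickle and values.dawg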
Code Example #8
def main():
    ss = int(sys.argv[1])
    filelist = sys.argv[2]
    GB = 1024 * 1024 * 1024
    t0 = time()
    if '--profile' in sys.argv or '--timing' in sys.argv:
        types, labels = defaultdict(set), defaultdict(
            set)  # save time for profiling test
    else:
        print('reading typeslabels.pkl ..', file=sys.stderr)
        f = open('typeslabels.pkl', 'rb')
        types, labels = pickle.load(f)
        f.close()
    t1 = time()
    print('labels in dbpedia and wikidata:', len(types), file=sys.stderr)
    print('labels read in %.1f seconds or %.1f minutes' % (t1 - t0,
                                                           (t1 - t0) / 60),
          file=sys.stderr)
    # sys.getsizeof does not work correctly, and pympler is WAY too slow
    # print('sizeof types %.3f GB  sizeof labels %.3f GB' % (sys.getsizeof(types)/GB, sys.getsizeof(labels)/GB,), file=sys.stderr)
    files = [f.strip() for f in open(filelist).readlines()]
    # process all files instead of sample?
    if ss == 0: ss = len(files)
    nrows = np.zeros(ss)
    ncols = np.zeros(ss)
    cols = {}
    svals = 0
    ne = 0
    rels = {}
    titles = {}
    typcnt = defaultdict(int)
    usecnt = defaultdict(int)
    vcnt = defaultdict(int)
    typsets = {}
    tabs = {}
    nval = {}
    stats = {}
    hdr = 0
    rtyp = defaultdict(int)  # consistent row types
    # pat = re.compile("^[0-9\.,/Q_-]*$")
    print('reading',
          ss,
          'out of total',
          len(files),
          'files from',
          filelist,
          file=sys.stderr)
    # random.seed(4713)
    sam = random.sample(files, ss)
    # sqlite default autocommit, even when scripted
    # we need one big transaction
    # otherwise autocommit --> each statement a separate transaction, VERY slow
    fout.write("begin transaction;\n")
    fout.write('DROP TABLE IF EXISTS val;\n')
    fout.write('DROP TABLE IF EXISTS tab;\n')
    fout.write('DROP TABLE IF EXISTS col;\n')
    fout.write('DROP TABLE IF EXISTS err;\n')
    fout.write('DROP TABLE IF EXISTS sel;\n')
    fout.write('DROP TABLE IF EXISTS sub;\n')
    fout.write('DROP TABLE IF EXISTS vcnt;\n')
    fout.write('DROP TABLE IF EXISTS lsh;\n')
    fout.write('DROP TABLE IF EXISTS cnts;\n')
    fout.write(
        'CREATE TABLE val (desc varchar, nvals int, wtyp int, frac float);\n')
    fout.write(
        'CREATE TABLE vcnt (val varchar, nv int, nt int, typs varchar);\n')
    fout.write(
        'CREATE TABLE tab (id int, rows int, cols int, head int, fn varchar, src varchar);\n'
    )
    fout.write(
        'CREATE TABLE col (tab int, col int, typ varchar(255), frac float, cov float);\n'
    )
    fout.write(
        'CREATE TABLE err (typ varchar, msg varchar, fn varchar, col int, src varchar);\n'
    )
    fout.write(
        'CREATE TABLE sel (tab int, col int, nval int, ndist int, sel int);\n')
    fout.write(
        'CREATE TABLE sub (l int, j int, k int, i int, comtyp int, fracsubset float, nchild int, nparent int, fnchild varchar, fnparent varchar);\n'
    )
    fout.write('CREATE TABLE lsh (k int, i int, l int, j int);\n')
    fout.write('CREATE TABLE cnts (desc varchar, cnt int);\n')
    # file for writing types of columns
    tsfile = open('typesets.csv', 'w', encoding='utf8', errors='ignore')
    tsfile.write('filename,column,fraction,type\n')
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('CSV Files:', ss))
    ne = 0
    nallcols = 0  # number of all columns
    # go through the files in the sample
    for k in range(ss):
        fn = sam[k]
        src = 'obd'  # fn[6:9]
        #  if src in ['kag', 'obd']: ftyp = 'csv'
        #  else: ftyp = 'json'
        res = readf(fn, 'csv')  # ftyp)
        if res == -1:
            ne += 1
            continue  # error reading file
        cols, h, nrows, ncols = res
        nallcols += len(cols)
        #
        # kaggle filenames contain ' !!!
        fout.write("insert into tab values (%d, %d, %d, %d, '%s', '%s');\n" %
                   (k, nrows, ncols, h, cleanfn(fn), src))
        nval[k] = {
        }  # lengths of columns as lists, after removing ignored elements
        tabs[k] = {}  # sets of col values, after removing ignored
        typsets[k] = {}
        stats[k] = {}
        ign = ('null', 'true', 'false', 't', 'f', 'yes', 'no', 'y', 'n',
               'none', 'na', 'n/a', 'nan', 'n.a.', 'male', 'female', 'm', 'f',
               'e')
        for i in range(len(cols)):
            # remove numeric and null-like
            lst = [
                x.strip() for x in cols[i] if len(x.strip()) > 0
                and not isnumber(x.strip()) and not x.strip().lower() in ign
            ]
            if len(lst) == 0:
                err('col', 'all num or null', fn, i)
                continue
            # log value counts
            for x in lst:
                vcnt[x] += 1
            # only need list length and set for finding reference cols, not whole list: save lots of memory
            nval[k][i] = len(lst)
            tabs[k][i] = set(lst)
            # selectivity
            sel = len(tabs[k][i]) / len(lst)
            fout.write(
                "insert into sel (tab, col, nval, ndist, sel) values(%d, %d, %d, %d, %f);\n"
                % (k, i, len(lst), len(tabs[k][i]), sel))
            # None types
            tsets = [types[x] for x in lst]
            ts = set.union(*tsets)
            typsets[k][i] = set()
            # all stats useless, performance worse
            if '--stats' in sys.argv:
                stats[k][i] = {}
                lens = [len(x) for x in lst]
                stats[k][i]['lmin'] = min(lens)
                stats[k][i]['lmax'] = max(lens)
                stats[k][i]['case'] = getcase(lst)
                stats[k][i]['alph'] = getalph(lst)
            for t in ts:
                # for more than one known value of type t: fraction of column values that are of type t
                if len(labels[t]) < 2: continue
                f = sum([int(t in s) for s in tsets]) / len(lst)
                # type coverage: fraction of col values of type t in relation to all known values of this type
                tset = set([x for x in lst
                            if t in types[x]])  # col vals of type t
                cov = len(tset) / len(labels[t])
                fout.write(
                    "insert into col (tab, col, typ, frac, cov) values (%d, %d, '%s', %f, %f);\n"
                    % (k, i, t.replace("'", "").replace('"', ''), f, cov))
                if f >= 0.5:
                    typsets[k][i].add(t)
                    if t != 'www.w3.org/2002/07/owl#Thing':
                        # column i in file fn has type t for at least fraction f of elements (set)
                        tsfile.write("%s,%d,%.3f,%s\n" % (fn, i, f, t))
            # discard owl#Thing: it messes up results since every entity is a Thing
            typsets[k][i].discard('www.w3.org/2002/07/owl#Thing')
    t2 = time()
    print('files read in %.1f seconds or %.1f minutes' % (t2 - t1,
                                                          (t2 - t1) / 60),
          file=sys.stderr)
    # some statistics on columns and values (in the sets): number columns, number of values, length of values
    nc = sum([len(tabs[k]) for k in tabs])
    nv = sum([len(tabs[k][i]) for k in tabs for i in tabs[k]])
    lv = sum([len(s) for k in tabs for i in tabs[k] for s in tabs[k][i]])
    print(
        'number of tables: %d  cols: %d  avg col size (set): %.1f  avg item len: %.1f'
        % (len(tabs), nc, nv / nc, lv / nv))
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('Import Errors:', ne))
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('All Tables:', len(tabs)))
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('All Columns:', nallcols))
    # print("brk:", brk) # no gain
    # print('sizeof tabs %.3f GB  sizeof typsets %.3f GB' % (sys.getsizeof(tabs)/GB, sys.getsizeof(typsets)/GB,), file=sys.stderr)
    # value and type counts, most frequent only
    for k in sorted(vcnt, key=vcnt.get, reverse=True)[:100]:
        fout.write(
            "insert into vcnt (val, nv, nt, typs) values ('%s', %d, %d, '%s');\n"
            % (k.replace("'", ""), vcnt[k], len(types[k]), ', '.join(
                list(types[k])[:5])))
    # overall statistics on values and types
    nv, wt = sum([vcnt[k] for k in vcnt
                  ]), sum([vcnt[k] * int(len(types[k]) > 0) for k in vcnt])
    fout.write(
        "insert into val (desc, nvals, wtyp, frac) values ('%s', %d, %d, %f);\n"
        % ('Values', nv, wt, wt / nv))
    uv, wt = len(vcnt), sum([int(len(types[k]) > 0) for k in vcnt])
    fout.write(
        "insert into val (desc, nvals, wtyp, frac) values ('%s', %d, %d, %f);\n"
        % ('Unique', uv, wt, wt / uv))
    #
    if '--refs' in sys.argv:
        print("finding reference columns..", file=sys.stderr)
        num_cpus = 8  # psutil.cpu_count(logical=False) # weird problems with that
        print("cpus:", num_cpus)
        print("init ray..", file=sys.stderr)
        # Starting Ray with .. GiB memory available for workers and up to .. GiB for objects.
        # ray.init(memory=<bytes>, object_store_memory=<bytes>).
        ray.init(num_cpus=num_cpus,
                 memory=9 * 1024 * 1024 * 1024,
                 object_store_memory=45 * 1024 * 1024 * 1024)
        print("put data into shared mem..", file=sys.stderr)
        tabs_id = ray.put(tabs)
        nval_id = ray.put(nval)
        stats_id = ray.put(stats)
        typsets_id = ray.put(typsets)
        sam_id = ray.put(sam)
        print("start parallel..", file=sys.stderr)
        # split task by assigning lists of keys in tabs to check. this will block until all are ready.
        sql = ray.get([
            refcols.remote(tabs_id, nval_id, stats_id, typsets_id, sam_id, tx)
            for tx in np.array_split(list(tabs.keys()), num_cpus)
        ])
        print("parallel section done.", file=sys.stderr)
        # write to file here, NOT in individual tasks
        for s in sql:
            fout.write(s)
        t3 = time()
        print('references done in %.1f seconds or %.1f minutes' %
              (t3 - t2, (t3 - t2) / 60),
              file=sys.stderr)
    else:
        t3 = t2  # no --refs pass; keep the timing chain intact for --lsh below
    #
    if '--lsh' in sys.argv:
        # locality sensitive hashing for minHashes of column value sets
        print('build hashes..', file=sys.stderr)
        # lsh = MinHashLSH(threshold=0.75, num_perm=128)
        lsh = MinHashLSHEnsemble(threshold=0.9, num_perm=128, num_part=32)
        mh = {}
        for k in tabs:
            mh[k] = {}
            for i in tabs[k]:
                mh[k][i] = MinHash(num_perm=128)
                for d in tabs[k][i]:
                    mh[k][i].update(d.encode('utf8'))
        #   lsh.insert((k,i), mh[k][i])
        lsh.index([((k, i), mh[k][i], len(tabs[k][i])) for k in tabs
                   for i in tabs[k]])
        t4 = time()
        print('hashes built in %.1f seconds or %.1f minutes' %
              (t4 - t3, (t4 - t3) / 60),
              file=sys.stderr)
        # similar cols according to lsh
        print("query lsh..")
        for k in tabs:
            for i in tabs[k]:
                #     for l, j in lsh.query(mh[k][i]):
                for l, j in lsh.query(mh[k][i], len(tabs[k][i])):
                    if l == k: continue  # same table
                    # l,j is the parent!
                    # min size for ref tab
                    if len(tabs[l][j]) < 10: continue
                    # selectivity must be 1
                    if len(tabs[l][j]) != nval[l][j]: continue
                    if len(tabs[k][i]) < 10 and len(tabs[l][j]) < 10:
                        print("TABS", k, i, tabs[k][i])
                        print("SIMT", l, j, tabs[l][j])
                    fout.write(
                        "insert into lsh (k, i, l, j) values (%d, %d, %d, %d);\n"
                        % (l, j, k, i))
    else:
        t4 = time()
        # need content for query?
        # fout.write("insert into lsh (k, i, l, j) values (%d, %d, %d, %d);\n" % (0, 0, 0, 0))
    t5 = time()
    print('hash queries done in %.1f seconds or %.1f minutes' %
          (t5 - t4, (t5 - t4) / 60),
          file=sys.stderr)
    #
    fout.write("commit;\n")
    fout.close()
    tsfile.close()
    print('total run time     %.1f seconds or %.1f minutes' % (t5 - t0,
                                                               (t5 - t0) / 60),
          file=sys.stderr)
    sys.exit(0)
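
The --lsh branch above boils down to: build one MinHash per column, index (table, column) keys together with their distinct-value counts, then query each column for candidate parent columns that contain it. A stripped-down sketch of that idea with made-up tables (not part of the original script):

from datasketch import MinHash, MinHashLSHEnsemble

tables = {
    "orders":    {"customer_id": {"1", "2", "3", "5"}},
    "customers": {"id": {"1", "2", "3", "4", "5", "6"}},
}

# One MinHash per column
minhashes = {}
for tab, cols in tables.items():
    for col, values in cols.items():
        m = MinHash(num_perm=128)
        for v in values:
            m.update(v.encode("utf8"))
        minhashes[(tab, col)] = m

lsh = MinHashLSHEnsemble(threshold=0.9, num_perm=128, num_part=32)
lsh.index([(key, m, len(tables[key[0]][key[1]]))
           for key, m in minhashes.items()])

# A hit means the queried column is (approximately) contained in the indexed
# column, i.e. a candidate child -> parent reference.
child = ("orders", "customer_id")
for parent in lsh.query(minhashes[child], len(tables[child[0]][child[1]])):
    if parent != child:
        print(child, "likely references", parent)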
Code Example #9
from datasketch import MinHash, MinHashLSH, MinHashLSHEnsemble
data1 = ['这个', '程序', '代码', '太乱', '那个', '代码', '规范']
data2 = ['这个', '程序', '代码', '不', '规范', '那个', '更', '规范']
data3 = ['这个', '程序', '代码', '不', '规范', '那个', '规范', '些']

# Create MinHash objects
m1 = MinHash()
m2 = MinHash()
m3 = MinHash()
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))
# Create the LSH Ensemble index
lshensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128)
# Index takes an iterable of (key, minhash, size)
lshensemble.index([("m2", m2, len(data2)), ("m3", m3, len(data3))])
# Check whether m2 and m3 have been indexed in lshensemble
print("m2" in lshensemble)
print("m3" in lshensemble)
# Query for sets whose containment with m1 exceeds the 0.8 threshold
print("Sets with containment > 0.8 with m1:")
for key in lshensemble.query(m1, len(data1)):
    print(key)
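
For reference, the threshold in this example applies to containment rather than Jaccard similarity; the exact value can be computed directly from the token lists (a small sketch, not part of the original example):

def containment(query, other):
    # Containment of query in other: |Q ∩ X| / |Q|
    q, x = set(query), set(other)
    return len(q & x) / len(q)

print(containment(data1, data2))  # 5/6 of data1's distinct tokens appear in data2
print(containment(data1, data3))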