Example #1
def benchmark_lshensemble(threshold, num_perm, num_part, m, index_data,
                          query_data):
    print("Building LSH Ensemble index")
    (minhashes, indexed_sets, keys) = index_data
    lsh = MinHashLSHEnsemble(threshold=threshold,
                             num_perm=num_perm,
                             num_part=num_part,
                             m=m)
    lsh.index((key, minhash, len(s))
              for key, minhash, s in
              zip(keys, minhashes[num_perm], indexed_sets))
    print("Querying")
    (minhashes, sets, keys) = query_data
    probe_times = []
    process_times = []
    results = []
    for qs, minhash in zip(sets, minhashes[num_perm]):
        # Record probing time
        start = time.perf_counter()
        result = list(lsh.query(minhash, len(qs)))
        probe_times.append(time.perf_counter() - start)
        # Record post processing time.
        start = time.perf_counter()
        [_compute_containment(qs, indexed_sets[key]) for key in result]
        process_times.append(time.perf_counter() - start)
        results.append(result)
        sys.stdout.write("\rQueried {} sets".format(len(results)))
    sys.stdout.write("\n")
    return results, probe_times, process_times
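Examples #1 and #5 both rely on a _compute_containment helper that is not shown on this page. A minimal sketch, assuming the indexed and query sets are plain Python sets:

def _compute_containment(query_set, indexed_set):
    # Containment of the query in the indexed set: |Q ∩ X| / |Q|.
    # An empty query is treated as containment 0.
    if not query_set:
        return 0.0
    return len(query_set & indexed_set) / len(query_set)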
Example #2
def load_LSH(lib_profiles,
             mode=MODE.SCALABLE,
             repackage=False,
             processes=None):
    """Load library profiles to an LSH object.
    
    Args:
        lib_profiles (list): The list of library profiles.
        mode (<enum 'MODE'>, optional): Defaults to MODE.SCALABLE. The detection mode. Either MODE.ACCURATE or MODE.SCALABLE. See the paper for more details.
        repackage (bool, optional): Defaults to False. Should LibID consider class repackaging? This should only be enabled if class repackaging is known to have been applied.
        processes (int, optional): Defaults to None. The number of processes to use. If processes is None then the number returned by cpu_count() is used.
    """

    global LSH, LIB_RELATIONSHIP_GRAPHS

    weights = (0.5, 0.5) if repackage else (0.1, 0.9)
    LSH = MinHashLSHEnsemble(threshold=LSH_THRESHOLD,
                             num_perm=LSH_PERM_NUM,
                             num_part=32,
                             weights=weights)

    (minhash_list,
     LIB_RELATIONSHIP_GRAPHS) = profiler.parallel_load_libs_profile(
         lib_profiles=lib_profiles,
         mode=mode,
         repackage=repackage,
         processes=processes)

    LOGGER.info("Start indexing LSH (this could take a while) ...")

    start_time = time.time()
    LSH.index(minhash_list)
    end_time = time.time()

    LOGGER.info("LSH indexed. Duration: %fs", end_time - start_time)
Example #3
    def cluster_obs_group(self, candidates):
        ensemble = MinHashLSHEnsemble(threshold=0.95, num_perm=128)
        ensemble.index((c,) + self.hashes[c]
                       for c in candidates
                       if self.hashes[c][1] > 0)

        clusters = []
        while candidates:
            rep = candidates.pop()
            clus = [rep]
            h, l = self.hashes[rep]
            if l == 0:
                # An empty representative will cause division by
                # zero in ensemble.query(); instead, group it with
                # all the other empty candidates, and no others.
                for other in list(candidates):
                    if self.hashes[other][1] == 0:
                        clus.append(other)
                        candidates.discard(other)
            else:
                for other in ensemble.query(h, l):
                    if other in candidates:
                        clus.append(other)
                        candidates.discard(other)
            clusters.append(clus)
        return clusters
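This method assumes self.hashes maps each candidate to a (MinHash, set_size) pair. A minimal sketch of how such a mapping could be built; tokens_for is a placeholder for whatever produces a candidate's token set:

from datasketch import MinHash

def build_hashes(candidates, tokens_for, num_perm=128):
    # Map each candidate to (MinHash, set size), the shape cluster_obs_group expects.
    hashes = {}
    for c in candidates:
        tokens = set(tokens_for(c))
        m = MinHash(num_perm=num_perm)
        for t in tokens:
            m.update(t.encode('utf8'))
        hashes[c] = (m, len(tokens))
    return hashes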
Example #4
    def __init__(self,
                 value_format: str = 'h',
                 threshold: float = 0.8,
                 num_perm: int = 128,
                 num_part: int = 32,
                 tokenizer: Tokenizer = Tokenizer('zh')):
        self.value_format = value_format
        self.threshold = threshold
        self.num_perm = num_perm
        self.num_part = num_part
        self.tokenizer = tokenizer
        self.lsh = MinHashLSHEnsemble(threshold=self.threshold,
                                      num_perm=self.num_perm)
        self.record_dawg = dawg.RecordDAWG(self.value_format)
Example #5
def benchmark_lshensemble(threshold, num_perm, num_part, l, index_data,
                          query_data):
    print("Building LSH Ensemble index")
    lsh = MinHashLSHEnsemble(threshold=threshold,
                             num_perm=num_perm,
                             num_part=num_part,
                             l=l)
    lsh.index((key, minhash, len(s))
              for key, minhash, s in
              zip(index_data.keys, index_data.minhashes[num_perm], index_data.sets))
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; use a monotonic performance counter.
        start = time.perf_counter()
        result = list(lsh.query(minhash, len(qs)))
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(
            sorted([[key, _compute_containment(qs, index_data.sets[key])]
                    for key in result],
                   key=lambda x: x[1],
                   reverse=True))
    return times, results
Example #6
    def exec_comparison_heuristic(self, source_func: BDFunction, target_func: BDFunction) \
            -> bool:

        # Populate the attribute values
        source_hash = self.loaded_attributes[
            'FunctionMinHashLSH'].extract_attribute(source_func)

        target_hash = self.loaded_attributes[
            'FunctionMinHashLSH'].extract_attribute(target_func)

        # Create an LSH Ensemble index with threshold and number of partition
        # settings.
        lshensemble = MinHashLSHEnsemble(
            threshold=Configuration.MINHASH_LSH_ENSEMBLE_THRESHOLD,
            num_perm=Configuration.MINHASH_PERMUTATIONS,
            num_part=Configuration.MINHASH_LSH_ENSEMBLE_PARTITIONS)

        lshensemble.index([("source_function", source_hash, len()),
                           ("m3", m3, len(set3))])

        if source_hash == target_hash:
            return True

        return False
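As written, the ensemble built above is never queried; the heuristic returns based only on MinHash equality. A sketch of a containment-based variant under the same assumptions (the set size again comes from the MinHash cardinality estimate):

def containment_match(lshensemble, target_hash):
    # Query the indexed source function with the target's MinHash; a hit means the
    # target's features are largely contained in the source's.
    candidates = lshensemble.query(target_hash, int(target_hash.count()))
    return "source_function" in candidates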
Example #7
    def __init__(self, hash_type=None, bits=None, hash_func=None, params=None):
        self.hash_type = hash_type
        self.hash_func = hash_func
        self.hash_bits = bits
        self.hashfunc = sha1_hash32
        if self.hash_bits in {32, "32", None}:
            if self.hash_func == "mmh3":
                self.hashfunc = mmh3.hash
            elif self.hash_func == "farmhash":
                self.hashfunc = farmhash.hash32
            elif self.hash_func == "xxhash":
                self.hashfunc = xxhash.xxh32
            else:
                # "hash32","default":
                self.hashfunc = sha1_hash32

        elif self.hash_bits in {64, "64"}:
            if self.hash_func == "mmh3":
                self.hashfunc = mmh3.hash64
            elif self.hash_func == "farmhash":
                self.hashfunc = farmhash.hash64
            elif self.hash_func == "xxhash":
                self.hashfunc = xxhash.xxh64
            else:
                self.hashfunc = sha1_hash64

        elif self.hash_bits in {128, "128"}:
            if self.hash_func == "mmh3":
                self.hashfunc = mmh3.hash128
            elif self.hash_func == "farmhash":
                self.hashfunc = farmhash.hash128
            else:
                raise ValueError("请检查对应的hash函数类型与位数")

        else:
            raise ValueError("请检查对应的hash函数的位数")

        if not params:
            params = {}
        """
        若只用redis 作为存储截止
        配置
        storage_config={  
        'type': 'redis',
        'redis': {'host': 'localhost', 'port': 6379},
        }
                
        要顺序插入大量MinHash,建议使用插入会话。这样可以减少批量插入过程中的网络呼叫数量。
        data_list = [("m1", m1), ("m2", m2), ("m3", m3)]
        with lsh.insertion_session() as session:
            for key, minhash in data_list:
                session.insert(key, minhash)
        请注意,在打开插入会话期间查询LSH对象可能会导致不一致。
        
        MinHash LSH还支持Cassandra群集作为存储层。为您的LSH使用长期存储可解决应用程序需要不断更新LSH对象的所有用例(例如,
        当您使用MinHashLSH逐步对文档进行群集时)。
        Cassandra存储选项可以配置如下:
        
         storage_config={
        'type': 'cassandra',
        'cassandra': {
            'seeds': ['127.0.0.1'],
            'keyspace': 'lsh_test',
            'replication': {
                'class': 'SimpleStrategy',
                'replication_factor': '1',
            },
            'drop_keyspace': False,
            'drop_tables': False,
        }}
        参数Seeds指定可以联系以连接到Cassandra集群的种子节点列表。选项键空间和复制指定创建键空间(如果尚不存在)时要使用的参数。
        如果要强制创建表或键空间(因此要删除现有表或键空间),请将drop_tables和drop_keyspace选项设置为 True。
        像Redis副本一样,建议使用插入会话来减少批量插入期间的网络调用数量。
        
        +-----------------------连接到现有的最小哈希LSH-------------------------------------+ 
        如果您的LSH使用外部存储层(例如Redis),则可以跨多个进程共享它。有两种方法可以做到这一点:
        
        推荐的方法是使用“酸洗”。MinHash LSH对象是可序列化的,因此您可以调用pickle:
        
        import pickle
        
        # Create your LSH object
        lsh = ...
        # Serialize the LSH
        data = pickle.dumps(lsh)
        # Now you can pass it as an argument to a forked process or simply save it
        # in an external storage.
        
        # In a different process, deserialize the LSH
        lsh = pickle.loads(data)
        使用pickle,您可以保存有关LSH所需的所有知识,例如在一个位置中进行各种参数设置。
        另外,您可以在首次创建LSH时在存储配置中指定基本名称。例如:

        # For Redis.
        lsh = MinHashLSH(
            threshold=0.5, num_perm=128, storage_config={
                'type': 'redis',
                'basename': b'unique_name_6ac4fg',
                'redis': {'host': 'localhost', 'port': 6379},
            }
        )
        
         # For Cassandra.
         lsh = MinHashLSH(
            threashold=0.5, num_perm=128, storage_config={
                'type': 'cassandra',
                'basename': b'unique_name',
                'cassandra': {
                    'seeds': ['127.0.0.1'],
                    'keyspace': 'lsh_test',
                    'replication': {
                        'class': 'SimpleStrategy',
                        'replication_factor': '1',
                    },
                    'drop_keyspace': False,
                    'drop_tables': False,
                }
            }
        )
        的基名将用于生成在所述存储层中唯一地标识与该LSH相关联的数据键前缀。因此,如果使用相同的基名创建新的LSH对象,则将在与旧LSH关联的存储层中使用相同的基础数据。
        
        如果不指定basename,则MinHash LSH将生成一个随机字符串作为基本名称,并且极不可能发生冲突。
        
        更详细的使用见 文档 :http://ekzhu.com/datasketch/lsh.html
        """

        if self.hash_type in {"minhash", "MinHash"}:
            # Estimates Jaccard similarity: a small, fixed amount of memory is used to
            # estimate the Jaccard similarity between sets of arbitrary size in linear time.
            self.hash = MinHash(
                # int, optional; ignored if hashvalues is not None. The number of random
                # permutation functions controls the accuracy of the estimate.
                num_perm=params.get("num_perm", 128),
                seed=params.get("seed", 1),  # optional random seed
                # optional hash function, applied to the input passed to update();
                # it must return an integer encodable in 32 bits
                hashfunc=self.hashfunc,
                hashobj=params.get("hashobj", None),  # deprecated; superseded by hashfunc
                hashvalues=params.get("hashvalues", None),  # optional array or list
                # optional permutation parameters; the existing state of another MinHash
                # can be passed here for fast initialization
                permutations=params.get("permutations", None))
        elif self.hash_type in {
                "weightedminhashlsh", "mhlsh", "WeightedMinHashLSH", "wmhlsh",
                "MinHashLSH"
        }:  # weighted MinHash locality-sensitive hashing
            # WeightedMinHashLSH() is equivalent to MinHashLSH: weighted Jaccard similarity
            # queries. Top-k queries are not supported; MinHashLSHForest supports top-k.
            self.hash = MinHashLSH(
                threshold=params.get("threshold", 0.9),  # Jaccard similarity threshold
                # number of permutation functions, i.e. the sample size of the weighted MinHash
                num_perm=params.get("num_perm", 128),
                # tuple, optional; tunes the false-positive/false-negative trade-off
                # around the Jaccard threshold
                weights=params.get("weights", (0.5, 0.5)),
                params=params.get("params", None),  # tuple, optional: number and size of bands
                storage_config=params.get("storage_config", None),  # storage configuration
                prepickle=params.get("prepickle", None))  # pickle keys before storing them
        elif self.hash_type in {"leanminhash", "lmh", "LeanMinHash", "LMH"}:
            # A leaner hash with a smaller memory footprint than MinHash.
            self.hash = LeanMinHash(minhash=params.get("minhash", None),
                                    seed=params.get("seed", None),
                                    hashvalues=params.get("hashvalues", None))

        elif self.hash_type in {
                "MinHashLSHForest", "minhashlshforest", "mhlshf", "MHLSHF"
        }:
            self.hash = MinHashLSHForest(num_perm=params.get("num_perm", 128),
                                         l=params.get("l", 8))

        elif self.hash_type in {
                "MinHashLSHEnsemble", "minhashlshensemble", "mhlshe", "MHLSHE"
        }:
            # MinHash LSH built around a different measure: containment.
            self.hash = MinHashLSHEnsemble(
                threshold=params.get("threshold", 0.9),
                num_perm=params.get("num_perm", 128),
                num_part=params.get("num_part", 16),
                m=params.get("m", 8),
                weights=params.get("weights", (0.5, 0.5)),
                storage_config=params.get("storage_config", None),
                prepickle=params.get("prepickle", None))

        elif self.hash_type in {"HyperLogLog", "hyperloglog", "hll", "HLL"}:
            # 相关的接口与HyperLogLog 相同
            # HyperLogLog能够使用较小且固定的内存空间,单次估算数据集的基数(不同值的数量)
            self.hash = HyperLogLog(
                p=params.get("p", 8),  #  与MinHash 中的数据相比较,num_perm  用于控制精度
                reg=params.get("reg", None),
                hashfunc=params.get("hashfunc", sha1_hash32),  # 内部使用的hash 算法
                hashobj=params.get("hashobj",
                                   None))  # 可选 数组或列表,  使用hashfunc 代替了

        elif self.hash_type in {
                "hyperloglogplusplus", "HyperLogLogPlusPlus", "HyperLogLog++",
                "hyperloglog++", "HLLPP", "hllpp", "HLL++", "hll++"
        }:
            # Same interface as HyperLogLog.
            self.hash = HyperLogLogPlusPlus(
                p=params.get("p", 8),
                reg=params.get("reg", None),
                hashfunc=params.get("hashfunc", sha1_hash64),  # 64-bit hash function used internally
                hashobj=params.get("hashobj", None))

        else:
            raise ValueError("Please choose a valid hash type")
Example #8
    "rocket", "bike", "scooter", "motorcyle", "SUV", "jet", "horse"
])

# Create MinHash objects
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in set1:
    m1.update(d.encode('utf8'))
for d in set2:
    m2.update(d.encode('utf8'))
for d in set3:
    m3.update(d.encode('utf8'))

# Create an LSH Ensemble index with a threshold
lshensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128)

# Index takes an iterable of (key, minhash, size)
lshensemble.index([("m2", m2, len(set2)), ("m3", m3, len(set3))])

# Check for membership using the key
print("m2" in lshensemble)
print("m3" in lshensemble)

# Using m1 as the query, get a result iterator
print("Sets with containment > 0.8:")
for key in lshensemble.query(m1, len(set1)):
    print(key)
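Because LSH Ensemble answers are approximate, the returned keys can be double-checked against the exact containment |Q ∩ X| / |Q|; a minimal check, assuming set1, set2, and set3 above are plain Python sets:

# Exact containment of the query set1 in each indexed set, for comparison
# with the approximate LSH Ensemble result above.
for name, s in [("m2", set2), ("m3", set3)]:
    print(name, "exact containment:", round(len(set1 & s) / len(set1), 3))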

Example #9
def main(corpus: str, mode: str, lsh_file: str, minhash_file: str,
         num_perm: int, shingles: int, threshold: float, n_jobs: int,
         output_dir: str):
    if mode != 'query':
        assert not lsh_file
    if mode == 'minhash-only':
        assert not minhash_file

    print("Making output dir:", output_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir()

    if minhash_file:
        print('Loading MinHashes from disk:', minhash_file)
        start = time.time()
        with open(minhash_file, 'rb') as f:
            cached_minhashes = pickle.load(f)
        print("Done loading MinHashes, time elapsed (sec):",
              time.time() - start)

        corpus_len = len(cached_minhashes)
        minhash_iter = starmap(lambda k, v: (k, v, None),
                               cached_minhashes.items())
        minhashes = None  # Set to none to disable saving minhashes again
    else:
        if corpus == 'webtext':
            corpus_len = 8_282_020
            corpus_iter = make_corpus_iter(DATA_DIR / 'webtext_detokenized')
        elif corpus == 'openwebtext':
            corpus_len = 8_013_769
            corpus_iter = make_corpus_iter(DATA_DIR / 'openwebtext_shards')
        else:
            raise RuntimeError
        print("Using", n_jobs, "processes for MinHashing")

        minhashes = {}
        minhash_iter = parallel_create_minhashes(corpus_iter,
                                                 shingles=shingles,
                                                 num_perm=num_perm,
                                                 n_jobs=n_jobs)

    print("Starting...")
    if mode == 'lsh' or mode == 'lsh-ensemble':
        if mode == 'lsh':
            lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
            with lsh.insertion_session() as session:
                for key, minhash, size in tqdm(minhash_iter,
                                               total=corpus_len,
                                               desc='Making MinHashLSH'):
                    if minhashes:
                        minhashes[key] = minhash
                    session.insert(
                        key, minhash,
                        check_duplication=False)  # All keys are unique doc ids
        else:
            assert mode == 'lsh-ensemble'
            lsh = MinHashLSHEnsemble(threshold=threshold,
                                     num_perm=num_perm,
                                     num_part=16)  # TODO: try 32
            lsh.index(
                tqdm(minhash_iter,
                     total=corpus_len,
                     desc='Making MinHashLSHEnsemble'))

        # Save LSH
        print("Saving LSH...")
        start = time.time()
        with open(output_dir / 'lsh.pkl', 'wb') as f:
            pickle.dump(lsh, f)
        print("Done saving LSH, time elapsed (sec):", time.time() - start)
    elif mode == 'query':
        print('Loading LSH:', lsh_file)
        start = time.time()
        with open(lsh_file, 'rb') as f:
            lsh = pickle.load(f)
        assert isinstance(lsh, MinHashLSH) and lsh.h == num_perm
        print("Done loading LSH, time elapsed (sec):", time.time() - start)

        duplicates_file = output_dir / 'duplicates.jsonl'
        print("Writing duplicates to", duplicates_file)
        with open(duplicates_file, 'a') as f:
            for key, minhash, size in tqdm(minhash_iter,
                                           total=corpus_len,
                                           desc='Querying MinHashLSH'):
                if minhashes:
                    minhashes[key] = minhash
                duplicates = lsh.query(minhash)
                if duplicates:
                    json.dump({key: duplicates}, f)
                    f.write('\n')
    elif mode == 'minhash-only':
        assert minhashes is not None
        for key, minhash, size in tqdm(minhash_iter,
                                       total=corpus_len,
                                       desc='MinHashing'):
            minhashes[key] = minhash
    else:
        raise RuntimeError

    # Save MinHashes
    if not minhash_file:
        print("Saving MinHashes...")
        start = time.time()
        with open(output_dir / 'minhashes.pkl', 'wb') as f:
            pickle.dump(minhashes, f)
        print("Done saving MinHashes, time elapsed (sec):",
              time.time() - start)
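The helpers make_corpus_iter and parallel_create_minhashes are project-specific and not shown here. A minimal single-process sketch of what the MinHashing step amounts to; the character-shingling scheme is an assumption:

from datasketch import MinHash

def create_minhash(doc_id, text, shingles=5, num_perm=128):
    # Hash the document's character shingles into a MinHash and return the
    # (key, minhash, size) triple that the LSH / LSH Ensemble code above expects.
    grams = {text[i:i + shingles] for i in range(max(1, len(text) - shingles + 1))}
    m = MinHash(num_perm=num_perm)
    for g in grams:
        m.update(g.encode('utf8'))
    return doc_id, m, len(grams)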
Example #10
class Sto:
    def __init__(self,
                 value_format: str = 'h',
                 threshold: float = 0.8,
                 num_perm: int = 128,
                 num_part: int = 32,
                 tokenizer: Tokenizer = Tokenizer('zh')):
        self.value_format = value_format
        self.threshold = threshold
        self.num_perm = num_perm
        self.num_part = num_part
        self.tokenizer = tokenizer
        self.lsh = MinHashLSHEnsemble(threshold=self.threshold,
                                      num_perm=self.num_perm)
        self.record_dawg = dawg.RecordDAWG(self.value_format)

    def __check_get_store_path(self, data_path: Path):
        pnlp.check_dir(data_path)
        lsh_path = Path(data_path) / "lsh.pickle"
        dawg_path = Path(data_path) / "values.dawg"
        return lsh_path, dawg_path

    def load(self, data_path: Path):
        lsh_path, dawg_path = self.__check_get_store_path(data_path)
        if lsh_path.exists():
            self.lsh = load_pickle(lsh_path)
        else:
            raise ValueError("lsh pickle: {} not exist.".format(lsh_path))
        if dawg_path.exists():
            self.record_dawg.load(str(dawg_path))
        else:
            raise ValueError("dawg file: {} not exist.".format(dawg_path))

    def store(self, data_path: Path):
        lsh_path, dawg_path = self.__check_get_store_path(data_path)
        dump_pickle(lsh_path, self.lsh)
        self.record_dawg.save(dawg_path)

    def __check_value_format(self, val: tuple):
        if len(val) != len(self.value_format):
            raise ValueError(
                "value format {} does not match the value {}".format(
                    self.value_format, val))

    def add(self, text_list: List[str], value_list: List[tuple]):
        len_text = len(text_list)
        len_value = len(value_list)
        assert len_text == len_value
        data = {}
        entries = []
        for i, text in enumerate(text_list):
            entry = self.text_to_lsh_entry(text)
            key = entry[0]
            if key in data:
                continue
            value = value_list[i]
            self.__check_value_format(value)
            data[key] = value
            entries.append(entry)
        self.lsh.index(entries)
        self.record_dawg = dawg.RecordDAWG(self.value_format, data.items())

    def query(self, text: str):
        key, mh, length = self.text_to_lsh_entry(text)
        if key in self.record_dawg:
            return self.record_dawg.get(key)[0]

        for sim_key in self.lsh.query(mh, length):
            return self.record_dawg.get(sim_key)[0]
        return None

    def text_to_lsh_entry(self, text: str):
        words = self.tokenizer(text)
        bigrams = list(ngrams(words, 1, 2))
        wset = set(bigrams)
        mh = MinHash(num_perm=self.num_perm)
        for w in wset:
            mh.update(w.encode('utf8'))
        unicode_hash = hashlib.sha1(text.encode("utf8")).hexdigest()
        return (unicode_hash, mh, len(wset))

    def __getitem__(self, key: str):
        return self.query(key)

    def __setitem__(self, key: str, value: tuple):
        raise NotImplementedError

    def __contains__(self, key: str):
        if self.query(key):
            return True
        return False

    def __len__(self):
        return len(self.record_dawg.keys())
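A short usage sketch of the Sto class above; the texts, values, and storage path are placeholders:

# Index two texts with 'h'-format (single short integer) values, persist the index,
# then look up a new but similar text, which falls back to the LSH Ensemble query.
sto = Sto(value_format='h', threshold=0.8)
sto.add(["今天天气很好", "明天天气也不错"], [(1,), (2,)])
sto.store("./sto_data")
print(sto.query("今天天气真好"))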
Example #11
def main():
    ss = int(sys.argv[1])
    filelist = sys.argv[2]
    GB = 1024 * 1024 * 1024
    t0 = time()
    if '--profile' in sys.argv or '--timing' in sys.argv:
        types, labels = defaultdict(set), defaultdict(
            set)  # save time for profiling test
    else:
        print('reading typeslabels.pkl ..', file=sys.stderr)
        f = open('typeslabels.pkl', 'rb')
        types, labels = pickle.load(f)
        f.close()
    t1 = time()
    print('labels in dbpedia and wikidata:', len(types), file=sys.stderr)
    print('labels read in %.1f seconds or %.1f minutes' % (t1 - t0,
                                                           (t1 - t0) / 60),
          file=sys.stderr)
    # sys.getsizeof does not work correctly, and pympler is WAY too slow
    # print('sizeof types %.3f GB  sizeof labels %.3f GB' % (sys.getsizeof(types)/GB, sys.getsizeof(labels)/GB,), file=sys.stderr)
    files = [f.strip() for f in open(filelist).readlines()]
    # process all files instead of sample?
    if ss == 0: ss = len(files)
    nrows = np.zeros(ss)
    ncols = np.zeros(ss)
    cols = {}
    svals = 0
    ne = 0
    rels = {}
    titles = {}
    typcnt = defaultdict(int)
    usecnt = defaultdict(int)
    vcnt = defaultdict(int)
    typsets = {}
    tabs = {}
    nval = {}
    stats = {}
    hdr = 0
    rtyp = defaultdict(int)  # consistent row types
    # pat = re.compile("^[0-9\.,/Q_-]*$")
    print('reading',
          ss,
          'out of total',
          len(files),
          'files from',
          filelist,
          file=sys.stderr)
    # random.seed(4713)
    sam = random.sample(files, ss)
    # sqlite default autocommit, even when scripted
    # we need one big transaction
    # otherwise autocommit --> each statement a separate transaction, VERY slow
    fout.write("begin transaction;\n")
    fout.write('DROP TABLE IF EXISTS val;\n')
    fout.write('DROP TABLE IF EXISTS tab;\n')
    fout.write('DROP TABLE IF EXISTS col;\n')
    fout.write('DROP TABLE IF EXISTS err;\n')
    fout.write('DROP TABLE IF EXISTS sel;\n')
    fout.write('DROP TABLE IF EXISTS sub;\n')
    fout.write('DROP TABLE IF EXISTS vcnt;\n')
    fout.write('DROP TABLE IF EXISTS lsh;\n')
    fout.write('DROP TABLE IF EXISTS cnts;\n')
    fout.write(
        'CREATE TABLE val (desc varchar, nvals int, wtyp int, frac float);\n')
    fout.write(
        'CREATE TABLE vcnt (val varchar, nv int, nt int, typs varchar);\n')
    fout.write(
        'CREATE TABLE tab (id int, rows int, cols int, head int, fn varchar, src varchar);\n'
    )
    fout.write(
        'CREATE TABLE col (tab int, col int, typ varchar(255), frac float, cov float);\n'
    )
    fout.write(
        'CREATE TABLE err (typ varchar, msg varchar, fn varchar, col int, src varchar);\n'
    )
    fout.write(
        'CREATE TABLE sel (tab int, col int, nval int, ndist int, sel int);\n')
    fout.write(
        'CREATE TABLE sub (l int, j int, k int, i int, comtyp int, fracsubset float, nchild int, nparent int, fnchild varchar, fnparent varchar);\n'
    )
    fout.write('CREATE TABLE lsh (k int, i int, l int, j int);\n')
    fout.write('CREATE TABLE cnts (desc varchar, cnt int);\n')
    # file for writing types of columns
    tsfile = open('typesets.csv', 'w', encoding='utf8', errors='ignore')
    tsfile.write('filename,column,fraction,type\n')
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('CSV Files:', ss))
    ne = 0
    nallcols = 0  # number of all columns
    # go through the files in the sample
    for k in range(ss):
        fn = sam[k]
        src = 'obd'  # fn[6:9]
        #  if src in ['kag', 'obd']: ftyp = 'csv'
        #  else: ftyp = 'json'
        res = readf(fn, 'csv')  # ftyp)
        if res == -1:
            ne += 1
            continue  # error reading file
        cols, h, nrows, ncols = res
        nallcols += len(cols)
        #
        # kaggle filenames contain ' !!!
        fout.write("insert into tab values (%d, %d, %d, %d, '%s', '%s');\n" %
                   (k, nrows, ncols, h, cleanfn(fn), src))
        nval[k] = {
        }  # lengths of columns as lists, after removing ignored elements
        tabs[k] = {}  # sets of col values, after removing ignored
        typsets[k] = {}
        stats[k] = {}
        ign = ('null', 'true', 'false', 't', 'f', 'yes', 'no', 'y', 'n',
               'none', 'na', 'n/a', 'nan', 'n.a.', 'male', 'female', 'm', 'f',
               'e')
        for i in range(len(cols)):
            # remove numeric and null-like
            lst = [
                x.strip() for x in cols[i] if len(x.strip()) > 0
                and not isnumber(x.strip()) and not x.strip().lower() in ign
            ]
            if len(lst) == 0:
                err('col', 'all num or null', fn, i)
                continue
            # log value counts
            for x in lst:
                vcnt[x] += 1
            # only need list length and set for finding reference cols, not whole list: save lots of memory
            nval[k][i] = len(lst)
            tabs[k][i] = set(lst)
            # selectivity
            sel = len(tabs[k][i]) / len(lst)
            fout.write(
                "insert into sel (tab, col, nval, ndist, sel) values(%d, %d, %d, %d, %f);\n"
                % (k, i, len(lst), len(tabs[k][i]), sel))
            # None types
            tsets = [types[x] for x in lst]
            ts = set.union(*tsets)
            typsets[k][i] = set()
            # all stats useless, performance worse
            if '--stats' in sys.argv:
                stats[k][i] = {}
                lens = [len(x) for x in lst]
                stats[k][i]['lmin'] = min(lens)
                stats[k][i]['lmax'] = max(lens)
                stats[k][i]['case'] = getcase(lst)
                stats[k][i]['alph'] = getalph(lst)
            for t in ts:
                # for more than one known value of type t: fraction of column values that are of type t
                if len(labels[t]) < 2: continue
                f = sum([int(t in s) for s in tsets]) / len(lst)
                # type coverage: fraction of col values of type t in relation to all known values of this type
                tset = set([x for x in lst
                            if t in types[x]])  # col vals of type t
                cov = len(tset) / len(labels[t])
                fout.write(
                    "insert into col (tab, col, typ, frac, cov) values (%d, %d, '%s', %f, %f);\n"
                    % (k, i, t.replace("'", "").replace('"', ''), f, cov))
                if f >= 0.5:
                    typsets[k][i].add(t)
                    if t != 'www.w3.org/2002/07/owl#Thing':
                        # column i in file fn has type t for at least fraction f of elements (set)
                        tsfile.write("%s,%d,%.3f,%s\n" % (fn, i, f, t))
            typsets[k][i].discard('www.w3.org/2002/07/owl#Thing'
                                  )  # messes up results. its always a thing
    t2 = time()
    print('files read in %.1f seconds or %.1f minutes' % (t2 - t1,
                                                          (t2 - t1) / 60),
          file=sys.stderr)
    # some statistics on columns and values (in the sets): number columns, number of values, length of values
    nc = sum([len(tabs[k]) for k in tabs])
    nv = sum([len(tabs[k][i]) for k in tabs for i in tabs[k]])
    lv = sum([len(s) for k in tabs for i in tabs[k] for s in tabs[k][i]])
    print(
        'number of tables: %d  cols: %d  avg col size (set): %.1f  avg item len: %.1f'
        % (len(tabs), nc, nv / nc, lv / nv))
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('Import Errors:', ne))
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('All Tables:', len(tabs)))
    fout.write("insert into cnts (desc, cnt) values ('%s', %d);\n" %
               ('All Columns:', nallcols))
    # print("brk:", brk) # no gain
    # print('sizeof tabs %.3f GB  sizeof typsets %.3f GB' % (sys.getsizeof(tabs)/GB, sys.getsizeof(typsets)/GB,), file=sys.stderr)
    # value and type counts, most frequent only
    for k in sorted(vcnt, key=vcnt.get, reverse=True)[:100]:
        fout.write(
            "insert into vcnt (val, nv, nt, typs) values ('%s', %d, %d, '%s');\n"
            % (k.replace("'", ""), vcnt[k], len(types[k]), ', '.join(
                list(types[k])[:5])))
    # overall statistics on values and types
    nv, wt = sum([vcnt[k] for k in vcnt
                  ]), sum([vcnt[k] * int(len(types[k]) > 0) for k in vcnt])
    fout.write(
        "insert into val (desc, nvals, wtyp, frac) values ('%s', %d, %d, %f);\n"
        % ('Values', nv, wt, wt / nv))
    uv, wt = len(vcnt), sum([int(len(types[k]) > 0) for k in vcnt])
    fout.write(
        "insert into val (desc, nvals, wtyp, frac) values ('%s', %d, %d, %f);\n"
        % ('Unique', uv, wt, wt / uv))
    #
    if '--refs' in sys.argv:
        print("finding reference columns..", file=sys.stderr)
        num_cpus = 8  # psutil.cpu_count(logical=False) # weird problems with that
        print("cpus:", num_cpus)
        print("init ray..", file=sys.stderr)
        # Starting Ray with .. GiB memory available for workers and up to .. GiB for objects.
        # ray.init(memory=<bytes>, object_store_memory=<bytes>).
        ray.init(num_cpus=num_cpus,
                 memory=9 * 1024 * 1024 * 1024,
                 object_store_memory=45 * 1024 * 1024 * 1024)
        print("put data into shared mem..", file=sys.stderr)
        tabs_id = ray.put(tabs)
        nval_id = ray.put(nval)
        stats_id = ray.put(stats)
        typsets_id = ray.put(typsets)
        sam_id = ray.put(sam)
        print("start parallel..", file=sys.stderr)
        # split task by assigning lists of keys in tabs to check. this will block until all are ready.
        sql = ray.get([
            refcols.remote(tabs_id, nval_id, stats_id, typsets_id, sam_id, tx)
            for tx in np.array_split(list(tabs.keys()), num_cpus)
        ])
        print("parallel section done.", file=sys.stderr)
        # write to file here, NOT in individual tasks
        for s in sql:
            fout.write(s)
        t3 = time()
        print('references done in %.1f seconds or %.1f minutes' %
              (t3 - t2, (t3 - t2) / 60),
              file=sys.stderr)
    #
    if '--lsh' in sys.argv:
        # locality sensitive hashing for minHashes of column value sets
        print('build hashes..', file=sys.stderr)
        # lsh = MinHashLSH(threshold=0.75, num_perm=128)
        lsh = MinHashLSHEnsemble(threshold=0.9, num_perm=128, num_part=32)
        mh = {}
        for k in tabs:
            mh[k] = {}
            for i in tabs[k]:
                mh[k][i] = MinHash(num_perm=128)
                for d in tabs[k][i]:
                    mh[k][i].update(d.encode('utf8'))
        #   lsh.insert((k,i), mh[k][i])
        lsh.index([((k, i), mh[k][i], len(tabs[k][i])) for k in tabs
                   for i in tabs[k]])
        t4 = time()
        print('hashes built in %.1f seconds or %.1f minutes' %
              (t4 - t3, (t4 - t3) / 60),
              file=sys.stderr)
        # similar cols according to lsh
        print("query lsh..")
        for k in tabs:
            for i in tabs[k]:
                #     for l, j in lsh.query(mh[k][i]):
                for l, j in lsh.query(mh[k][i], len(tabs[k][i])):
                    if l == k: continue  # same table
                    # l,j is the parent!
                    # min size for ref tab
                    if len(tabs[l][j]) < 10: continue
                    # selectivity must be 1
                    if len(tabs[l][j]) != nval[l][j]: continue
                    if len(tabs[k][i]) < 10 and len(tabs[l][j]) < 10:
                        print("TABS", k, i, tabs[k][i])
                        print("SIMT", l, j, tabs[l][j])
                    fout.write(
                        "insert into lsh (k, i, l, j) values (%d, %d, %d, %d);\n"
                        % (l, j, k, i))
    else:
        t4 = time()
        # need content for query?
        # fout.write("insert into lsh (k, i, l, j) values (%d, %d, %d, %d);\n" % (0, 0, 0, 0))
    t5 = time()
    print('hash queries done in %.1f seconds or %.1f minutes' %
          (t5 - t4, (t5 - t4) / 60),
          file=sys.stderr)
    #
    fout.write("commit;\n")
    fout.close()
    tsfile.close()
    print('total run time     %.1f seconds or %.1f minutes' % (t5 - t0,
                                                               (t5 - t0) / 60),
          file=sys.stderr)
    sys.exit(0)
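The script also relies on a few small helpers (isnumber, cleanfn, err) that are not shown on this page. Minimal sketches of what they could look like, assuming err simply writes a row into the err table via the global fout handle:

def isnumber(x):
    # True if the string parses as a number; such values are dropped from columns.
    try:
        float(x.replace(',', '.'))
        return True
    except ValueError:
        return False

def cleanfn(fn):
    # Escape single quotes so filenames are safe inside SQL string literals.
    return fn.replace("'", "''")

def err(typ, msg, fn, col, src='obd'):
    # Record an import/parse error in the err table.
    fout.write("insert into err (typ, msg, fn, col, src) values ('%s', '%s', '%s', %d, '%s');\n"
               % (typ, msg, cleanfn(fn), col, src))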