Example #1
 def _test_calc_big(self, devices):
     numpy.random.seed(0)
     data = numpy.random.randint(0, 100, (6400, 130))
     mask = numpy.random.randint(0, 5, data.shape)
     data *= (mask >= 4)
     del mask
     bgen = WeightedMinHashGenerator(data.shape[-1])
     gen = libMHCUDA.minhash_cuda_init(data.shape[-1], 128, devices=devices, verbosity=2)
     libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs, bgen.betas)
     m = csr_matrix(data, dtype=numpy.float32)
     print(m.nnz / (m.shape[0] * m.shape[1]))
     ts = time()
     hashes = libMHCUDA.minhash_cuda_calc(gen, m)
     print("libMHCUDA:", time() - ts)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertEqual(hashes.shape, (len(data), 128, 2))
     ts = time()
     true_hashes = numpy.array([bgen.minhash(line).hashvalues for line in data],
                               dtype=numpy.uint32)
     print("datasketch:", time() - ts)
     self.assertEqual(true_hashes.shape, (len(data), 128, 2))
     try:
         self.assertTrue((hashes == true_hashes).all())
     except AssertionError as e:
         for r in range(hashes.shape[0]):
             if (hashes[r] != true_hashes[r]).any():
                 print("first invalid row:", r)
                 print(hashes[r])
                 print(true_hashes[r])
                 break
         raise e from None
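The test above works because minhash_cuda_assign_vars copies the generator's random samples (rs, ln_cs, betas) to the GPU, so libMHCUDA and datasketch evaluate the exact same hash functions and their outputs can be compared bit for bit. A minimal sketch of that pattern (assuming both libraries are installed; dim is a placeholder):

import libMHCUDA
from datasketch import WeightedMinHashGenerator

dim = 130  # must match the column count of the matrix you will hash
bgen = WeightedMinHashGenerator(dim, sample_size=128)
gen = libMHCUDA.minhash_cuda_init(dim, 128, devices=1)
# reuse datasketch's samples so both implementations agree exactly
libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs, bgen.betas)
# ... hash a scipy.sparse.csr_matrix with libMHCUDA.minhash_cuda_calc(gen, m) ...
libMHCUDA.minhash_cuda_fini(gen)  # always release the device state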
Example #2
 def __init__(self, fileName):
     self.wlWinN = 200
     self.wlLagN = 10
     self.fqWinN = 200
     self.fqLagN = 10
     self.fqRspN = 32
     self.wlRspN = 32
     self.selmax = 50
     self.wl_x_level = 3
     self.vectLen = self.fqRspN * self.wlRspN
     self.wmg = WeightedMinHashGenerator(self.vectLen,
                                         sample_size=2,
                                         seed=12)
     self.sta = False
     self.sphs = self.fqLagN * self.wlLagN / 100
     self.GetSta(fileName, force=True)
     for fn in fileName:
         self.dt = self.GetData(fn)[:500000]  # fixed: was GetData(fileName); read each file, keep the last
     self.datalen = int(
         (len(self.dt) - self.wlLagN * self.wlWinN - self.fqWinN) /
         self.fqLagN / self.wlLagN)
     #tempdata=self.GetData("D:/Weiyuan/templates/2015318092941.s15.z")
     tpd = np.load("template.npz")
     tempdata = tpd['c']
     for itrx in range(6):
         itr = np.random.randint(100)
         start = itrx * 50000 + 127
         end = start + len(tempdata[itr])
         #print(start,end,itr,np.shape(tempdata[itr]),np.shape(self.dt[start:end]))
         self.dt[start:end] = self.dt[start:end] + tempdata[itr] * 0.1
     x = np.linspace(0, 5000, len(self.dt))
     plt.plot(x, self.dt)
     plt.show()
Example #3
    def GetFingerPoint(self, hashbit):
        import simhash
        import mynilsimsa
        from datasketch import WeightedMinHashGenerator
        """
        schar=[]
        for iy in range(len(self.wlData)):
            tsc=[]
            for ix in range(len(self.wlData[0])):
                if(self.wlData[iy,ix]==1):
                    tsc.append('a')
                elif(self.wlData[iy,ix]==-1):
                    tsc.append('c')
                else:
                    tsc.append('d')
            schar.append(tsc)
      
        for cr in schar: 
            hh=simhash.simhash(''.join(cr),hashbits=hashbit)
            self.hash.append(hh.hash)

        """
        wmg = WeightedMinHashGenerator(len(self.wlData[0]),
                                       sample_size=2,
                                       seed=12)
        for tr in self.wlData:
            try:
                wm = wmg.minhash(tr)  # wm is a WeightedMinHash
                vl = np.transpose(wm.hashvalues)
                vl = vl[0]
                self.hash.append(vl.tolist())
            except Exception:
                # log vectors that datasketch's minhash() rejects
                print(tr)
Example #4
    def weighed_min_hash(self,
                         num_perm=64,
                         seed=42,
                         use_components=None,
                         type_option=None,
                         n_char=None,
                         n_word=None,
                         npz=None,
                         isrequest=False):
        """ """

        if npz:
            self.options = type_option
            self.num_perm = num_perm

            n = n_char if type_option == 'char' else n_word
            self.features['weighed_{}_{}_{}minhash'.format(use_components[0], type_option[0], n[0])] = \
                np.load(npz)['min_hash']
            return self

        use_components = use_components or ['name']
        type_option = type_option or ['char']
        n_char = n_char or [3]
        n_word = n_word or [1]

        if 'char' not in type_option and 'word' not in type_option:
            assert False, "Check the value of the type_option parameter."

        if 'name' not in use_components and 'addr' not in use_components:
            assert False, "Check the value of the use_components parameter."

        self.options = type_option
        self.num_perm = num_perm

        for i in use_components:
            for j in type_option:
                n_list = n_char if j == 'char' else n_word
                for n in n_list:
                    wmg = WeightedMinHashGenerator(len(
                        self.features['tf_idf_{}_{}_{}grams'.format(i, j,
                                                                    n)][0]),
                                                   sample_size=num_perm,
                                                   seed=seed)
                    help_list = []
                    for vector in self.features['tf_idf_{}_{}_{}grams'.format(
                            i, j, n)]:
                        if np.all(vector == 0):
                            vector[0] = 0.000001  # Small kludge: avoid an all-zero vector.
                        help_list.append(wmg.minhash(vector))
                    self.features['weighed_{}_{}_{}minhash'.format(
                        i, j, n)] = np.array(help_list)
                    file_path = 'data/min_hash_dadata/{}_{}_{}_weighed_minhash.npz'.format(
                        i, j, n)
                    if not isrequest:
                        np.savez_compressed(file_path,
                                            min_hash=np.array(help_list))

                    del self.features['tf_idf_{}_{}_{}grams'.format(i, j, n)]

        return self
Example #5
 def test_calc_tiny(self):
     v1 = [
         1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10,
         4
     ]
     v2 = [
         2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0,
         0
     ]
     bgen = WeightedMinHashGenerator(len(v1))
     gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, verbosity=2)
     libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs,
                                        bgen.betas)
     m = csr_matrix(numpy.array([v1, v2], dtype=numpy.float32))
     hashes = libMHCUDA.minhash_cuda_calc(gen, m)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertEqual(hashes.shape, (2, 128, 2))
     true_hashes = numpy.array(
         [bgen.minhash(v1).hashvalues,
          bgen.minhash(v2).hashvalues],
         dtype=numpy.uint32)
     self.assertEqual(true_hashes.shape, (2, 128, 2))
     try:
         self.assertTrue((hashes == true_hashes).all())
     except AssertionError as e:
         print("---- TRUE ----")
         print(true_hashes)
         print("---- FALSE ----")
         print(hashes)
         raise e from None
Example #6
 def avoid_unkown_error(candidate_eids, word_vectors):
     wmg = WeightedMinHashGenerator(word_vectors.shape[1], sample_size=300)
     wm = list()  # Calculating wm takes time!!!
     __candidate_eids = []
     for i, eid in tqdm(enumerate(candidate_eids), total=len(candidate_eids)):
         try:
             wm.append(wmg.minhash(word_vectors[i]))
             __candidate_eids.append(eid)
         except ValueError:
             # skip vectors that datasketch's minhash() rejects
             pass
     return set(__candidate_eids), wmg, wm
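The (wmg, wm) pair is returned so the expensive minhash pass runs only once and can be handed to get_250nearestNeighbors (Example #18 below), which accepts them as optional arguments. A hedged wiring sketch; seedEids, idxByCandEidMap, data, print_info and query_id are assumed to come from the caller:

eids, wmg, wm = avoid_unkown_error(candidate_eids, word_vectors)
neighbors, wmg, wm = get_250nearestNeighbors(
    seedEids, list(eids), word_vectors, idxByCandEidMap,
    data, print_info, query_id, choice=1, wmg=wmg, wm=wm)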
Example #7
 def test_pickle(self):
     forest = MinHashLSHForest()
     mg = WeightedMinHashGenerator(10)
     m1 = mg.minhash(np.random.uniform(1, 10, 10))
     m2 = mg.minhash(np.random.uniform(1, 10, 10))
     forest.add("a", m1)
     forest.add("b", m2)
     forest.index()
     forest2 = pickle.loads(pickle.dumps(forest))
     result = forest2.query(m1, 2)
     self.assertTrue("a" in result)
     self.assertTrue("b" in result)
Example #8
 def test__H(self):
     '''
     Check that _H outputs a consistent byte length
     given the same concatenated hash value size
     '''
     mg = WeightedMinHashGenerator(100, sample_size=128)
     for l in range(2, mg.sample_size+1, 16):
         m = mg.minhash(np.random.randint(1, 99999999, 100))
         forest = MinHashLSHForest(num_perm=128, l=l)
         forest.add("m", m)
         sizes = [len(H) for ht in forest.hashtables for H in ht]
         self.assertTrue(all(sizes[0] == s for s in sizes))
Example #9
def tiny_test():
    v1 = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
    v2 = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]

    bgen = WeightedMinHashGenerator(len(v1), 128, 1)

    write_csv_float("tiny-rs.csv", bgen.rs)
    write_csv_float("tiny-ln_cs.csv", bgen.ln_cs)
    write_csv_float("tiny-betas.csv", bgen.betas)

    write_csv_int("tiny-data.csv", [v1, v2])
    write_csv_int("tiny-hashes-0.csv", bgen.minhash(v1).hashvalues)
    write_csv_int("tiny-hashes-1.csv", bgen.minhash(v2).hashvalues)
Example #10
def run_perf(dim, num_rep, sample_size):
    wmg = WeightedMinHashGenerator(dim, sample_size=sample_size)
    logging.info("WeightedMinHash using %d samples" % sample_size)
    data = np.random.uniform(0, dim, (num_rep, dim))
    durs = []
    for i in range(num_rep):
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        wmg.minhash(data[i])
        duration = (time.perf_counter() - start) * 1000
        durs.append(duration)
    ave = np.mean(durs)
    logging.info("Generated %d minhashes, average time %.4f ms" % (num_rep, ave))
    return ave
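A possible driver for this benchmark (the values are illustrative, not from the original code; it assumes the module-level imports of time, logging and numpy as np that run_perf relies on):

import logging
logging.basicConfig(level=logging.INFO)
for s in (16, 64, 256):
    run_perf(dim=500, num_rep=100, sample_size=s)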
Example #12
 def __init__(self, staName, force=False):
     self.wlWinN = 300
     self.wlLagN = 5
     self.fqWinN = 300
     self.fqLagN = 5
     self.fqRspN = 32
     self.wlRspN = 32
     self.selmax = 90
     self.wl_x_level = 3
     self.vectLen = self.fqRspN * self.wlRspN
     self.wmg = WeightedMinHashGenerator(self.vectLen, sample_size=2, seed=12)
     self.sta = False
     self.sphs = self.fqLagN * self.wlLagN / 100
     self.GetSta(staName, force=force)
Example #13
def run_acc(dim, num_rep, sample_size):
    logging.info("WeightedMinHash using %d samples" % sample_size)
    wmg = WeightedMinHashGenerator(dim, sample_size=sample_size)
    data1 = np.random.uniform(0, dim, (num_rep, dim))
    data2 = np.random.uniform(0, dim, (num_rep, dim))
    errs = []
    for i in range(num_rep):
        wm1 = wmg.minhash(data1[i])
        wm2 = wmg.minhash(data2[i])
        j_e = wm1.jaccard(wm2)
        j = jaccard(data1[i], data2[i])
        errs.append(abs(j - j_e))
    ave = np.mean(errs)
    logging.info("%d runs, mean error %.4f" % (num_rep, ave))
    return ave
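Each of the sample_size hash components matches with probability equal to the weighted Jaccard similarity, so the mean error reported here should shrink roughly as 1/sqrt(sample_size). A sketch of sweeping that trade-off (illustrative values; run_acc also assumes a jaccard() helper that computes the exact similarity):

for s in (16, 64, 256, 1024):
    run_acc(dim=500, num_rep=50, sample_size=s)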
Example #15
 def test_deferred(self):
     v1 = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
     v2 = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
     gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, verbosity=2)
     vars = libMHCUDA.minhash_cuda_retrieve_vars(gen)
     libMHCUDA.minhash_cuda_fini(gen)
     gen = libMHCUDA.minhash_cuda_init(
         len(v1), 128, devices=1, deferred=True, verbosity=2)
     libMHCUDA.minhash_cuda_assign_vars(gen, *vars)
     bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
     bgen.dim = len(v1)
     bgen.rs, bgen.ln_cs, bgen.betas = vars
     bgen.sample_size = 128
     bgen.seed = None
     m = csr_matrix(numpy.array([v1, v2], dtype=numpy.float32))
     hashes = libMHCUDA.minhash_cuda_calc(gen, m)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertEqual(hashes.shape, (2, 128, 2))
     true_hashes = numpy.array([bgen.minhash(v1).hashvalues,
                                bgen.minhash(v2).hashvalues], dtype=numpy.uint32)
     self.assertEqual(true_hashes.shape, (2, 128, 2))
     try:
         self.assertTrue((hashes == true_hashes).all())
     except AssertionError as e:
         print("---- TRUE ----")
         print(true_hashes)
         print("---- FALSE ----")
         print(hashes)
         raise e from None
Example #16
 def test_float(self):
     v1 = [
         0,          1.0497366,  0.8494359,  0.66231006, 0.66231006, 0.8494359,
         0,          0.66231006, 0.33652836, 0,          0,          0.5359344,
         0.8494359,  0.66231006, 1.0497366,  0.33652836, 0.66231006, 0.8494359,
         0.6800841,  0.33652836]
     gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, seed=7, verbosity=2)
     vars = libMHCUDA.minhash_cuda_retrieve_vars(gen)
     bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
     bgen.dim = len(v1)
     bgen.rs, bgen.ln_cs, bgen.betas = vars
     bgen.sample_size = 128
     bgen.seed = None
     m = csr_matrix(numpy.array(v1, dtype=numpy.float32))
     hashes = libMHCUDA.minhash_cuda_calc(gen, m).astype(numpy.int32)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertEqual(hashes.shape, (1, 128, 2))
     true_hashes = numpy.array([bgen.minhash(v1).hashvalues], dtype=numpy.int32)
     self.assertEqual(true_hashes.shape, (1, 128, 2))
     try:
         self.assertTrue((hashes == true_hashes).all())
     except AssertionError as e:
         print("---- TRUE ----")
         print(true_hashes)
         print("---- FALSE ----")
         print(hashes)
         raise e from None
Example #17
def big_test():
    numpy.random.seed(0)
    data = numpy.random.randint(0, 100, (6400, 130))
    mask = numpy.random.randint(0, 5, data.shape)
    data *= (mask >= 4)
    del mask
    bgen = WeightedMinHashGenerator(data.shape[-1])

    write_csv_float("big-rs.csv", bgen.rs)
    write_csv_float("big-ln_cs.csv", bgen.ln_cs)
    write_csv_float("big-betas.csv", bgen.betas)

    write_csv_int("big-data.csv", data)

    for c, line in enumerate(data):
        write_csv_int("big-hashes-%d.csv" % c, bgen.minhash(line).hashvalues)
Example #18
 def get_250nearestNeighbors(seedEids,
                             candidate_eids,
                             word_vectors,
                             idxByCandEidMap,
                             data,
                             print_info,
                             query_id,
                             choice,
                             wmg=None,
                             wm=None):
     # choice == 1: return 250 nearest Neighbors based on Jaccard Similarity with seeds.
     # choice == 2: return 250 nearest Neighbors based on word embeddings.
     start = time.time()
     if wmg is None or wm is None:
         wmg = WeightedMinHashGenerator(word_vectors.shape[1],
                                        sample_size=300)
         wm = list()  # Calculating wm takes time!!!
         for i in range(len(candidate_eids)):
             wm.append(wmg.minhash(word_vectors[i]))
     distToSeedsByEid = get_distToSeedsByEid(candidate_eids, wm, seedEids,
                                             idxByCandEidMap, data, choice)
     nearestNeighbors = []
     ct = 0
     for eid in sorted(distToSeedsByEid,
                       key=distToSeedsByEid.__getitem__,
                       reverse=True):
         if ct >= 250:
             break
         if eid not in seedEids:
             nearestNeighbors.append(eid)
             ct += 1
     assert ct >= 250
     # print 'Nearest Neighbors are: '
     # for i in nearestNeighbors:
     #     print eidToEntityMap[i],
     # print ' '
     end = time.time()
     print(
         '[utils.py] Done finding 250 nearest neighbors using %.1f seconds'
         % (end - start))
     print_info[query_id] += (
         '[utils.py] Done finding 250 nearest neighbors using %.1f seconds\n'
         % (end - start))
     return nearestNeighbors, wmg, wm
Example #19
def main():
    data1 = ['this', 'is', 'a', 'pen']
    data2 = ['that', 'is', 'a', 'pen']
    data3 = ['it', 'is', 'a', 'pon']

    m1 = get_minhash(data1)
    m2 = get_minhash(data2)
    m3 = get_minhash(data3)

    print('minhash:')
    print(m1.jaccard(m2))
    print(m1.jaccard(m3))
    print(m2.jaccard(m3))
    print()

    v1 = [1, 2, 3, 4, 5, 6]
    v2 = [2, 5, 7, 9, 11, 13]
    v3 = [1, 2, 3, 4, 5, 7]
    wmg = WeightedMinHashGenerator(len(v1))
    wm1 = wmg.minhash(v1)
    wm2 = wmg.minhash(v2)
    wm3 = wmg.minhash(v3)

    print('weighted minhash:')
    print(wm1.jaccard(wm2))
    print(wm1.jaccard(wm3))
    print(wm2.jaccard(wm3))
    print()

    data = data1 + data2 + data3
    hll = get_hyperloglog(data)
    print('hyperloglog:')
    print(hll.count())
    print()

    hpp = get_hyperloglog_pp(data)
    print('hyperloglog++:')
    print(hpp.count())
    print()

    print('done')
Example #20
    def test_insert(self):
        forest = MinHashLSHForest()
        mg = WeightedMinHashGenerator(10)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        forest.add("a", m1)
        forest.add("b", m2)

        self.assertTrue(forest.is_empty())
        for t in forest.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)
        self.assertTrue("a" in forest)
        self.assertTrue("b" in forest)
        for i, H in enumerate(forest.keys["a"]):
            self.assertTrue("a" in forest.hashtables[i][H])

        forest.index()
        self.assertFalse(forest.is_empty())

        mg = WeightedMinHashGenerator(10, 5)
        m3 = mg.minhash(np.random.uniform(1, 10, 10))
        self.assertRaises(ValueError, forest.add, "c", m3)
Example #21
 def run_test(v):
     k = sum([len(part) for part in v])
     bgen = WeightedMinHashGenerator(k)  # fixed: k is an int, len(k) would raise TypeError
     gen = libMHCUDA.minhash_cuda_init(k, 128, devices=4, verbosity=2)
     libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs, bgen.betas)
     m = csr_matrix(numpy.array(v, dtype=numpy.float32))
     hashes = None
     try:
         hashes = libMHCUDA.minhash_cuda_calc(gen, m)
     finally:
         self.assertIsNotNone(hashes)
         self.assertEqual(hashes.shape, (1, 128, 2))
         libMHCUDA.minhash_cuda_fini(gen)
Example #22
 def __init__(self, staName, force=False):
     self.wlWinN = 200
     self.wlLagN = 5
     self.fqWinN = 100
     self.fqLagN = 1
     self.fqRspN = 16
     self.wlRspN = 8
     self.selmax = 50
     self.wl_x_level = 3
     self.cycle = 0
     self.num_perm = 20
     self.vectLen = self.fqRspN * self.wlRspN
     self.wmg = WeightedMinHashGenerator(self.selmax,
                                         sample_size=10,
                                         seed=12)
     self.sta = False
     self.sphs = self.fqLagN * self.wlLagN / 100
     self.GetSta(staName, force=force)
Example #23
    def test_query(self):
        forest = MinHashLSHForest()
        mg = WeightedMinHashGenerator(10)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        forest.add("a", m1)
        forest.add("b", m2)
        forest.index()
        result = forest.query(m1, 2)
        self.assertTrue("a" in result)
        self.assertTrue("b" in result)

        mg = WeightedMinHashGenerator(10, 5)
        m3 = mg.minhash(np.random.uniform(1, 10, 10))
        self.assertRaises(ValueError, forest.query, m3, 1)
Example #24
    def compare(self, num_bits):
        """compares results from different methods"""
        self.wmg = WeightedMinHashGenerator(self.num_col,
                                            sample_size=num_bits,
                                            seed=self.rnd_seed)

        results = defaultdict(lambda: np.zeros((self.params['test_size'], )))

        combs = self.prepare_test_set()

        start = time()
        for i, comb in enumerate(combs):
            title_x, title_y = comb
            arr_x = self.dataset.loc[title_x].values
            arr_y = self.dataset.loc[title_y].values

            # ground truth
            results['gt'][i] = cosine_similarity(arr_x.reshape(1, -1),
                                                 arr_y.reshape(1, -1))[0][0]

            results['minhash'][i] = self.compute_jacc(arr_x, arr_y,
                                                      'datasketch')

            # neural hashes
            hash_x = self.neural_hashes.loc[title_x].values
            hash_y = self.neural_hashes.loc[title_y].values

            results['neural'][i] = self.compute_jacc(hash_x, hash_y, 'sklearn')

            if i % 100 == 0:
                print(f'Completed {i}/{self.params["test_size"]} samples.',
                      end='\r')

        print(f'Computing similarity took {time() - start} seconds.')

        scores = dict()
        scores['minhash'] = self.compute_mae(results['gt'], results['minhash'])
        scores['neural'] = self.compute_mae(results['gt'], results['neural'])

        return scores
Example #25
def getICWSGenerator(k: int, s: int):
    from datasketch import WeightedMinHashGenerator
    return WeightedMinHashGenerator(4 ** k, sample_size=s)
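The 4 ** k dimension suggests vectors indexed by DNA k-mers over the four-letter alphabet. A hedged usage sketch under that assumption, counting k-mers into a dense vector before hashing (kmer_vector is illustrative, not from the original code):

from itertools import product

def kmer_vector(seq, k):
    # map each k-mer over {A, C, G, T} to a slot in a 4**k-dimensional count vector
    index = {''.join(p): i for i, p in enumerate(product('ACGT', repeat=k))}
    v = [0] * (4 ** k)
    for i in range(len(seq) - k + 1):
        v[index[seq[i:i + k]]] += 1
    return v

wmg = getICWSGenerator(3, 128)
wm = wmg.minhash(kmer_vector('ACGTACGTAC', 3))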
Example #26
# import libraries
import pandas as pd
from datasketch import WeightedMinHashGenerator
from datasketch import MinHashLSHForest

# read the pickle file that was created by 'tfidf.py' script
# this file contains the weighted representation of all products in 'Bestprice' dataset
df_tfidf = pd.read_pickle('data/tfidfs/df_tfidf_brand_1-0.pkl')

# create an extra column with the minhash id (m1, m2 etc)
df_tfidf['Minhash_id'] = df_tfidf['doc_id'].apply(lambda x: 'm'+str(x))

# create a WeightedMinHashGenerator object with the appropriate arguments
# dim: dimension - the number of unique terms
# sample_size: number of samples (similar to number of permutation functions in MinHash)
mg = WeightedMinHashGenerator(dim=35405, sample_size=128)

def create_minhash(doc):
    """
    This function takes the weighted representation of a product and returns its Minhash signature.
    :param doc: The weighted representation of the product
    :return: The Minhash signature of the product as a WeightedMinHash object
    """
    term_ids = doc['term_id']
    tfidfs = doc['tfidf']
    tfidf_list = [0] * 35405

    for i, term_id in enumerate(term_ids):
        tfidf_list[term_id] = tfidfs[i]
    # the original snippet was truncated here; presumably the dense vector is hashed:
    return mg.minhash(tfidf_list)
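A plausible continuation (assumed; the original snippet stops at this point) would build one signature per product and index it in the MinHashLSHForest imported above, keyed by the Minhash_id column:

df_tfidf['Minhash'] = df_tfidf.apply(create_minhash, axis=1)
forest = MinHashLSHForest(num_perm=128)  # must match the generator's sample_size
for mid, mh in zip(df_tfidf['Minhash_id'], df_tfidf['Minhash']):
    forest.add(mid, mh)
forest.index()  # the forest only answers queries after indexing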
Example #27
from datasketch import WeightedMinHashGenerator
import numpy as np

v1 = [1, 3, 4, 5, 6, 7, 8, 9, 10, 4]
v2 = [2, 4, 3, 8, 4, 7, 10, 9, 0, 0]

min_sum = np.sum(np.minimum(v1, v2))
max_sum = np.sum(np.maximum(v1, v2))
true_jaccard = float(min_sum) / float(max_sum)

wmg = WeightedMinHashGenerator(len(v1))
wm1 = wmg.minhash(v1)
wm2 = wmg.minhash(v2)
print("Estimated Jaccard is", wm1.jaccard(wm2))
print("True Jaccard is", true_jaccard)
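The gap between the estimate and the true Jaccard is governed by sample_size. A quick sketch (reusing v1 and v2 from above) showing the estimate tightening as more samples are drawn:

for s in (16, 128, 1024):
    g = WeightedMinHashGenerator(len(v1), sample_size=s)
    print(s, g.minhash(v1).jaccard(g.minhash(v2)))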
Example #28
class SacFig():
    def GetData(self, file):
        st = pread(file)
        return st[0].data

    def __init__(self, staName, force=False):
        self.wlWinN = 300
        self.wlLagN = 10
        self.fqWinN = 300
        self.fqLagN = 10
        self.fqRspN = 32
        self.wlRspN = 32
        self.selmax = 30
        self.wl_x_level = 3
        self.vectLen = self.fqRspN * self.wlRspN
        self.wmg = WeightedMinHashGenerator(self.vectLen,
                                            sample_size=2,
                                            seed=12)
        self.sta = False
        self.sphs = self.fqLagN * self.wlLagN / 100
        self.GetSta(staName, force=force)
        #tempdata=self.GetData("D:/Weiyuan/templates/2015318092941.s15.z")
    def GetHash(self):
        self.hash = []
        step = self.fqLagN * self.wlLagN
        for itr in range(self.datalen):
            data = self.STFT(self.dt, itr * step)
            data = self.WAVELET(data, self.wl_x_level)
            data = self.REGU(data)
            data = self.TRIM(data, self.selmax)
            data = self.FIG(data)
            self.hash.append(data)
        return self.hash

    def GetHashTofile(self, fileName, outfile, force=False):
        ext = [
            DIR + "s28/2015336/2015336_00_00_00_s28_BHZ.SAC",
            DIR + "s28/2015336/2015336_00_00_00_s28_BHN.SAC",
            DIR + "s28/2015336/2015336_00_00_00_s28_BHE.SAC"
        ]
        dt = []
        if (len(self.GetData(fileName[0])) < 10000):
            #print(len(self.GetData(fileName[0])<10000))
            for fn in range(len(fileName)):
                ddt = self.GetData(ext[fn])[:10000]
                cp = len(self.GetData(fileName[fn]))
                ddt[:cp] = ddt[:cp] + np.zeros(
                    [cp])  #+self.GetData(fileName[fn])
                dt.append(ddt)
        else:
            for fn in fileName:
                dt.append(self.GetData(fn)[:500000])
        datalen = int((len(dt[0]) - self.wlLagN * self.wlWinN - self.fqWinN) /
                      self.fqLagN / self.wlLagN)
        file = open(outfile, "w")
        step = self.fqLagN * self.wlLagN
        for itr in range(datalen):
            data = self.STFT(dt, itr * step)
            data = self.WAVELET(data, 3)
            data = self.REGU(data)
            data = self.TRIM(data, self.selmax)
            try:
                data = self.FIG(data)
            except:
                data = [0, 0]
            tm = itr * self.sphs
            file.write("%f," % (tm))
            for val in data:  # renamed from itr: the inner loop shadowed the window index
                file.write("%d," % val)
            file.write("\n")
        file.close()

    def GetSta(self, fileName, force=False):
        if force:
            self.sta = False
            if os.path.exists("stat.out.npz"):
                os.remove("stat.out.npz")
        if self.sta:
            return
        if os.path.exists("stat.out.npz"):
            data = np.load("stat.out.npz")
            self.mu = data['a']
            self.sigma = data['b']
            self.sta = True
            return
        dt_for_sta = np.zeros([4000, self.vectLen])
        dt = []
        for fn in fileName:
            dt.append(self.GetData(fn))
        for idx in range(4000):
            data = self.STFT(dt, idx * 1000)
            dt_for_sta[idx, :] = self.WAVELET(data, 3)
        self.mu = np.average(dt_for_sta, axis=0)
        self.sigma = np.std(dt_for_sta, axis=0)
        np.savez("stat.out.npz", a=self.mu, b=self.sigma)
        self.sta = True

    def STFT(self, xx, idx):
        fqWin = int(self.fqWinN / 2)
        sumx = np.zeros([self.wlWinN, fqWin])
        tpx = np.zeros([self.wlWinN, self.fqWinN])
        for itx in xx:
            w = np.hanning(self.fqWinN)
            tpx[:, :] = np.zeros([self.wlWinN, self.fqWinN])
            for ii in range(self.wlWinN):
                start = ii * self.fqLagN + idx
                tpx[ii, :] = itx[start:start + self.fqWinN]
            X = scipy.fft.fft(tpx, axis=1)  # modern SciPy: fft is a submodule, no longer callable directly
            X = (X * w)[:, :fqWin]
            X = np.square(np.abs(X))
            sumx = np.add(sumx, X)
        sumx = np.sqrt(sumx)
        sumx = resample(sumx, self.wlRspN, axis=0)
        sumx = resample(sumx, self.fqRspN, axis=1)
        return sumx

    def WAVELET(self, data, level):
        outdt = pywt.wavedec2(data, 'haar', level=level)
        out = np.zeros([self.vectLen])
        idx = 0
        for itr in outdt:
            evaldt = np.reshape(itr, [-1])
            cit = len(evaldt)
            out[idx:idx + cit] = evaldt[:]
            idx += cit
        out2 = np.sqrt(np.sum(np.square(out)))
        out = np.divide(out, out2)
        return out

    def REGU(self, data):
        rdata = data - self.mu
        rdata = np.divide(rdata, self.sigma)
        return rdata

    def TRIM(self, data, nlarge):
        absdata = np.abs(data)
        large = heapq.nlargest(nlarge,
                               range(len(data)),
                               key=lambda x: absdata[x])
        lst = np.zeros_like(data)
        for itr in large:
            lst[itr] = data[itr] / absdata[itr]
        return lst

    def FIG(self, tr):
        wm = self.wmg.minhash(tr)  # wm is a WeightedMinHash
        vl = np.transpose(wm.hashvalues)
        vl = vl[0]
        return vl.tolist()
Example #29
def create_weighted_minhash(data):
    # assumes a module-level `wmg = None` so the generator is built lazily on first use
    global wmg
    if wmg is None:
        wmg = WeightedMinHashGenerator(len(data), 128, seed=12)
    minhash = wmg.minhash(data)
    return minhash
Example #30
    def __init__(self,
                 tarfile_ortho=None,
                 h5_oma=None,
                 taxa=None,
                 masterTree=None,
                 saving_name=None,
                 numperm=256,
                 treeweights=None,
                 taxfilter=None,
                 taxmask=None,
                 verbose=False):
        if h5_oma is not None:
            from pyoma.browser import db
            self.h5OMA = h5_oma
            self.db_obj = db.Database(h5_oma)
            self.oma_id_obj = db.OmaIdMapper(self.db_obj)

        elif tarfile_ortho:
            self.tar = tarfile_ortho
            self.h5OMA = None
            self.db_obj = None
            self.oma_id_obj = None

        self.tax_filter = taxfilter
        self.tax_mask = taxmask
        self.verbose = verbose
        self.datetime = datetime
        self.date_string = "{:%B_%d_%Y_%H_%M}".format(datetime.now())
        self.saving_name = saving_name

        #original_umask = os.umask(0)

        if saving_name:
            self.saving_path = config_utils.datadir + saving_name + '/'
            if not os.path.isdir(self.saving_path):
                os.mkdir(path=self.saving_path)
        else:

            self.saving_path = config_utils.datadir + self.date_string + '/'
            if not os.path.isdir(self.saving_path):

                os.mkdir(path=self.saving_path)

        if masterTree is None:
            if h5_oma:
                genomes = pd.DataFrame(
                    h5_oma.root.Genome.read())["NCBITaxonId"].tolist()
                genomes = [str(g) for g in genomes]
                taxa = genomes + [131567, 2759, 2157, 45596] + [
                    taxrel[0] for taxrel in list(h5_oma.root.Taxonomy[:])
                ] + [taxrel[1] for taxrel in list(h5_oma.root.Taxonomy[:])]
                self.tree_string, self.tree_ete3 = files_utils.get_tree(
                    taxa=taxa, genomes=genomes, savename=saving_name)
            elif taxa:
                with open(taxa, 'r') as taxin:
                    taxlist = [int(line) for line in taxin]
                self.tree_string, self.tree_ete3 = files_utils.get_tree(
                    taxa=taxlist, savename=saving_name)
            else:
                raise Exception(
                    'please specify either a list of taxa or a tree')
        elif masterTree:  # fixed: `mastertree` was an undefined name
            with open(masterTree, 'rb') as pklin:  # 'rb': the original 'wb' would truncate the pickle before reading
                self.tree_ete3 = pickle.loads(pklin.read())
                self.tree_string = self.tree_ete3.write(format=1)

        self.taxaIndex, self.reverse = files_utils.generate_taxa_index(
            self.tree_ete3, self.tax_filter, self.tax_mask)
        with open(config_utils.datadir + 'taxaIndex.pkl', 'wb') as taxout:
            taxout.write(pickle.dumps(self.taxaIndex))
        self.numperm = numperm
        if treeweights is None:
            #generate all-ones weights
            self.treeweights = hashutils.generate_treeweights(
                self.tree_ete3, self.taxaIndex, taxfilter, taxmask)
        else:
            #load machine learning weights
            self.treeweights = treeweights
        print(self.treeweights)
        wmg = WeightedMinHashGenerator(3 * len(self.taxaIndex),
                                       sample_size=numperm,
                                       seed=1)
        with open(self.saving_path + saving_name + 'wmg.pkl', 'wb') as wmgout:
            wmgout.write(pickle.dumps(wmg))  # the filename implies the generator is saved here, not taxaIndex

        self.wmg = wmg
        self.HAM_PIPELINE = functools.partial(
            pyhamutils.get_ham_treemap_from_row, tree=self.tree_string)
        self.HASH_PIPELINE = functools.partial(hashutils.row2hash,
                                               taxaIndex=self.taxaIndex,
                                               treeweights=self.treeweights,
                                               wmg=wmg)
        if self.h5OMA:
            self.READ_ORTHO = functools.partial(pyhamutils.get_orthoxml_oma,
                                                db_obj=self.db_obj)
        elif self.tar:
            self.READ_ORTHO = pyhamutils.get_orthoxml
        self.hashes_path = self.saving_path + 'hashes.h5'
        self.lshpath = self.saving_path + 'newlsh.pkl'
        self.lshforestpath = self.saving_path + 'newlshforest.pkl'
        self.mat_path = self.saving_path + 'hogmat.h5'
        self.columns = len(self.taxaIndex)
Example #31
def calcuJaccard(hashSig1, hashSig2):
    # np.float was removed in NumPy 1.24; the builtin float preserves the behaviour
    return float(np.count_nonzero(hashSig1 == hashSig2)) / float(len(hashSig1))


if __name__ == '__main__':

    data1 = [
        'TAG', 'DSA', 'FDV', 'GFG', 'TRE', 'EWE', 'QWW', 'RFR', 'QWE', 'ZAW',
        'CDS', 'NBH'
    ]
    data2 = [
        'TAG', 'DSA', 'FDV', 'GFG', 'TRE', 'EWE', 'QWW', 'RFR', 'SDA', 'ZAW',
        'CDS', 'NBH'
    ]

    m1, m2 = MinHash(), MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

    data3 = [2, 0, 0, 0, 0, 3, 0, 0, 2, 0, 2, 0, 0]
    data4 = [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0]
    weight = WeightedMinHashGenerator(len(data3))
    w1 = weight.minhash(data3)
    w2 = weight.minhash(data4)
    print("Weighted Jac ", w1.jaccard(w2))
Example #32
class SacFig():
    def GetData(self, file):
        st = pread(file)
        return st[0].data

    def __init__(self, fileName):
        self.wlWinN = 200
        self.wlLagN = 10
        self.fqWinN = 200
        self.fqLagN = 10
        self.fqRspN = 32
        self.wlRspN = 32
        self.selmax = 50
        self.wl_x_level = 3
        self.vectLen = self.fqRspN * self.wlRspN
        self.wmg = WeightedMinHashGenerator(self.vectLen,
                                            sample_size=2,
                                            seed=12)
        self.sta = False
        self.sphs = self.fqLagN * self.wlLagN / 100
        self.GetSta(fileName, force=True)
        for fn in fileName:
            self.dt = self.GetData(fn)[:500000]  # fixed: was GetData(fileName); read each file, keep the last
        self.datalen = int(
            (len(self.dt) - self.wlLagN * self.wlWinN - self.fqWinN) /
            self.fqLagN / self.wlLagN)
        #tempdata=self.GetData("D:/Weiyuan/templates/2015318092941.s15.z")
        tpd = np.load("template.npz")
        tempdata = tpd['c']
        for itrx in range(6):
            itr = np.random.randint(100)
            start = itrx * 50000 + 127
            end = start + len(tempdata[itr])
            #print(start,end,itr,np.shape(tempdata[itr]),np.shape(self.dt[start:end]))
            self.dt[start:end] = self.dt[start:end] + tempdata[itr] * 0.1
        x = np.linspace(0, 5000, len(self.dt))
        plt.plot(x, self.dt)
        plt.show()

    def GetHash(self):
        self.hash = []
        step = self.fqLagN * self.wlLagN
        for itr in range(self.datalen):
            data = self.STFT(self.dt, itr * step)
            data = self.WAVELET(data, self.wl_x_level)
            data = self.REGU(data)
            data = self.TRIM(data, self.selmax)
            data = self.FIG(data)
            self.hash.append(data)
        return self.hash

    def GetHashTofile(self, file):
        self.hash = []
        step = self.fqLagN * self.wlLagN
        for itr in range(self.datalen):
            data = self.STFT(self.dt, itr * step)
            data = self.WAVELET(data, 3)
            data = self.REGU(data)
            data = self.TRIM(data, self.selmax)
            data = self.FIG(data)
            self.hash.append(data)
            tm = itr * self.sphs
            file.write("%f," % (tm))
            for itr in data:
                file.write("%d," % itr)
            file.write("\n")

    def GetSta(self, fileName, force=False):
        if force:
            self.sta = False
            if os.path.exists("stat.out.npz"):  # guard: os.remove raises if the file is absent
                os.remove("stat.out.npz")
        if self.sta:
            return
        if os.path.exists("stat.out.npz"):
            data = np.load("stat.out.npz")
            self.mu = data['a']
            self.sigma = data['b']
            self.sta = True
            return
        dt_for_sta = np.zeros([4000, self.vectLen])
        dta = self.GetData(fileName)
        for idx in range(4000):
            dt = self.STFT(dta, idx * 1000)
            dt_for_sta[idx, :] = self.WAVELET(dt, 3)
        self.mu = np.average(dt_for_sta, axis=0)
        self.sigma = np.std(dt_for_sta, axis=0)
        np.savez("stat.out.npz", a=self.mu, b=self.sigma)
        self.sta = True

    def STFT(self, x, idx):
        w = np.hanning(self.fqWinN)
        fqWin = int(self.fqWinN / 2)
        X = np.zeros([self.wlWinN, self.fqWinN])
        for ii in range(self.wlWinN):
            start = ii * self.fqLagN + idx
            X[ii, :] = x[start:start + self.fqWinN] * w
        X = scipy.fft.fft(X, axis=1)[:, :fqWin]  # keep the first fqWin frequency bins; [:fqWin] sliced rows
        X = np.abs(np.array(X))
        X = resample(X, self.wlRspN, axis=0)
        X = resample(X, self.fqRspN, axis=1)
        return X

    def WAVELET(self, data, level):
        outdt = pywt.wavedec2(data, 'haar', level=level)
        out = np.zeros([self.vectLen])
        idx = 0
        for itr in outdt:
            evaldt = np.reshape(itr, [-1])
            cit = len(evaldt)
            out[idx:idx + cit] = evaldt[:]
            idx += cit
        out2 = np.sqrt(np.sum(np.square(out)))
        out = np.divide(out, out2)
        return out

    def REGU(self, data):
        rdata = data - self.mu
        rdata = np.divide(rdata, self.sigma)
        return rdata

    def TRIM(self, data, nlarge):
        absdata = np.abs(data)
        large = heapq.nlargest(nlarge,
                               range(len(data)),
                               key=lambda x: absdata[x])
        lst = np.zeros_like(data)
        for itr in large:
            lst[itr] = data[itr] / absdata[itr]
        return lst

    def FIG(self, tr):
        wm = self.wmg.minhash(tr)  # wm is a WeightedMinHash
        vl = np.transpose(wm.hashvalues)
        vl = vl[0]
        return vl.tolist()