def _test_calc_big(self, devices):
    numpy.random.seed(0)
    data = numpy.random.randint(0, 100, (6400, 130))
    mask = numpy.random.randint(0, 5, data.shape)
    data *= (mask >= 4)
    del mask
    bgen = WeightedMinHashGenerator(data.shape[-1])
    gen = libMHCUDA.minhash_cuda_init(data.shape[-1], 128, devices=devices, verbosity=2)
    libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs, bgen.betas)
    m = csr_matrix(data, dtype=numpy.float32)
    print(m.nnz / (m.shape[0] * m.shape[1]))
    ts = time()
    hashes = libMHCUDA.minhash_cuda_calc(gen, m)
    print("libMHCUDA:", time() - ts)
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (len(data), 128, 2))
    ts = time()
    true_hashes = numpy.array([bgen.minhash(line).hashvalues for line in data],
                              dtype=numpy.uint32)
    print("datasketch:", time() - ts)
    self.assertEqual(true_hashes.shape, (len(data), 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as e:
        for r in range(hashes.shape[0]):
            if (hashes[r] != true_hashes[r]).any():
                print("first invalid row:", r)
                print(hashes[r])
                print(true_hashes[r])
                break
        raise e from None
def __init__(self, fileName):
    self.wlWinN = 200
    self.wlLagN = 10
    self.fqWinN = 200
    self.fqLagN = 10
    self.fqRspN = 32
    self.wlRspN = 32
    self.selmax = 50
    self.wl_x_level = 3
    self.vectLen = self.fqRspN * self.wlRspN
    self.wmg = WeightedMinHashGenerator(self.vectLen, sample_size=2, seed=12)
    self.sta = False
    self.sphs = self.fqLagN * self.wlLagN / 100
    self.GetSta(fileName, force=True)
    for fn in fileName:
        self.dt = self.GetData(fileName)[:500000]
    self.datalen = int((len(self.dt) - self.wlLagN * self.wlWinN - self.fqWinN)
                       / self.fqLagN / self.wlLagN)
    #tempdata=self.GetData("D:/Weiyuan/templates/2015318092941.s15.z")
    tpd = np.load("template.npz")
    tempdata = tpd['c']
    for itrx in range(6):
        itr = np.random.randint(100)
        start = itrx * 50000 + 127
        end = start + len(tempdata[itr])
        #print(start,end,itr,np.shape(tempdata[itr]),np.shape(self.dt[start:end]))
        self.dt[start:end] = self.dt[start:end] + tempdata[itr] * 0.1
    x = np.linspace(0, 5000, len(self.dt))
    plt.plot(x, self.dt)
    plt.show()
def GetFingerPoint(self, hashbit):
    import simhash
    import mynilsimsa
    from datasketch import WeightedMinHashGenerator
    """
    schar=[]
    for iy in range(len(self.wlData)):
        tsc=[]
        for ix in range(len(self.wlData[0])):
            if(self.wlData[iy,ix]==1):
                tsc.append('a')
            elif(self.wlData[iy,ix]==-1):
                tsc.append('c')
            else:
                tsc.append('d')
        schar.append(tsc)
    for cr in schar:
        hh=simhash.simhash(''.join(cr),hashbits=hashbit)
        self.hash.append(hh.hash)
    """
    #"""
    wmg = WeightedMinHashGenerator(len(self.wlData[0]), sample_size=2, seed=12)
    for tr in self.wlData:
        try:
            wm = wmg.minhash(tr)  # wm1 is of the type WeightedMinHash
            vl = np.transpose(wm.hashvalues)
            vl = vl[0]
            self.hash.append(vl.tolist())
        except:
            print(tr)
def weighed_min_hash(self, num_perm=64, seed=42, use_components=None,
                     type_option=None, n_char=None, n_word=None, npz=None,
                     isrequest=False):
    """ """
    if npz:
        self.options = type_option
        self.num_perm = num_perm
        n = n_char if type_option == 'char' else n_word
        self.features['weighed_{}_{}_{}minhash'.format(use_components[0], type_option[0], n[0])] = \
            np.load(npz)['min_hash']
        return self
    use_components = use_components or ['name']
    type_option = type_option or ['char']
    n_char = n_char or [3]
    n_word = n_word or [1]
    if 'char' not in type_option and 'word' not in type_option:
        assert False, "Check the value of the type_option parameter."
    if 'name' not in use_components and 'addr' not in use_components:
        assert False, "Check the value of the use_components parameter."
    self.options = type_option
    self.num_perm = num_perm
    for i in use_components:
        for j in type_option:
            n_list = n_char if j == 'char' else n_word
            for n in n_list:
                wmg = WeightedMinHashGenerator(
                    len(self.features['tf_idf_{}_{}_{}grams'.format(i, j, n)][0]),
                    sample_size=num_perm, seed=seed)
                help_list = []
                for vector in self.features['tf_idf_{}_{}_{}grams'.format(i, j, n)]:
                    if np.all(vector == 0):
                        vector[0] = 0.000001  # Small hack: the generator rejects all-zero vectors.
                    help_list.append(wmg.minhash(vector))
                self.features['weighed_{}_{}_{}minhash'.format(i, j, n)] = np.array(help_list)
                file_path = 'data/min_hash_dadata/{}_{}_{}_weighed_minhash.npz'.format(i, j, n)
                if not isrequest:
                    np.savez_compressed(file_path, min_hash=np.array(help_list))
                del self.features['tf_idf_{}_{}_{}grams'.format(i, j, n)]
    return self
def test_calc_tiny(self):
    v1 = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
    v2 = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
    bgen = WeightedMinHashGenerator(len(v1))
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, verbosity=2)
    libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs, bgen.betas)
    m = csr_matrix(numpy.array([v1, v2], dtype=numpy.float32))
    hashes = libMHCUDA.minhash_cuda_calc(gen, m)
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (2, 128, 2))
    true_hashes = numpy.array([bgen.minhash(v1).hashvalues, bgen.minhash(v2).hashvalues],
                              dtype=numpy.uint32)
    self.assertEqual(true_hashes.shape, (2, 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as e:
        print("---- TRUE ----")
        print(true_hashes)
        print("---- FALSE ----")
        print(hashes)
        raise e from None
def avoid_unkown_error(candidate_eids, word_vectors):
    wmg = WeightedMinHashGenerator(word_vectors.shape[1], sample_size=300)
    wm = list()  # Calculating wm takes time!!!
    __candidate_eids = []
    for i, eid in tqdm(zip(range(len(candidate_eids)), candidate_eids),
                       total=len(candidate_eids)):
        try:
            wm.append(wmg.minhash(word_vectors[i]))
            __candidate_eids.append(eid)
        except ValueError:
            # minhash() raises ValueError for an all-zero vector; skip such entities
            pass
    return set(__candidate_eids), wmg, wm
def test_pickle(self):
    forest = MinHashLSHForest()
    mg = WeightedMinHashGenerator(10)
    m1 = mg.minhash(np.random.uniform(1, 10, 10))
    m2 = mg.minhash(np.random.uniform(1, 10, 10))
    forest.add("a", m1)
    forest.add("b", m2)
    forest.index()
    forest2 = pickle.loads(pickle.dumps(forest))
    result = forest2.query(m1, 2)
    self.assertTrue("a" in result)
    self.assertTrue("b" in result)
def test__H(self):
    '''
    Check _H output consistent bytes length given
    the same concatenated hash value size
    '''
    mg = WeightedMinHashGenerator(100, sample_size=128)
    for l in range(2, mg.sample_size + 1, 16):
        m = mg.minhash(np.random.randint(1, 99999999, 100))
        forest = MinHashLSHForest(num_perm=128, l=l)
        forest.add("m", m)
        sizes = [len(H) for ht in forest.hashtables for H in ht]
        self.assertTrue(all(sizes[0] == s for s in sizes))
def tiny_test():
    v1 = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
    v2 = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
    bgen = WeightedMinHashGenerator(len(v1), 128, 1)
    write_csv_float("tiny-rs.csv", bgen.rs)
    write_csv_float("tiny-ln_cs.csv", bgen.ln_cs)
    write_csv_float("tiny-betas.csv", bgen.betas)
    write_csv_int("tiny-data.csv", [v1, v2])
    write_csv_int("tiny-hashes-0.csv", bgen.minhash(v1).hashvalues)
    write_csv_int("tiny-hashes-1.csv", bgen.minhash(v2).hashvalues)
def run_perf(dim, num_rep, sample_size):
    wmg = WeightedMinHashGenerator(dim, sample_size=sample_size)
    logging.info("WeightedMinHash using %d samples" % sample_size)
    data = np.random.uniform(0, dim, (num_rep, dim))
    durs = []
    for i in range(num_rep):
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        wmg.minhash(data[i])
        duration = (time.perf_counter() - start) * 1000
        durs.append(duration)
    ave = np.mean(durs)
    logging.info("Generated %d minhashes, average time %.4f ms" % (num_rep, ave))
    return ave
def __init__(self, staName, force=False):
    self.wlWinN = 300
    self.wlLagN = 5
    self.fqWinN = 300
    self.fqLagN = 5
    self.fqRspN = 32
    self.wlRspN = 32
    self.selmax = 90
    self.wl_x_level = 3
    self.vectLen = self.fqRspN * self.wlRspN
    self.wmg = WeightedMinHashGenerator(self.vectLen, sample_size=2, seed=12)
    self.sta = False
    self.sphs = self.fqLagN * self.wlLagN / 100
    self.GetSta(staName, force=force)
def run_acc(dim, num_rep, sample_size):
    logging.info("WeightedMinHash using %d samples" % sample_size)
    wmg = WeightedMinHashGenerator(dim, sample_size=sample_size)
    data1 = np.random.uniform(0, dim, (num_rep, dim))
    data2 = np.random.uniform(0, dim, (num_rep, dim))
    errs = []
    for i in range(num_rep):
        wm1 = wmg.minhash(data1[i])
        wm2 = wmg.minhash(data2[i])
        j_e = wm1.jaccard(wm2)
        j = jaccard(data1[i], data2[i])
        errs.append(abs(j - j_e))
    ave = np.mean(errs)
    logging.info("%d runs, mean error %.4f" % (num_rep, ave))
    return ave
def test_deferred(self):
    v1 = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
    v2 = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, verbosity=2)
    vars = libMHCUDA.minhash_cuda_retrieve_vars(gen)
    libMHCUDA.minhash_cuda_fini(gen)
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, deferred=True, verbosity=2)
    libMHCUDA.minhash_cuda_assign_vars(gen, *vars)
    bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
    bgen.dim = len(v1)
    bgen.rs, bgen.ln_cs, bgen.betas = vars
    bgen.sample_size = 128
    bgen.seed = None
    m = csr_matrix(numpy.array([v1, v2], dtype=numpy.float32))
    hashes = libMHCUDA.minhash_cuda_calc(gen, m)
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (2, 128, 2))
    true_hashes = numpy.array([bgen.minhash(v1).hashvalues, bgen.minhash(v2).hashvalues],
                              dtype=numpy.uint32)
    self.assertEqual(true_hashes.shape, (2, 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as e:
        print("---- TRUE ----")
        print(true_hashes)
        print("---- FALSE ----")
        print(hashes)
        raise e from None
def test_float(self):
    v1 = [0, 1.0497366, 0.8494359, 0.66231006, 0.66231006, 0.8494359, 0,
          0.66231006, 0.33652836, 0, 0, 0.5359344, 0.8494359, 0.66231006,
          1.0497366, 0.33652836, 0.66231006, 0.8494359, 0.6800841, 0.33652836]
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, seed=7, verbosity=2)
    vars = libMHCUDA.minhash_cuda_retrieve_vars(gen)
    bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
    bgen.dim = len(v1)
    bgen.rs, bgen.ln_cs, bgen.betas = vars
    bgen.sample_size = 128
    bgen.seed = None
    m = csr_matrix(numpy.array(v1, dtype=numpy.float32))
    hashes = libMHCUDA.minhash_cuda_calc(gen, m).astype(numpy.int32)
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (1, 128, 2))
    true_hashes = numpy.array([bgen.minhash(v1).hashvalues], dtype=numpy.int32)
    self.assertEqual(true_hashes.shape, (1, 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as e:
        print("---- TRUE ----")
        print(true_hashes)
        print("---- FALSE ----")
        print(hashes)
        raise e from None
def big_test():
    numpy.random.seed(0)
    data = numpy.random.randint(0, 100, (6400, 130))
    mask = numpy.random.randint(0, 5, data.shape)
    data *= (mask >= 4)
    del mask
    bgen = WeightedMinHashGenerator(data.shape[-1])
    write_csv_float("big-rs.csv", bgen.rs)
    write_csv_float("big-ln_cs.csv", bgen.ln_cs)
    write_csv_float("big-betas.csv", bgen.betas)
    write_csv_int("big-data.csv", data)
    c = 0
    for line in data:
        write_csv_int("big-hashes-%d.csv" % c, bgen.minhash(line).hashvalues)
        c = c + 1
def get_250nearestNeighbors(seedEids, candidate_eids, word_vectors, idxByCandEidMap,
                            data, print_info, query_id, choice, wmg=None, wm=None):
    # choice == 1: return 250 nearest Neighbors based on Jaccard Similarity with seeds.
    # choice == 2: return 250 nearest Neighbors based on word embeddings.
    start = time.time()
    if wmg is None or wm is None:
        wmg = WeightedMinHashGenerator(word_vectors.shape[1], sample_size=300)
        wm = list()  # Calculating wm takes time!!!
        for i in range(len(candidate_eids)):
            wm.append(wmg.minhash(word_vectors[i]))
    distToSeedsByEid = get_distToSeedsByEid(candidate_eids, wm, seedEids,
                                            idxByCandEidMap, data, choice)
    nearestNeighbors = []
    ct = 0
    for eid in sorted(distToSeedsByEid, key=distToSeedsByEid.__getitem__, reverse=True):
        if ct >= 250:
            break
        if eid not in seedEids:
            nearestNeighbors.append(eid)
            ct += 1
    assert ct >= 250
    # print 'Nearest Neighbors are: '
    # for i in nearestNeighbors:
    #     print eidToEntityMap[i],
    # print ' '
    end = time.time()
    print('[utils.py] Done finding 250 nearest neighbors using %.1f seconds'
          % (end - start))
    print_info[query_id] += ('[utils.py] Done finding 250 nearest neighbors using %.1f seconds\n'
                             % (end - start))
    return nearestNeighbors, wmg, wm
def main():
    data1 = ['this', 'is', 'a', 'pen']
    data2 = ['that', 'is', 'a', 'pen']
    data3 = ['it', 'is', 'a', 'pon']
    m1 = get_minhash(data1)
    m2 = get_minhash(data2)
    m3 = get_minhash(data3)
    print('minhash:')
    print(m1.jaccard(m2))
    print(m1.jaccard(m3))
    print(m2.jaccard(m3))
    print()

    v1 = [1, 2, 3, 4, 5, 6]
    v2 = [2, 5, 7, 9, 11, 13]
    v3 = [1, 2, 3, 4, 5, 7]
    wmg = WeightedMinHashGenerator(len(v1))
    wm1 = wmg.minhash(v1)
    wm2 = wmg.minhash(v2)
    wm3 = wmg.minhash(v3)
    print('weighted minhash:')
    print(wm1.jaccard(wm2))
    print(wm1.jaccard(wm3))
    print(wm2.jaccard(wm3))
    print()

    data = data1 + data2 + data3
    hll = get_hyperloglog(data)
    print('hyperloglog:')
    print(hll.count())
    print()

    hpp = get_hyperloglog_pp(data)
    print('hyperloglog++:')
    print(hpp.count())
    print()

    print('done')
def test_insert(self):
    forest = MinHashLSHForest()
    mg = WeightedMinHashGenerator(10)
    m1 = mg.minhash(np.random.uniform(1, 10, 10))
    m2 = mg.minhash(np.random.uniform(1, 10, 10))
    forest.add("a", m1)
    forest.add("b", m2)
    self.assertTrue(forest.is_empty())
    for t in forest.hashtables:
        self.assertTrue(len(t) >= 1)
        items = []
        for H in t:
            items.extend(t[H])
        self.assertTrue("a" in items)
        self.assertTrue("b" in items)
    self.assertTrue("a" in forest)
    self.assertTrue("b" in forest)
    for i, H in enumerate(forest.keys["a"]):
        self.assertTrue("a" in forest.hashtables[i][H])
    forest.index()
    self.assertFalse(forest.is_empty())
    mg = WeightedMinHashGenerator(10, 5)
    m3 = mg.minhash(np.random.uniform(1, 10, 10))
    self.assertRaises(ValueError, forest.add, "c", m3)
def run_test(v):
    k = sum([len(part) for part in v])
    # k is already the total dimension (an int); the flattened source's len(k) would fail
    bgen = WeightedMinHashGenerator(k)
    gen = libMHCUDA.minhash_cuda_init(k, 128, devices=4, verbosity=2)
    libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs, bgen.betas)
    m = csr_matrix(numpy.array(v, dtype=numpy.float32))
    hashes = None
    try:
        hashes = libMHCUDA.minhash_cuda_calc(gen, m)
    finally:
        self.assertIsNotNone(hashes)
        self.assertEqual(hashes.shape, (1, 128, 2))
        libMHCUDA.minhash_cuda_fini(gen)
def __init__(self, staName, force=False):
    self.wlWinN = 200
    self.wlLagN = 5
    self.fqWinN = 100
    self.fqLagN = 1
    self.fqRspN = 16
    self.wlRspN = 8
    self.selmax = 50
    self.wl_x_level = 3
    self.cycle = 0
    self.num_perm = 20
    self.vectLen = self.fqRspN * self.wlRspN
    self.wmg = WeightedMinHashGenerator(self.selmax, sample_size=10, seed=12)
    self.sta = False
    self.sphs = self.fqLagN * self.wlLagN / 100
    self.GetSta(staName, force=force)
def test_query(self):
    forest = MinHashLSHForest()
    mg = WeightedMinHashGenerator(10)
    m1 = mg.minhash(np.random.uniform(1, 10, 10))
    m2 = mg.minhash(np.random.uniform(1, 10, 10))
    forest.add("a", m1)
    forest.add("b", m2)
    forest.index()
    result = forest.query(m1, 2)
    self.assertTrue("a" in result)
    self.assertTrue("b" in result)
    mg = WeightedMinHashGenerator(10, 5)
    m3 = mg.minhash(np.random.uniform(1, 10, 10))
    self.assertRaises(ValueError, forest.query, m3, 1)
def compare(self, num_bits):
    """compares results from different methods"""
    self.wmg = WeightedMinHashGenerator(self.num_col, sample_size=num_bits,
                                        seed=self.rnd_seed)
    results = defaultdict(lambda: np.zeros((self.params['test_size'], )))
    combs = self.prepare_test_set()
    start = time()
    for i, comb in enumerate(combs):
        title_x, title_y = comb
        arr_x = self.dataset.loc[title_x].values
        arr_y = self.dataset.loc[title_y].values
        # ground truth
        results['gt'][i] = cosine_similarity(arr_x.reshape(1, -1),
                                             arr_y.reshape(1, -1))[0][0]
        results['minhash'][i] = self.compute_jacc(arr_x, arr_y, 'datasketch')
        # neural hashes
        hash_x = self.neural_hashes.loc[title_x].values
        hash_y = self.neural_hashes.loc[title_y].values
        results['neural'][i] = self.compute_jacc(hash_x, hash_y, 'sklearn')
        if i % 100 == 0:
            print(f'Completed {i}/{self.params["test_size"]} samples.', end='\r')
    print(f'Computing similarity took {time() - start} seconds.')
    scores = dict()
    scores['minhash'] = self.compute_mae(results['gt'], results['minhash'])
    scores['neural'] = self.compute_mae(results['gt'], results['neural'])
    return scores
def getICWSGenerator(k: int, s: int):
    from datasketch import WeightedMinHashGenerator
    return WeightedMinHashGenerator(4 ** k, sample_size=s)
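# A minimal usage sketch for the generator above (not from the original source).
# Assumption: the 4**k dimension corresponds to counts of DNA k-mers over the
# alphabet ACGT; kmer_counts and the example sequences below are hypothetical.
import itertools
import numpy as np


def kmer_counts(seq, k):
    # map every possible k-mer to a position in a dense count vector of length 4**k
    index = {''.join(p): i for i, p in enumerate(itertools.product('ACGT', repeat=k))}
    vec = np.zeros(4 ** k)
    for i in range(len(seq) - k + 1):
        vec[index[seq[i:i + k]]] += 1
    return vec


gen = getICWSGenerator(k=3, s=128)
wm1 = gen.minhash(kmer_counts('ACGTACGTACGTGG', 3))
wm2 = gen.minhash(kmer_counts('ACGTACGTACGAGG', 3))
print(wm1.jaccard(wm2))  # estimated weighted Jaccard between the two k-mer profiles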
# import libraries
import pandas as pd
from datasketch import WeightedMinHashGenerator
from datasketch import MinHashLSHForest

# read the pickle file that was created by the 'tfidf.py' script
# this file contains the weighted representation of all products in the 'Bestprice' dataset
df_tfidf = pd.read_pickle('data/tfidfs/df_tfidf_brand_1-0.pkl')

# create an extra column with the minhash id (m1, m2 etc)
df_tfidf['Minhash_id'] = df_tfidf['doc_id'].apply(lambda x: 'm' + str(x))

# create a WeightedMinHashGenerator object with the appropriate arguments
# dim: dimension - the number of unique terms
# sample_size: number of samples (similar to the number of permutation functions in MinHash)
mg = WeightedMinHashGenerator(dim=35405, sample_size=128)


def create_minhash(doc):
    """
    This function takes the weighted representation of a product and returns its
    MinHash signature.

    :param doc: The weighted representation of the product
    :return: The MinHash signature of the product as a MinHash object
    """
    term_ids = doc['term_id']
    tfidfs = doc['tfidf']
    tfidf_list = [0] * 35405
    i = 0
    for term_id in term_ids:
        tfidf_list[term_id] = tfidfs[i]
        i += 1
    # The excerpt is cut here; per the docstring, the dense tf-idf vector is then
    # hashed with the generator defined above (assumed completion):
    return mg.minhash(tfidf_list)
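# Hedged sketch (not part of the original excerpt): the script also imports
# MinHashLSHForest, so presumably the signatures are indexed for approximate
# nearest-neighbour search. The code below is an illustrative continuation,
# not the author's actual indexing step.
forest = MinHashLSHForest(num_perm=128)
for _, row in df_tfidf.iterrows():
    forest.add(row['Minhash_id'], create_minhash(row))
forest.index()  # the forest must be indexed before it can be queried
# example query: the 10 most similar products to the first one
print(forest.query(create_minhash(df_tfidf.iloc[0]), 10))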
from datasketch import WeightedMinHashGenerator
import numpy as np

v1 = [1, 3, 4, 5, 6, 7, 8, 9, 10, 4]
v2 = [2, 4, 3, 8, 4, 7, 10, 9, 0, 0]
min_sum = np.sum(np.minimum(v1, v2))
max_sum = np.sum(np.maximum(v1, v2))
true_jaccard = float(min_sum) / float(max_sum)

wmg = WeightedMinHashGenerator(len(v1))
wm1 = wmg.minhash(v1)
wm2 = wmg.minhash(v2)
print("Estimated Jaccard is", wm1.jaccard(wm2))
print("True Jaccard is", true_jaccard)
class SacFig():
    def GetData(self, file):
        st = pread(file)
        return st[0].data

    def __init__(self, staName, force=False):
        self.wlWinN = 300
        self.wlLagN = 10
        self.fqWinN = 300
        self.fqLagN = 10
        self.fqRspN = 32
        self.wlRspN = 32
        self.selmax = 30
        self.wl_x_level = 3
        self.vectLen = self.fqRspN * self.wlRspN
        self.wmg = WeightedMinHashGenerator(self.vectLen, sample_size=2, seed=12)
        self.sta = False
        self.sphs = self.fqLagN * self.wlLagN / 100
        self.GetSta(staName, force=force)
        #tempdata=self.GetData("D:/Weiyuan/templates/2015318092941.s15.z")

    def GetHash(self):
        self.hash = []
        step = self.fqLagN * self.wlLagN
        for itr in range(self.datalen):
            data = self.STFT(self.dt, itr * step)
            data = self.WAVELET(data, self.wl_x_level)
            data = self.REGU(data)
            data = self.TRIM(data, self.selmax)
            data = self.FIG(data)
            self.hash.append(data)
        return self.hash

    def GetHashTofile(self, fileName, outfile, force=False):
        ext = [
            DIR + "s28/2015336/2015336_00_00_00_s28_BHZ.SAC",
            DIR + "s28/2015336/2015336_00_00_00_s28_BHN.SAC",
            DIR + "s28/2015336/2015336_00_00_00_s28_BHE.SAC"
        ]
        dt = []
        if (len(self.GetData(fileName[0])) < 10000):
            #print(len(self.GetData(fileName[0])<10000))
            for fn in range(len(fileName)):
                ddt = self.GetData(ext[fn])[:10000]
                cp = len(self.GetData(fileName[fn]))
                ddt[:cp] = ddt[:cp] + np.zeros([cp])  #+self.GetData(fileName[fn])
                dt.append(ddt)
        else:
            for fn in fileName:
                dt.append(self.GetData(fn)[:500000])
        datalen = int((len(dt[0]) - self.wlLagN * self.wlWinN - self.fqWinN)
                      / self.fqLagN / self.wlLagN)
        file = open(outfile, "w")
        step = self.fqLagN * self.wlLagN
        for itr in range(datalen):
            data = self.STFT(dt, itr * step)
            data = self.WAVELET(data, 3)
            data = self.REGU(data)
            data = self.TRIM(data, self.selmax)
            try:
                data = self.FIG(data)
            except:
                data = [0, 0]
            tm = itr * self.sphs
            file.write("%f," % (tm))
            for itr in data:
                file.write("%d," % itr)
            file.write("\n")
        file.close()

    def GetSta(self, fileName, force=False):
        if (force == True):
            self.sta = False
            if (os.path.exists("stat.out.npz") == True):
                os.remove("stat.out.npz")
        if (self.sta == True):
            return
        if (os.path.exists("stat.out.npz") == True):
            data = np.load("stat.out.npz")
            self.mu = data['a']
            self.sigma = data['b']
            self.sta = True
            return
        dt_for_sta = np.zeros([4000, self.vectLen])
        dt = []
        for fn in fileName:
            dt.append(self.GetData(fn))
        for idx in range(4000):
            data = self.STFT(dt, idx * 1000)
            dt_for_sta[idx, :] = self.WAVELET(data, 3)
        self.mu = np.average(dt_for_sta, axis=0)
        self.sigma = np.std(dt_for_sta, axis=0)
        np.savez("stat.out.npz", a=self.mu, b=self.sigma)
        self.sta = True

    def STFT(self, xx, idx):
        fqWin = int(self.fqWinN / 2)
        sumx = np.zeros([self.wlWinN, fqWin])
        tpx = np.zeros([self.wlWinN, self.fqWinN])
        for itx in xx:
            w = np.hanning(self.fqWinN)
            tpx[:, :] = np.zeros([self.wlWinN, self.fqWinN])
            for ii in range(self.wlWinN):
                start = ii * self.fqLagN + idx
                tpx[ii, :] = itx[start:start + self.fqWinN]
            X = scipy.fft(tpx, axis=1)
            X = (X * w)[:, :fqWin]
            X = np.square(np.abs(X))
            sumx = np.add(sumx, X)
        sumx = np.sqrt(sumx)
        sumx = resample(sumx, self.wlRspN, axis=0)
        sumx = resample(sumx, self.fqRspN, axis=1)
        return sumx

    def WAVELET(self, data, level):
        outdt = pywt.wavedec2(data, 'haar', level=level)
        out = np.zeros([self.vectLen])
        idx = 0
        for itr in outdt:
            evaldt = np.reshape(itr, [-1])
            cit = len(evaldt)
            out[idx:idx + cit] = evaldt[:]
            idx += cit
        out2 = np.sqrt(np.sum(np.square(out)))
        out = np.divide(out, out2)
        return out

    def REGU(self, data):
        rdata = data - self.mu
        rdata = np.divide(rdata, self.sigma)
        return rdata

    def TRIM(self, data, nlarge):
        absdata = np.abs(data)
        large = heapq.nlargest(nlarge, range(len(data)), key=lambda x: absdata[x])
        lst = np.zeros_like(data)
        for itr in large:
            lst[itr] = data[itr] / absdata[itr]
        return lst

    def FIG(self, tr):
        wm = self.wmg.minhash(tr)  # wm1 is of the type WeightedMinHash
        vl = np.transpose(wm.hashvalues)
        vl = vl[0]
        return (vl.tolist())
def create_weighted_minhash(data):
    global wmg
    if not wmg:
        wmg = WeightedMinHashGenerator(len(data), 128, seed=12)
    minhash = wmg.minhash(data)
    return minhash
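# Note (addition, not in the original snippet): this helper assumes a module-level
# `wmg = None` so the generator is created lazily on the first call and reused
# afterwards, which keeps all signatures comparable (same sampled rs / ln_cs / betas).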
def __init__(self, tarfile_ortho=None, h5_oma=None, taxa=None, masterTree=None,
             saving_name=None, numperm=256, treeweights=None, taxfilter=None,
             taxmask=None, verbose=False):
    if h5_oma is not None:
        from pyoma.browser import db
        self.h5OMA = h5_oma
        self.db_obj = db.Database(h5_oma)
        self.oma_id_obj = db.OmaIdMapper(self.db_obj)
    elif tarfile_ortho:
        self.tar = tarfile_ortho
        self.h5OMA = None
        self.db_obj = None
        self.oma_id_obj = None
    self.tax_filter = taxfilter
    self.tax_mask = taxmask
    self.verbose = verbose
    self.datetime = datetime
    self.date_string = "{:%B_%d_%Y_%H_%M}".format(datetime.now())
    self.saving_name = saving_name
    #original_umask = os.umask(0)
    if saving_name:
        self.saving_path = config_utils.datadir + saving_name + '/'
        if not os.path.isdir(self.saving_path):
            os.mkdir(path=self.saving_path)
    else:
        self.saving_path = config_utils.datadir + self.date_string + '/'
        if not os.path.isdir(self.saving_path):
            os.mkdir(path=self.saving_path)
    if masterTree is None:
        if h5_oma:
            genomes = pd.DataFrame(h5_oma.root.Genome.read())["NCBITaxonId"].tolist()
            genomes = [str(g) for g in genomes]
            taxa = genomes + [131567, 2759, 2157, 45596] + \
                [taxrel[0] for taxrel in list(h5_oma.root.Taxonomy[:])] + \
                [taxrel[1] for taxrel in list(h5_oma.root.Taxonomy[:])]
            self.tree_string, self.tree_ete3 = files_utils.get_tree(
                taxa=taxa, genomes=genomes, savename=saving_name)
        elif taxa:
            with open(taxa, 'r') as taxin:
                taxlist = [int(line) for line in taxin]
            self.tree_string, self.tree_ete3 = files_utils.get_tree(
                taxa=taxlist, savename=saving_name)
        else:
            raise Exception('please specify either a list of taxa or a tree')
    elif masterTree:  # fixed: the flattened source referred to an undefined `mastertree`
        with open(masterTree, 'rb') as pklin:  # read the pickled tree in binary mode
            self.tree_ete3 = pickle.loads(pklin.read())
        self.tree_string = self.tree_ete3.write(format=1)
    self.taxaIndex, self.reverse = files_utils.generate_taxa_index(
        self.tree_ete3, self.tax_filter, self.tax_mask)
    with open(config_utils.datadir + 'taxaIndex.pkl', 'wb') as taxout:
        taxout.write(pickle.dumps(self.taxaIndex))
    self.numperm = numperm
    if treeweights is None:
        # generate all ones
        self.treeweights = hashutils.generate_treeweights(
            self.tree_ete3, self.taxaIndex, taxfilter, taxmask)
    else:
        # load machine learning weights
        self.treeweights = treeweights
    print(self.treeweights)
    wmg = WeightedMinHashGenerator(3 * len(self.taxaIndex), sample_size=numperm, seed=1)
    with open(self.saving_path + saving_name + 'wmg.pkl', 'wb') as taxout:
        # note: despite the filename, this writes taxaIndex, as in the source
        taxout.write(pickle.dumps(self.taxaIndex))
    self.wmg = wmg
    self.HAM_PIPELINE = functools.partial(pyhamutils.get_ham_treemap_from_row,
                                          tree=self.tree_string)
    self.HASH_PIPELINE = functools.partial(hashutils.row2hash,
                                           taxaIndex=self.taxaIndex,
                                           treeweights=self.treeweights,
                                           wmg=wmg)
    if self.h5OMA:
        self.READ_ORTHO = functools.partial(pyhamutils.get_orthoxml_oma,
                                            db_obj=self.db_obj)
    elif self.tar:
        self.READ_ORTHO = pyhamutils.get_orthoxml
    self.hashes_path = self.saving_path + 'hashes.h5'
    self.lshpath = self.saving_path + 'newlsh.pkl'
    self.lshforestpath = self.saving_path + 'newlshforest.pkl'
    self.mat_path = self.saving_path + 'hogmat.h5'
    self.columns = len(self.taxaIndex)
def calcuJaccard(hashSig1, hashSig2):
    # np.float was removed in NumPy 1.24; plain float behaves identically here
    return float(np.count_nonzero(hashSig1 == hashSig2)) / float(len(hashSig1))


if __name__ == '__main__':
    data1 = ['TAG', 'DSA', 'FDV', 'GFG', 'TRE', 'EWE', 'QWW', 'RFR',
             'QWE', 'ZAW', 'CDS', 'NBH']
    data2 = ['TAG', 'DSA', 'FDV', 'GFG', 'TRE', 'EWE', 'QWW', 'RFR',
             'SDA', 'ZAW', 'CDS', 'NBH']
    m1, m2 = MinHash(), MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

    data3 = [2, 0, 0, 0, 0, 3, 0, 0, 2, 0, 2, 0, 0]
    data4 = [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0]
    weight = WeightedMinHashGenerator(len(data3))
    w1 = weight.minhash(data3)
    w2 = weight.minhash(data4)
    print("Weighted Jac ", w1.jaccard(w2))
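# Note (addition, not in the original script): calcuJaccard is defined above but never
# called in this excerpt. Applied to raw signatures it gives the same estimator as
# MinHash.jaccard, e.g.:
#     print("calcuJaccard estimate:", calcuJaccard(m1.hashvalues, m2.hashvalues))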
class SacFig():
    def GetData(self, file):
        st = pread(file)
        return st[0].data

    def __init__(self, fileName):
        self.wlWinN = 200
        self.wlLagN = 10
        self.fqWinN = 200
        self.fqLagN = 10
        self.fqRspN = 32
        self.wlRspN = 32
        self.selmax = 50
        self.wl_x_level = 3
        self.vectLen = self.fqRspN * self.wlRspN
        self.wmg = WeightedMinHashGenerator(self.vectLen, sample_size=2, seed=12)
        self.sta = False
        self.sphs = self.fqLagN * self.wlLagN / 100
        self.GetSta(fileName, force=True)
        for fn in fileName:
            self.dt = self.GetData(fileName)[:500000]
        self.datalen = int((len(self.dt) - self.wlLagN * self.wlWinN - self.fqWinN)
                           / self.fqLagN / self.wlLagN)
        #tempdata=self.GetData("D:/Weiyuan/templates/2015318092941.s15.z")
        tpd = np.load("template.npz")
        tempdata = tpd['c']
        for itrx in range(6):
            itr = np.random.randint(100)
            start = itrx * 50000 + 127
            end = start + len(tempdata[itr])
            #print(start,end,itr,np.shape(tempdata[itr]),np.shape(self.dt[start:end]))
            self.dt[start:end] = self.dt[start:end] + tempdata[itr] * 0.1
        x = np.linspace(0, 5000, len(self.dt))
        plt.plot(x, self.dt)
        plt.show()

    def GetHash(self):
        self.hash = []
        step = self.fqLagN * self.wlLagN
        for itr in range(self.datalen):
            data = self.STFT(self.dt, itr * step)
            data = self.WAVELET(data, self.wl_x_level)
            data = self.REGU(data)
            data = self.TRIM(data, self.selmax)
            data = self.FIG(data)
            self.hash.append(data)
        return self.hash

    def GetHashTofile(self, file):
        self.hash = []
        step = self.fqLagN * self.wlLagN
        for itr in range(self.datalen):
            data = self.STFT(self.dt, itr * step)
            data = self.WAVELET(data, 3)
            data = self.REGU(data)
            data = self.TRIM(data, self.selmax)
            data = self.FIG(data)
            self.hash.append(data)
            tm = itr * self.sphs
            file.write("%f," % (tm))
            for itr in data:
                file.write("%d," % itr)
            file.write("\n")

    def GetSta(self, fileName, force=False):
        if (force == True):
            self.sta = False
            os.remove("stat.out.npz")
        if (self.sta == True):
            return
        if (os.path.exists("stat.out.npz") == True):
            data = np.load("stat.out.npz")
            self.mu = data['a']
            self.sigma = data['b']
            self.sta = True
            return
        dt_for_sta = np.zeros([4000, self.vectLen])
        dta = self.GetData(fileName)
        for idx in range(4000):
            dt = self.STFT(dta, idx * 1000)
            dt_for_sta[idx, :] = self.WAVELET(dt, 3)
        self.mu = np.average(dt_for_sta, axis=0)
        self.sigma = np.std(dt_for_sta, axis=0)
        np.savez("stat.out.npz", a=self.mu, b=self.sigma)
        self.sta = True

    def STFT(self, x, idx):
        w = np.hanning(self.fqWinN)
        fqWin = int(self.fqWinN / 2)
        X = np.zeros([self.wlWinN, self.fqWinN])
        for ii in range(self.wlWinN):
            start = ii * self.fqLagN + idx
            X[ii, :] = x[start:start + self.fqWinN] * w
        X = scipy.fft(X, axis=1)[:fqWin]
        X = np.abs(np.array(X))
        X = resample(X, self.wlRspN, axis=0)
        X = resample(X, self.fqRspN, axis=1)
        return X

    def WAVELET(self, data, level):
        outdt = pywt.wavedec2(data, 'haar', level=level)
        out = np.zeros([self.vectLen])
        idx = 0
        for itr in outdt:
            evaldt = np.reshape(itr, [-1])
            cit = len(evaldt)
            out[idx:idx + cit] = evaldt[:]
            idx += cit
        out2 = np.sqrt(np.sum(np.square(out)))
        out = np.divide(out, out2)
        return out

    def REGU(self, data):
        rdata = data - self.mu
        rdata = np.divide(rdata, self.sigma)
        return rdata

    def TRIM(self, data, nlarge):
        absdata = np.abs(data)
        large = heapq.nlargest(nlarge, range(len(data)), key=lambda x: absdata[x])
        lst = np.zeros_like(data)
        for itr in large:
            lst[itr] = data[itr] / absdata[itr]
        return lst

    def FIG(self, tr):
        wm = self.wmg.minhash(tr)  # wm1 is of the type WeightedMinHash
        vl = np.transpose(wm.hashvalues)
        vl = vl[0]
        return (vl.tolist())