def owner_cluster(con, cur, nitem=None, reverse=True, nshingle=2, store=False, **kwargs):
    """Cluster owner names and collect the owner-id pairs the cluster links.

    Reads ``(ownerid, name)`` rows from the ``owner`` table, adds every name
    to an ``sh.Cluster`` as a weighted feature list (the name's shingles
    followed by its whitespace-separated words), then reads the linked id
    pairs from ``Cluster.unions``.

    Args:
        con: Database connection; only used to commit when ``store`` is true.
        cur: Cursor used for the select and, when storing, for the writes.
        nitem: Optional row limit appended to the query; falsy means no limit.
        reverse: When true, rows are read in descending ``rowid`` order.
        nshingle: Second argument forwarded to ``sh.shingle``.
        store: When true, rebuild the ``pair`` table and return ``None``;
            when false, return the pairs instead.
        **kwargs: Forwarded unchanged to ``sh.Cluster``.

    Returns:
        ``(ipairs, npairs)`` when ``store`` is false, where ``ipairs`` is
        ``Cluster.unions`` and ``npairs`` holds the matching name pairs;
        ``None`` when ``store`` is true.
    """
    cluster = sh.Cluster(**kwargs)

    query = 'select ownerid,name from owner'
    query += ' order by rowid desc' if reverse else ''
    query += (' limit %i' % nitem) if nitem else ''

    names_by_id = {}
    for row_num, (ownerid, name) in enumerate(cur.execute(query)):
        shingles = list(sh.shingle(name, nshingle))
        words = name.split()
        # Each feature group gets weights falling linearly from 1.0 to 0.0,
        # so earlier shingles/words count for more than later ones.
        weights = [
            *np.linspace(1.0, 0.0, len(shingles)),
            *np.linspace(1.0, 0.0, len(words)),
        ]
        cluster.add(shingles + words, weights=weights, label=ownerid)
        names_by_id[ownerid] = name
        if not row_num % 10000:
            print(row_num)  # progress marker every 10000 rows

    ipairs = cluster.unions
    npairs = [(names_by_id[left], names_by_id[right]) for left, right in ipairs]
    print('Found %i pairs' % len(ipairs))

    if not store:
        return (ipairs, npairs)

    # Replace any previous results with a fresh ``pair`` table.
    cur.execute('drop table if exists pair')
    cur.execute('create table pair (ownerid1 int, ownerid2 int, name1 text, name2 text)')
    rows = [
        (id1, id2, name1, name2)
        for (id1, id2), (name1, name2) in zip(ipairs, npairs)
    ]
    cur.executemany('insert into pair values (?,?,?,?)', rows)
    con.commit()
def compute_simhash(text):
    """Return the simhash of ``text`` built from 4-character shingles.

    The text is lowercased, split on runs of non-word characters, and the
    pieces are concatenated with no separator before shingling, so the
    shingles are windows over the word characters only.
    """
    squashed = ''.join(re.split(r'\W+', text.lower(), flags=re.UNICODE))
    hashes = []
    for window in simhash.shingle(squashed, 4):
        # Each window is a sequence of characters; rejoin it and hash its
        # UTF-8 bytes.
        hashes.append(simhash.unsigned_hash(''.join(window).encode('utf8')))
    return simhash.compute(hashes)
def tokens_to_fingerprint(tokens):
    """Return the simhash of ``tokens`` built from 8-character shingles.

    The tokens are concatenated with no separator and shingled as one
    string. Each shingle is encoded to UTF-8 with the ``'ignore'`` error
    handler before hashing.
    """
    joined = ''.join(tokens)
    hashes = []
    for window in simhash.shingle(joined, 8):
        encoded = ''.join(window).encode('utf8', 'ignore')
        hashes.append(simhash.unsigned_hash(encoded))
    return simhash.compute(hashes)
def compute(self, data, token_size):
    """Return the simhash of ``data`` from shingles of ``token_size`` items.

    ``data`` is lowercased, reduced to the concatenation of everything
    ``self.filter.findall`` returns (presumably a compiled regex selecting
    the characters to keep -- confirm against the class), and shingled.
    The shingles are sorted before hashing.
    """
    kept = ''.join(self.filter.findall(data.lower()))
    ordered = sorted(''.join(window) for window in shingle(kept, token_size))
    # ``compute`` here is the module-level function, not this method.
    return compute([unsigned_hash(item.encode("utf-8")) for item in ordered])
def hash_text(t):
    """Compute a simhash for the text of a ``(text, domain, year, season)`` record.

    The text is tokenized with ``nltk.tokenize.word_tokenize``, grouped into
    4-token shingles joined by single spaces, and each shingle is hashed
    with the built-in ``hash`` before being folded into one simhash.

    Args:
        t: A 4-item sequence ``(text, domain, year, season)``.

    Returns:
        ``(h, text, domain, year, season)`` where ``h`` is the simhash and
        the remaining items are passed through unchanged.

    NOTE(review): built-in ``hash`` of ``str`` is salted per process unless
    ``PYTHONHASHSEED`` is fixed, so these fingerprints are only comparable
    within one interpreter run -- confirm that is acceptable for callers.
    """
    text, domain, year, season = t
    tokens = nltk.tokenize.word_tokenize(text)
    # See https://github.com/seomoz/simhash-py/issues/47
    # A generator for ' '-joined strings of consecutive tokens
    shingles = (' '.join(shingle) for shingle in simhash.shingle(tokens, 4))
    # simhash needs unsigned 64-bit ints. ``c_uint64`` is 64 bits on every
    # platform; ``c_ulong`` is only 32 bits where C ``unsigned long`` is
    # (e.g. Windows), which would silently truncate the hashes.
    h = simhash.compute(
        [ctypes.c_uint64(hash(shingle)).value for shingle in shingles]
    )
    return (h, text, domain, year, season)
def compute(text):
    """Compute the simhash of a document from 4-word shingles.

    The text is split on whitespace; every window of four consecutive words
    is joined with single spaces and hashed with ``simhash.unsigned_hash``.

    NOTE(review): the joined phrases are passed to ``unsigned_hash`` as
    ``str`` without encoding -- confirm the installed simhash accepts that.
    """
    words = text.split()
    joined = map(' '.join, simhash.shingle(words, 4))
    return simhash.compute(map(simhash.unsigned_hash, joined))
def test_basic(self):
    """A window of 4 over 0..9 yields the seven consecutive 4-item runs."""
    tokens = list(range(10))
    # [0, 1, 2, 3] through [6, 7, 8, 9]
    expected = [list(range(start, start + 4)) for start in range(7)]
    actual = list(simhash.shingle(tokens, 4))
    self.assertEqual(expected, actual)
def compute(text):
    """Compute the simhash of a document from 4-word shingles.

    NOTE(review): phrases reach ``simhash.unsigned_hash`` as ``str`` without
    encoding -- confirm the installed simhash accepts that.
    """
    # split() with no arguments splits on whitespace, giving the token list.
    windows = simhash.shingle(text.split(), 4)
    # Join each 4-word window with single spaces, then hash each phrase.
    return simhash.compute(map(simhash.unsigned_hash, map(' '.join, windows)))
def owner_cluster(con, cur, nitem=None, reverse=True, nshingle=2, store=True, **kwargs):
    """Cluster owner names and store or return the linked owner pairs.

    Every ``(ownerid, name)`` row of the ``owner`` table is added to an
    ``sh.Cluster`` with the name's shingles and words as weighted features.
    The id pairs in ``Cluster.unions`` are then either written to the
    ``pair`` table (``store`` true, the default) or returned.

    Args:
        con: Database connection; committed after the ``pair`` table is written.
        cur: Cursor used for the select and for writing results.
        nitem: Optional row limit; falsy means all rows.
        reverse: When true, read rows in descending ``rowid`` order.
        nshingle: Second argument forwarded to ``sh.shingle``.
        store: Write results to the ``pair`` table instead of returning them.
        **kwargs: Forwarded unchanged to ``sh.Cluster``.

    Returns:
        ``None`` when ``store`` is true; otherwise ``(ipairs, npairs)``, the
        id pairs and the corresponding name pairs.
    """

    def linear_weights(count):
        # Weights fall linearly from 1.0 to 0.0 across ``count`` features.
        return list(np.linspace(1.0, 0.0, count))

    print('generating hashes and pairs')
    cluster = sh.Cluster(**kwargs)

    sql = 'select ownerid,name from owner'
    if reverse:
        sql = sql + ' order by rowid desc'
    if nitem:
        sql = sql + ' limit %i' % nitem

    owner_names = {}
    for count, (ownerid, name) in enumerate(cur.execute(sql)):
        tokens = name.split()
        grams = list(sh.shingle(name, nshingle))
        cluster.add(
            grams + tokens,
            weights=linear_weights(len(grams)) + linear_weights(len(tokens)),
            label=ownerid,
        )
        owner_names[ownerid] = name
        if count % 10000 == 0:
            print(count)  # progress marker every 10000 rows

    ipairs = cluster.unions
    npairs = [(owner_names[first], owner_names[second]) for first, second in ipairs]
    print('Found %i pairs' % len(ipairs))

    if not store:
        return (ipairs, npairs)

    # Rebuild the ``pair`` table from scratch with ids and names side by side.
    cur.execute('drop table if exists pair')
    cur.execute(
        'create table pair (ownerid1 int, ownerid2 int, name1 text, name2 text)'
    )
    records = [
        (first_id, second_id, first_name, second_name)
        for (first_id, second_id), (first_name, second_name) in zip(ipairs, npairs)
    ]
    cur.executemany('insert into pair values (?,?,?,?)', records)
    con.commit()
def filter_pairs(con, nshingle=2, k=8, thresh=4):
    """Add every row of the ``name`` table to an ``sh.Cluster``.

    Each name contributes its shingles followed by its whitespace-separated
    words as features, with weights falling linearly from 1.0 to 0.0 within
    each group. Progress, including the current number of unions, is
    printed every 100,000 rows.

    Args:
        con: Database connection passed to ``pandas.read_sql``.
        nshingle: Second argument forwarded to ``sh.shingle``.
        k: Forwarded to ``sh.Cluster`` as ``k``.
        thresh: Forwarded to ``sh.Cluster`` as ``thresh``.

    NOTE(review): as visible here the function returns nothing and the
    cluster and id-to-name mapping stay local -- confirm whether the
    definition continues beyond this excerpt.
    """
    print('filtering pairs')
    cluster = sh.Cluster(k=k, thresh=thresh)
    names_by_id = {}
    frame = pd.read_sql('select id,name from name', con)
    # itertuples() yields (index, id, name) for the two selected columns.
    for row_index, name_id, name in frame.itertuples():
        grams = list(sh.shingle(name, nshingle))
        tokens = name.split()
        weights = [
            *np.linspace(1.0, 0.0, len(grams)),
            *np.linspace(1.0, 0.0, len(tokens)),
        ]
        cluster.add(grams + tokens, weights=weights, label=name_id)
        names_by_id[name_id] = name
        if row_index > 0 and row_index % 100_000 == 0:
            print(f'{row_index}: {len(cluster.unions)}')
def compute(self, text):
    """Return the simhash of ``text`` from 4-token shingles.

    Tokens come from splitting ``text`` on runs of non-word characters; each
    window of four consecutive tokens is joined with single spaces and
    hashed with ``simhash.unsigned_hash``.

    NOTE(review): phrases are hashed as ``str`` without encoding -- confirm
    the installed simhash accepts that.
    """
    pieces = re.split(r'\W+', text, flags=re.UNICODE)
    joined = map(' '.join, simhash.shingle(pieces, 4))
    return simhash.compute(map(simhash.unsigned_hash, joined))
def compute(self, text):
    """Return the simhash of ``text`` from 4-character shingles.

    ``text`` is lowercased and split on runs of non-word characters; the
    pieces are concatenated without a separator and shingled as one string.
    Each shingle is hashed from its UTF-8 bytes.
    """
    lowered = text.lower()
    compact = ''.join(re.split(r'\W+', lowered, flags=re.UNICODE))
    grams = (''.join(chars) for chars in simhash.shingle(compact, 4))
    digests = [simhash.unsigned_hash(gram.encode('utf8')) for gram in grams]
    return simhash.compute(digests)
def test_negative_window_size(self):
    """Consuming a shingle iterator with window -1 raises ValueError."""
    tokens = list(range(10))
    # The lambda keeps both the call and the consumption inside the check.
    self.assertRaises(ValueError, lambda: list(simhash.shingle(tokens, -1)))
def test_fewer_than_window(self):
    """Three tokens with a window of 4 produce no shingles."""
    tokens = list(range(3))
    produced = list(simhash.shingle(tokens, 4))
    self.assertEqual([], produced)
def __shingle(token, shingle_size):
    """Return a generator of space-joined shingles over ``token``.

    ``token`` is passed to ``simhash.shingle`` with ``shingle_size`` as the
    window; each resulting window is joined with single spaces.
    """
    windows = simhash.shingle(token, shingle_size)
    return (' '.join(window) for window in windows)
def test_zero_window_size(self):
    """Consuming a shingle iterator with window 0 raises ValueError."""
    tokens = range(10)
    # The lambda keeps both the call and the consumption inside the check.
    self.assertRaises(ValueError, lambda: list(simhash.shingle(tokens, 0)))
def test_basic(self):
    """Shingling 0..9 with window 4 yields every consecutive 4-item slice."""
    tokens = list(range(10))
    width = 4
    # Slices [0:4] through [6:10], i.e. [0, 1, 2, 3] ... [6, 7, 8, 9].
    expected = [
        tokens[start:start + width]
        for start in range(len(tokens) - width + 1)
    ]
    self.assertEqual(expected, list(simhash.shingle(tokens, width)))