Esempio n. 1
0
def owner_cluster(con, cur, nitem=None, reverse=True, nshingle=2, store=False, **kwargs):
    """Cluster owner names and store or return the pairs that were found.

    Rows ``(ownerid, name)`` are read from the ``owner`` table through
    ``cur``. Each name is added to an ``sh.Cluster`` with two groups of
    features: the shingles from ``sh.shingle(name, nshingle)`` followed by
    the whitespace-separated words. Inside each group the weights fall
    linearly from 1.0 to 0.0.

    Args:
        con: Database connection; only used to commit when ``store`` is set.
        cur: Cursor used for the select and, when storing, the inserts.
        nitem: Row limit for the select. Any falsy value means "no limit".
        reverse: When truthy, rows are read in descending ``rowid`` order.
        nshingle: Window size passed to ``sh.shingle``.
        store: When truthy, the ``pair`` table is dropped, recreated and
            filled, and the function returns ``None``.
        **kwargs: Forwarded unchanged to ``sh.Cluster``.

    Returns:
        ``(ipairs, npairs)`` when ``store`` is falsy, where ``ipairs`` is the
        cluster's ``unions`` and ``npairs`` holds the matching name pairs;
        otherwise ``None``.
    """
    def ramp(count):
        # Linearly decreasing weights, 1.0 first and 0.0 last.
        return list(np.linspace(1.0, 0.0, count))

    cluster = sh.Cluster(**kwargs)

    order_clause = ' order by rowid desc' if reverse else ''
    limit_clause = (' limit %i' % nitem) if nitem else ''
    query = 'select ownerid,name from owner' + order_clause + limit_clause

    names_by_id = {}
    for position, row in enumerate(cur.execute(query)):
        ownerid, name = row
        tokens = name.split()
        grams = list(sh.shingle(name, nshingle))

        cluster.add(grams + tokens,
                    weights=ramp(len(grams)) + ramp(len(tokens)),
                    label=ownerid)
        names_by_id[ownerid] = name

        # Progress indicator for long runs.
        if position % 10000 == 0:
            print(position)

    ipairs = cluster.unions
    npairs = [(names_by_id[left], names_by_id[right]) for left, right in ipairs]
    print('Found %i pairs' % len(ipairs))

    if not store:
        return (ipairs, npairs)

    # Replace any previous result table with the freshly computed pairs.
    cur.execute('drop table if exists pair')
    cur.execute('create table pair (ownerid1 int, ownerid2 int, name1 text, name2 text)')
    rows = [(id1, id2, name1, name2)
            for (id1, id2), (name1, name2) in zip(ipairs, npairs)]
    cur.executemany('insert into pair values (?,?,?,?)', rows)
    con.commit()
Esempio n. 2
0
def compute_simhash(text):
    """Return the simhash of ``text`` built from 4-character shingles.

    The text is lower-cased and split on runs of non-word characters; the
    pieces are concatenated without separators, so the shingles slide over
    one continuous string. Each shingle is UTF-8 encoded before hashing.
    """
    squashed = ''.join(re.split(r'\W+', text.lower(), flags=re.UNICODE))

    hashes = []
    for window in simhash.shingle(squashed, 4):
        gram = ''.join(window)
        hashes.append(simhash.unsigned_hash(gram.encode('utf8')))

    return simhash.compute(hashes)
Esempio n. 3
0
def tokens_to_fingerprint(tokens):
    """Return the simhash fingerprint of ``tokens`` using 8-character shingles.

    The tokens are concatenated without separators and shingled as one
    string. Each shingle is UTF-8 encoded with ``'ignore'`` error handling
    before being hashed.
    """
    joined = ''.join(tokens)

    hashes = []
    for window in simhash.shingle(joined, 8):
        encoded = ''.join(window).encode('utf8', 'ignore')
        hashes.append(simhash.unsigned_hash(encoded))

    return simhash.compute(hashes)
Esempio n. 4
0
 def compute(self, data, token_size):
     """Return the simhash of ``data`` from shingles of ``token_size`` items.

     ``data`` is lower-cased and reduced to the concatenation of everything
     ``self.filter.findall`` returns (presumably a compiled regex; confirm
     where ``self.filter`` is assigned). The shingles are sorted before
     each one is UTF-8 encoded and hashed.
     """
     lowered = data.lower()
     kept = ''.join(self.filter.findall(lowered))
     ordered = sorted(''.join(window) for window in shingle(kept, token_size))
     # ``compute`` here is the module-level function, not this method.
     return compute([unsigned_hash(piece.encode("utf-8")) for piece in ordered])
Esempio n. 5
0
def hash_text(t):
    """Compute a simhash for one text record and return it with the record.

    Args:
        t: A 4-tuple ``(text, domain, year, season)``.

    Returns:
        ``(h, text, domain, year, season)`` where ``h`` is the result of
        ``simhash.compute`` over the hashes of the 4-token shingles of
        ``text``.
    """
    text, domain, year, season = t
    tokens = nltk.tokenize.word_tokenize(text)

    # https://github.com/seomoz/simhash-py/issues/47
    # A generator for ' '-joined strings of consecutive tokens
    shingles = (' '.join(shingle) for shingle in simhash.shingle(tokens, 4))

    # The hashes need to be unsigned 64-bit ints. c_uint64 is 64 bits wide on
    # every platform, whereas c_ulong is only 32 bits on Windows (LLP64) and
    # would silently truncate the hash there. On LP64 platforms the two give
    # identical values.
    # NOTE(review): built-in hash() of a str is salted per interpreter process
    # unless PYTHONHASHSEED is fixed, so these fingerprints are only
    # comparable within a single process - confirm this is acceptable.
    h = simhash.compute(
        [ctypes.c_uint64(hash(shingle)).value for shingle in shingles])

    return (h, text, domain, year, season)
Esempio n. 6
0
def compute(text):
    """Return the simhash of ``text`` from shingles of four consecutive words.

    The text is split on whitespace; every window of four words is joined
    with single spaces and hashed with ``simhash.unsigned_hash``.
    """
    words = text.split()
    windows = simhash.shingle(words, 4)
    joined = (' '.join(window) for window in windows)
    return simhash.compute(map(simhash.unsigned_hash, joined))
Esempio n. 7
0
 def test_basic(self):
     # Ten tokens with a window of four give seven consecutive windows,
     # starting at offsets 0 through 6.
     tokens = list(range(10))
     expected = [list(range(start, start + 4)) for start in range(7)]
     actual = list(simhash.shingle(tokens, 4))
     self.assertEqual(expected, actual)
Esempio n. 8
0
def compute(text):
    """Return the simhash of ``text`` from shingles of four consecutive words.

    Every window of four words is joined with single spaces and hashed with
    ``simhash.unsigned_hash``; the hashes are combined by ``simhash.compute``.
    """
    # split() with no argument splits on runs of whitespace and yields the
    # list of word tokens.
    words = text.split()
    windows = simhash.shingle(words, 4)
    joined = (' '.join(window) for window in windows)
    return simhash.compute(map(simhash.unsigned_hash, joined))
Esempio n. 9
0
def owner_cluster(con,
                  cur,
                  nitem=None,
                  reverse=True,
                  nshingle=2,
                  store=True,
                  **kwargs):
    """Cluster owner names and store (default) or return the pairs found.

    Rows ``(ownerid, name)`` are selected from the ``owner`` table via
    ``cur``. Each name goes into an ``sh.Cluster`` with its shingles
    (``sh.shingle(name, nshingle)``) followed by its whitespace-separated
    words as features. Weights decrease linearly from 1.0 to 0.0 within
    each of the two groups.

    Args:
        con: Database connection; committed after the pairs are stored.
        cur: Cursor used for the select and for writing the ``pair`` table.
        nitem: Row limit for the select. Any falsy value means "no limit".
        reverse: When truthy, rows are read in descending ``rowid`` order.
        nshingle: Window size passed to ``sh.shingle``.
        store: When truthy, the ``pair`` table is dropped, recreated and
            filled, and ``None`` is returned.
        **kwargs: Forwarded unchanged to ``sh.Cluster``.

    Returns:
        ``(ipairs, npairs)`` when ``store`` is falsy; otherwise ``None``.
    """
    print('generating hashes and pairs')

    cluster = sh.Cluster(**kwargs)

    query = 'select ownerid,name from owner'
    query += ' order by rowid desc' if reverse else ''
    query += (' limit %i' % nitem) if nitem else ''

    names_by_id = {}
    for position, row in enumerate(cur.execute(query)):
        ownerid, name = row
        tokens = name.split()
        grams = list(sh.shingle(name, nshingle))

        gram_weights = list(np.linspace(1.0, 0.0, len(grams)))
        token_weights = list(np.linspace(1.0, 0.0, len(tokens)))

        cluster.add(grams + tokens,
                    weights=gram_weights + token_weights,
                    label=ownerid)
        names_by_id[ownerid] = name

        # Progress indicator for long runs.
        if position % 10000 == 0:
            print(position)

    ipairs = cluster.unions
    npairs = [(names_by_id[left], names_by_id[right])
              for left, right in ipairs]
    print('Found %i pairs' % len(ipairs))

    if not store:
        return (ipairs, npairs)

    # Replace any previous result table with the freshly computed pairs.
    cur.execute('drop table if exists pair')
    cur.execute(
        'create table pair (ownerid1 int, ownerid2 int, name1 text, name2 text)'
    )
    rows = [(id1, id2, name1, name2)
            for (id1, id2), (name1, name2) in zip(ipairs, npairs)]
    cur.executemany('insert into pair values (?,?,?,?)', rows)
    con.commit()
Esempio n. 10
0
def filter_pairs(con, nshingle=2, k=8, thresh=4):
    """Feed every row of the ``name`` table into an ``sh.Cluster``.

    Each name contributes its shingles (``sh.shingle(name, nshingle)``)
    followed by its whitespace-separated words as features, with weights
    decreasing linearly from 1.0 to 0.0 inside each group.

    Args:
        con: Database connection handed to ``pd.read_sql``.
        nshingle: Window size passed to ``sh.shingle``.
        k: Forwarded to ``sh.Cluster``.
        thresh: Forwarded to ``sh.Cluster``.

    NOTE(review): as shown, the function ends after the loop and returns
    ``None``; the cluster and ``name_dict`` are not used afterwards here.
    """
    print('filtering pairs')

    cluster = sh.Cluster(k=k, thresh=thresh)
    name_dict = {}

    frame = pd.read_sql('select id,name from name', con)
    # itertuples() yields (index, id, name) for the two selected columns.
    for row_index, name_id, name in frame.itertuples():
        tokens = name.split()
        grams = list(sh.shingle(name, nshingle))

        gram_weights = list(np.linspace(1.0, 0.0, len(grams)))
        token_weights = list(np.linspace(1.0, 0.0, len(tokens)))

        cluster.add(grams + tokens,
                    weights=gram_weights + token_weights,
                    label=name_id)
        name_dict[name_id] = name

        # Periodic progress line: row index and number of unions so far.
        if row_index > 0 and row_index % 100_000 == 0:
            print(f'{row_index}: {len(cluster.unions)}')
Esempio n. 11
0
 def compute(self, text):
     """Return the simhash of ``text`` from shingles of four consecutive tokens.

     Tokens come from splitting ``text`` on runs of non-word characters;
     each window of four tokens is joined with single spaces and hashed
     with ``simhash.unsigned_hash``.
     """
     words = re.split(r'\W+', text, flags=re.UNICODE)
     windows = simhash.shingle(words, 4)
     joined = (' '.join(window) for window in windows)
     return simhash.compute(map(simhash.unsigned_hash, joined))
Esempio n. 12
0
 def compute(self, text):
     """Return the simhash of ``text`` built from 4-character shingles.

     The text is lower-cased and split on runs of non-word characters; the
     pieces are concatenated without separators before shingling. Each
     shingle is UTF-8 encoded before hashing.
     """
     squashed = ''.join(re.split(r'\W+', text.lower(), flags=re.UNICODE))

     hashes = []
     for window in simhash.shingle(squashed, 4):
         gram = ''.join(window)
         hashes.append(simhash.unsigned_hash(gram.encode('utf8')))

     return simhash.compute(hashes)
Esempio n. 13
0
 def test_negative_window_size(self):
     # Consuming shingles requested with a negative window must raise
     # ValueError.
     items = list(range(10))
     with self.assertRaises(ValueError):
         for _ in simhash.shingle(items, -1):
             pass
Esempio n. 14
0
 def test_fewer_than_window(self):
     # Three tokens cannot fill a window of four, so no shingles come out.
     short_input = list(range(3))
     produced = list(simhash.shingle(short_input, 4))
     self.assertEqual([], produced)
Esempio n. 15
0
 def __shingle(token, shingle_size):
     """Lazily yield the space-joined shingles of ``token``.

     ``token`` is passed straight to ``simhash.shingle`` with window size
     ``shingle_size``; each resulting window is joined with single spaces.
     """
     windows = simhash.shingle(token, shingle_size)
     return (' '.join(window) for window in windows)
Esempio n. 16
0
 def test_zero_window_size(self):
     # Consuming shingles requested with a window of zero must raise
     # ValueError.
     items = range(10)
     with self.assertRaises(ValueError):
         for _ in simhash.shingle(items, 0):
             pass
Esempio n. 17
0
 def test_basic(self):
     # Ten tokens with a window of four give seven consecutive windows,
     # one for each start offset 0..6.
     tokens = list(range(10))
     expected = [tokens[start:start + 4] for start in range(7)]
     actual = list(simhash.shingle(tokens, 4))
     self.assertEqual(expected, actual)
Esempio n. 18
0
 def test_negative_window_size(self):
     # Consuming shingles requested with a negative window must raise
     # ValueError.
     items = list(range(10))
     self.assertRaises(ValueError,
                       lambda: list(simhash.shingle(items, -1)))
Esempio n. 19
0
 def test_fewer_than_window(self):
     # Three tokens cannot fill a window of four, so the result is empty.
     produced = list(simhash.shingle(list(range(3)), 4))
     self.assertEqual([], produced)