Python MinHashの例、datasketch.minhash.MinHash Pythonの例

コード例 #1

0

ファイルを表示

ファイル: minhash_benchmark.py プロジェクト: GitManager/datasketch

def run_perf(card, num_perm):
    """Time how long it takes to digest ``card`` hashes into one MinHash.

    Args:
        card: number of integers (0 .. card - 1) to hash and digest.
        num_perm: value passed to ``MinHash(num_perm=...)``.

    Returns:
        Elapsed time in seconds (float) spent in the digest loop.
    """
    # time.clock() was removed in Python 3.8; prefer perf_counter() and only
    # fall back to clock() on interpreters that lack it.
    timer = getattr(time, "perf_counter", None) or time.clock
    m = MinHash(num_perm=num_perm)
    logging.info("MinHash using %d permutation functions", num_perm)
    start = timer()
    for i in range(card):
        m.digest(sha1(int_bytes(i)))
    duration = timer() - start
    logging.info("Digested %d hashes in %.4f sec", card, duration)
    return duration

コード例 #2

0

ファイルを表示

ファイル: minhash_benchmark.py プロジェクト: GitManager/datasketch

def _run_acc(size, seed, num_perm):
    """Build a MinHash and the matching exact set from seeded random values.

    Draws ``size`` random integers in ``[1, size]`` (after seeding ``random``
    with ``seed``), converts each with ``int_bytes``, digests its SHA1 into
    the MinHash and records the value in a set.

    Returns:
        A ``(minhash, set)`` tuple.
    """
    sketch = MinHash(num_perm=num_perm)
    exact = set()
    random.seed(seed)
    for _ in range(size):
        value = int_bytes(random.randint(1, size))
        sketch.digest(sha1(value))
        exact.add(value)
    return (sketch, exact)

コード例 #3

0

ファイルを表示

ファイル: wd_minhash.py プロジェクト: isendel/wd_matcher

def dict_to_minhash(v):
    """
    Build a MinHash from the tokens that my_tokenizer produces for v
    :param v: dictionary
    :return: minhash with the SHA1 of every UTF-8 encoded token digested
    """
    sketch = MinHash()
    for token in my_tokenizer(v):
        encoded = token.encode('utf8')
        sketch.digest(sha1(encoded))
    return sketch

コード例 #4

0

ファイルを表示

ファイル: similarity.py プロジェクト: Onager/timesketch

def minhash_from_text(text, num_perm, delimiters):
    """Compute a MinHash over the shingles of a text.

    Args:
        text: string to calculate the minhash of.
        num_perm: passed as the first positional argument to MinHash
            (number of permutation functions).
        delimiters: list of strings used as delimiters for splitting text
            into words; forwarded to _shingles_from_text.

    Returns:
        A minhash (instance of datasketch.minhash.MinHash)
    """
    sketch = MinHash(num_perm)
    shingles = _shingles_from_text(text, delimiters)
    for shingle in shingles:
        encoded = shingle.encode('utf8')
        sketch.update(encoded)
    return sketch

コード例 #5

0

ファイルを表示

ファイル: lsh_examples.py プロジェクト: shuimu/datasketch

def eg1():
    """Index the sketches of data2 and data3 in an LSH and query with data1."""
    m1, m2, m3 = MinHash(), MinHash(), MinHash()
    for sketch, words in ((m1, data1), (m2, data2), (m3, data3)):
        for word in words:
            sketch.digest(sha1(word.encode('utf8')))

    # Create LSH index
    index = LSH(threshold=0.5)
    for key, sketch in (("m2", m2), ("m3", m3)):
        index.insert(key, sketch)
    neighbours = index.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", neighbours)

コード例 #6

0

ファイルを表示

ファイル: similarity_benchmark.py プロジェクト: GitManager/datasketch

def _run_minhash(A, B, data, seed, num_perm, b):
    """Sketch two index ranges of ``data`` and estimate their Jaccard similarity.

    ``A`` and ``B`` are ``(start, end)`` pairs selecting slices of ``data``.
    Every selected item is hashed with murmur3_32 (seeded with ``seed``) and
    digested into its own MinHash.

    Returns:
        ``[m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]``
    """
    a_range, b_range = A, B
    a_start, a_end = a_range
    b_start, b_end = b_range
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    for sketch, lo, hi in ((m1, a_start, a_end), (m2, b_start, b_end)):
        for idx in xrange(lo, hi):
            sketch.digest(Hash(hasher(data[idx], seed=seed)))
    full_estimate = m1.jaccard(m2)
    b_bit_estimate = _b_bit_minhash_jaccard(m1, m2, b)
    return [full_estimate, b_bit_estimate]

コード例 #7

0

ファイルを表示

ファイル: lshforest_test.py プロジェクト: LiuFang816/SALSTM_py_data

 def test_pickle(self):
     """A forest restored from a pickle must answer queries like the original."""
     forest = MinHashLSHForest()
     m1 = MinHash()
     m1.update("a".encode("utf8"))
     m2 = MinHash()
     m2.update("b".encode("utf8"))
     forest.add("a", m1)
     forest.add("b", m2)
     forest.index()
     forest2 = pickle.loads(pickle.dumps(forest))
     # Query the round-tripped copy: querying `forest` (as the test used to)
     # would pass even if pickling lost the index.
     result = forest2.query(m1, 1)
     self.assertTrue("a" in result)
     result = forest2.query(m2, 1)
     self.assertTrue("b" in result)

コード例 #8

0

ファイルを表示

ファイル: minhash_examples.py プロジェクト: xsongx/datasketch

def eg1():
    """Print the estimated and the exact Jaccard similarity of data1 and data2."""
    m1 = MinHash()
    m2 = MinHash()
    for sketch, words in ((m1, data1), (m2, data2)):
        for word in words:
            sketch.digest(sha1(word.encode('utf8')))
    print("Estimated Jaccard for data1 and data2 is", jaccard([m1, m2]))

    # Exact value computed from the plain sets, for comparison.
    set1 = set(data1)
    set2 = set(data2)
    shared = len(set1.intersection(set2))
    combined = len(set1.union(set2))
    actual_jaccard = float(shared) / float(combined)
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)

コード例 #9

0

ファイルを表示

ファイル: lsh_test.py プロジェクト: zjiaksmc/datasketch

 async def test_get_counts_mongo(self):
     """get_counts() returns one table per band, each counting both inserted keys."""
     index = AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                             threshold=0.5,
                             num_perm=16)
     async with index as lsh:
         entries = []
         for key in ("a", "b"):
             sketch = MinHash(16)
             sketch.update(key.encode("utf8"))
             entries.append((key, sketch))
         for key, sketch in entries:
             await lsh.insert(key, sketch)
         counts = await lsh.get_counts()
         self.assertEqual(len(counts), lsh.b)
         for table in counts:
             total = sum(table.values())
             self.assertEqual(total, 2)

コード例 #10

0

ファイルを表示

ファイル: test_lsh_mongo.py プロジェクト: sangyongjia/datasketch

    async def test_pickle_mongo(self):
        """An async LSH restored from a pickle still finds the inserted keys."""
        index = AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                threshold=0.5, num_perm=16)
        async with index as lsh:
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            for key, sketch in (("a", m1), ("b", m2)):
                await lsh.insert(key, sketch)
            pickled = pickle.dumps(lsh)

        async with pickle.loads(pickled) as lsh2:
            for key, sketch in (("a", m1), ("b", m2)):
                result = await lsh2.query(sketch)
                self.assertTrue(key in result)
            await lsh2.close()

コード例 #11

0

ファイルを表示

    def test_query(self):
        """Querying the prepared forest returns "a", "b" and "c".

        A MinHash created with 18 permutations is expected to make
        ``forest.query`` raise ValueError.
        """
        letters = ("a", "b", "c")
        m1 = MinHash()
        for letter in letters:
            m1.update(letter.encode("utf8"))
        forest = self._setup()
        result = forest.query(m1, 3)
        for letter in letters:
            self.assertTrue(letter in result)

        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            forest.query(m3, 1)

コード例 #12

0

ファイルを表示

ファイル: phishy_domains.py プロジェクト: google/timesketch

    def _get_minhash_from_domain(domain):
        """Build a MinHash from the characters of a domain name.

        Everything after the last dot (the TLD extension) is dropped and
        each remaining character, dots included, is fed to the MinHash.

        NOTE(review): no "www." prefix is removed inside this function;
        presumably callers strip it beforehand - confirm.

        Args:
          domain: string with a full domain, eg. www.google.com

        Returns:
            A minhash (instance of datasketch.minhash.MinHash)
        """
        # Part of the domain left of the final dot ('' when there is no dot).
        stem, _, _ = domain.rpartition('.')

        sketch = MinHash(similarity.DEFAULT_PERMUTATIONS)
        for encoded in (char.encode('utf8') for char in stem):
            sketch.update(encoded)

        return sketch

コード例 #13

0

ファイルを表示

ファイル: phishy_domains.py プロジェクト: moscalej/timesketch

    def _get_minhash_from_domain(domain):
        """Create a MinHash out of the characters of a domain name.

        The text after the final dot (the TLD extension) is discarded and
        every character that is left, including dots, updates the MinHash.

        NOTE(review): this function itself does not remove a leading
        "www."; presumably that happens in the caller - confirm.

        Args:
          domain: string with a full domain, eg. www.google.com

        Returns:
            A minhash (instance of datasketch.minhash.MinHash)
        """
        # Keep only what precedes the last dot ('' if the domain has none).
        head, _sep, _tld = domain.rpartition('.')

        result = MinHash(similarity.DEFAULT_PERMUTATIONS)
        for symbol in head:
            result.update(symbol.encode('utf8'))

        return result

コード例 #14

0

ファイルを表示

    def test_remove(self):
        """Removing "a" drops it from keys and buckets; an unknown key raises."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        entries = []
        for key in ("a", "b"):
            sketch = MinHash(16)
            sketch.update(key.encode("utf8"))
            entries.append((key, sketch))
        for key, sketch in entries:
            lsh.insert(key, sketch)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for H in table:
                # No bucket may be left empty, and none may still hold "a".
                self.assertGreater(len(table[H]), 0)
                self.assertTrue("a" not in table[H])

        with self.assertRaises(ValueError):
            lsh.remove("c")

コード例 #15

0

ファイルを表示

ファイル: inclusion_benchmark.py プロジェクト: shuimu/datasketch

def _run_minhash(A, B, data, seed, p):
    """Sketch two index ranges of ``data`` and return their inclusion estimate.

    ``A`` and ``B`` are ``(start, end)`` pairs; each MinHash uses ``2**p``
    permutations and digests the murmur3_32 hash (seeded with ``seed``) of
    every selected item. The result is ``_minhash_inclusion(m1, m2)``.
    """
    a_range, b_range = A, B
    a_start, a_end = a_range
    b_start, b_end = b_range
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=2**p)
    m2 = MinHash(num_perm=2**p)
    for sketch, lo, hi in ((m1, a_start, a_end), (m2, b_start, b_end)):
        for idx in xrange(lo, hi):
            sketch.digest(Hash(hasher(data[idx], seed=seed)))
    return _minhash_inclusion(m1, m2)

コード例 #16

0

ファイルを表示

ファイル: minhash_examples.py プロジェクト: Amano-Ginji/datasketch

def eg1():
    """Print the MinHash estimate and the exact Jaccard of data1 and data2."""
    m1 = MinHash()
    m2 = MinHash()
    for sketch, words in ((m1, data1), (m2, data2)):
        for word in words:
            sketch.update(word.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

    # Exact value from the plain sets, for comparison with the estimate.
    set1 = set(data1)
    set2 = set(data2)
    shared = len(set1.intersection(set2))
    combined = len(set1.union(set2))
    actual_jaccard = float(shared) / float(combined)
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)

コード例 #17

0

ファイルを表示

 def __init__(self, malware, num_pcap, filename=None):
     """
     Initialise empty containers and optionally read a JSON dump.

     :param malware: stored unchanged on ``self.malware``
     :param num_pcap: stored unchanged on ``self.num_pcap``
     :param filename: reads file dumped with dump_data (default: None)
     """
     self.num_pcap = num_pcap
     self.malware = malware
     self.session_collection = {}
     self.malware_feature_list = list()
     if filename is not None:
         # The context manager closes the dump file even when parsing
         # fails; previously the handle was never closed.
         with open(filename, 'r') as dump_file:
             data = json.load(dump_file)
         for key in data:
             hash_val = np.asarray(data[key][5], dtype=np.uint64)
             # NOTE(review): the MinHash() stored here is replaced by the
             # raw array on the next line, so it is never used. Presumably
             # the array was meant to become the MinHash's hash values -
             # confirm before changing.
             data[key][_MINHASH_INDEX] = MinHash()
             data[key][_MINHASH_INDEX] = hash_val

コード例 #18

0

ファイルを表示

    def test_insert(self):
        """Inserted keys show up in every hash table and in the index itself.

        Inserting a MinHash built with 18 permutations into the 16-permutation
        index is expected to raise ValueError.
        """
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        entries = []
        for key in ("a", "b"):
            sketch = MinHash(16)
            sketch.update(key.encode("utf8"))
            entries.append((key, sketch))
        for key, sketch in entries:
            lsh.insert(key, sketch)

        for table in lsh.hashtables:
            self.assertTrue(len(table) >= 1)
            items = [item for H in table for item in table[H]]
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)
        for key in ("a", "b"):
            self.assertTrue(key in lsh)
        for i, H in enumerate(lsh.keys["a"]):
            bucket = lsh.hashtables[i][H]
            self.assertTrue("a" in bucket)

        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            lsh.insert("c", m3)

コード例 #19

0

ファイルを表示

ファイル: lsh_test.py プロジェクト: Amano-Ginji/datasketch

 def test_pickle(self):
     """An LSH index restored from a pickle must answer queries like the original."""
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.update("a".encode("utf8"))
     m2 = MinHash(16)
     m2.update("b".encode("utf8"))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     lsh2 = pickle.loads(pickle.dumps(lsh))
     # Query the round-tripped copy: querying `lsh` (as the test used to)
     # would pass even if pickling lost the index contents.
     result = lsh2.query(m1)
     self.assertTrue("a" in result)
     result = lsh2.query(m2)
     self.assertTrue("b" in result)

コード例 #20

0

ファイルを表示

ファイル: lsh_test.py プロジェクト: zjiaksmc/datasketch

    async def test_remove_mongo(self):
        """Removing "a" drops it from the async index; an unknown key raises."""
        index = AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                threshold=0.5,
                                num_perm=16)
        async with index as lsh:
            entries = []
            for key in ("a", "b"):
                sketch = MinHash(16)
                sketch.update(key.encode("utf8"))
                entries.append((key, sketch))
            for key, sketch in entries:
                await lsh.insert(key, sketch)

            await lsh.remove("a")
            still_present = await lsh.has_key("a")
            self.assertTrue(not still_present)
            for table in lsh.hashtables:
                for H in await table.keys():
                    # Each bucket is fetched twice, once per assertion.
                    bucket = await table.get(H)
                    self.assertGreater(len(bucket), 0)
                    bucket_again = await table.get(H)
                    self.assertTrue("a" not in bucket_again)

            with self.assertRaises(ValueError):
                await lsh.remove("c")

コード例 #21

0

ファイルを表示

    def test_query_redis(self):
        """Queries against an LSH configured for redis storage find the inserted keys.

        ``redis.Redis`` is patched with ``fake_redis`` for the duration of
        the test. Querying with an 18-permutation MinHash is expected to
        raise ValueError.
        """
        redis_config = {
            'type': 'redis',
            'redis': {
                'host': 'localhost',
                'port': 6379
            }
        }
        with patch('redis.Redis', fake_redis) as mock_redis:
            lsh = MinHashLSH(threshold=0.5,
                             num_perm=16,
                             storage_config=redis_config)
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            for key, sketch in (("a", m1), ("b", m2)):
                lsh.insert(key, sketch)
            for key, sketch in (("a", m1), ("b", m2)):
                result = lsh.query(sketch)
                self.assertTrue(key in result)

            m3 = MinHash(18)
            with self.assertRaises(ValueError):
                lsh.query(m3)

コード例 #22

0

ファイルを表示

 def test_add_index(self):
     """Added keys land in every hash table; is_empty() is True until index() runs.

     Adding a MinHash built with 18 permutations is expected to raise
     ValueError.
     """
     forest = MinHashLSHForest()
     entries = []
     for key in ("a", "b"):
         sketch = MinHash()
         sketch.update(key.encode("utf8"))
         entries.append((key, sketch))
     for key, sketch in entries:
         forest.add(key, sketch)
     self.assertTrue(forest.is_empty())
     for table in forest.hashtables:
         self.assertTrue(len(table) >= 1)
         items = [item for H in table for item in table[H]]
         self.assertTrue("a" in items)
         self.assertTrue("b" in items)
     for key in ("a", "b"):
         self.assertTrue(key in forest)
     for i, H in enumerate(forest.keys["a"]):
         bucket = forest.hashtables[i][H]
         self.assertTrue("a" in bucket)
     m3 = MinHash(18)
     with self.assertRaises(ValueError):
         forest.add("c", m3)
     forest.index()
     self.assertFalse(forest.is_empty())

コード例 #23

0

ファイルを表示

 def test_insertion_session(self):
     """Keys inserted through insertion_session() end up in every hash table."""
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     data = []
     for key in ("a", "b"):
         sketch = MinHash(16)
         sketch.update(key.encode("utf8"))
         data.append((key, sketch))
     with lsh.insertion_session() as session:
         for key, minhash in data:
             session.insert(key, minhash)
     for table in lsh.hashtables:
         self.assertTrue(len(table) >= 1)
         items = [item for H in table for item in table[H]]
         self.assertTrue("a" in items)
         self.assertTrue("b" in items)
     for key in ("a", "b"):
         self.assertTrue(key in lsh)
     for i, H in enumerate(lsh.keys["a"]):
         bucket = lsh.hashtables[i][H]
         self.assertTrue("a" in bucket)

コード例 #24

0

ファイルを表示

ファイル: lsh_test.py プロジェクト: Amano-Ginji/datasketch

    def test_query(self):
        """Each inserted MinHash finds its own key when used as a query.

        Querying with an 18-permutation MinHash is expected to raise
        ValueError.
        """
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        entries = []
        for key in ("a", "b"):
            sketch = MinHash(16)
            sketch.update(key.encode("utf8"))
            entries.append((key, sketch))
        for key, sketch in entries:
            lsh.insert(key, sketch)
        for key, sketch in entries:
            result = lsh.query(sketch)
            self.assertTrue(key in result)

        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            lsh.query(m3)

コード例 #25

0

ファイルを表示

ファイル: test_lsh_mongo.py プロジェクト: sangyongjia/datasketch

    async def test_remove_session_mongo(self):
        """Keys removed via delete_session() vanish; all other keys stay indexed."""
        def chunk(it, size):
            # Iterator of tuples holding up to `size` items each; it stops
            # at the first empty tuple.
            source = iter(it)
            return iter(lambda: tuple(islice(source, size)), ())

        keys_to_remove = ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow', 'ppi', 'eer')
        random_letters = (random.choice(string.ascii_lowercase) for _ in range(10000))
        random_words = (''.join(s) for s in chunk(random_letters, 4))
        seq = frozenset(chain(random_words, keys_to_remove))
        objs = [MinHash(16) for _ in seq]
        for word, sketch in zip(seq, objs):
            for letter in word:
                sketch.update(letter.encode('utf-8'))

        data = list(zip(seq, objs))
        keys_left = seq - frozenset(keys_to_remove)

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=16) as lsh:
            async with lsh.insertion_session(batch_size=1000) as session:
                inserts = (session.insert(key, minhash, check_duplication=False)
                           for key, minhash in data)
                await asyncio.gather(*inserts)

            async with lsh.delete_session(batch_size=3) as session:
                removals = (session.remove(key) for key in keys_to_remove)
                await asyncio.gather(*removals)

            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                items = []
                for H in await table.keys():
                    items.extend(await table.get(H))
                for key in keys_to_remove:
                    self.assertTrue(key not in items, '{0} in items, but should not be'.format(key))
                for key in keys_left:
                    self.assertTrue(key in items, '{0} not in items, but should be'.format(key))

            for key in keys_to_remove:
                present = await lsh.has_key(key)
                self.assertTrue(not present, '<{0}> key should not be in LSH index'.format(key))
            for key in keys_left:
                present = await lsh.has_key(key)
                self.assertTrue(present, '<{0}> key should be in LSH index'.format(key))

コード例 #26

0

ファイルを表示

ファイル: lsh_test.py プロジェクト: Amano-Ginji/datasketch

    def test_remove(self):
        """After remove("a") the key is gone everywhere; removing "c" raises ValueError."""
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        sketches = []
        for key in ("a", "b"):
            minhash = MinHash(16)
            minhash.update(key.encode("utf8"))
            sketches.append((key, minhash))
        for key, minhash in sketches:
            lsh.insert(key, minhash)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for H in table:
                # Buckets must stay non-empty and must no longer list "a".
                self.assertGreater(len(table[H]), 0)
                self.assertTrue("a" not in table[H])

        with self.assertRaises(ValueError):
            lsh.remove("c")

コード例 #27

0

ファイルを表示

ファイル: lsh_test.py プロジェクト: zjiaksmc/datasketch

 async def test_insertion_session_redis(self):
     """Keys inserted through an async insertion session appear pickled in every table."""
     index = AsyncMinHashLSH(storage_config=self._storage_config_redis,
                             threshold=0.5,
                             num_perm=16)
     async with index as lsh:
         data = []
         for key in ("a", "b"):
             sketch = MinHash(16)
             sketch.update(key.encode("utf8"))
             data.append((key, sketch))
         async with lsh.insertion_session() as session:
             for key, minhash in data:
                 await session.insert(key, minhash)
         for table in lsh.hashtables:
             self.assertTrue(await table.size() >= 1)
             items = []
             for H in await table.keys():
                 items.extend(await table.get(H))
             # The assertions look for the pickled form of each key.
             self.assertTrue(pickle.dumps("a") in items)
             self.assertTrue(pickle.dumps("b") in items)
         for key in ("a", "b"):
             self.assertTrue(await lsh.has_key(key))
         band_hashes = await lsh.keys.get(pickle.dumps("a"))
         for i, H in enumerate(band_hashes):
             res = await lsh.hashtables[i].get(H)
             self.assertTrue(pickle.dumps("a") in res)

コード例 #28

0

ファイルを表示

    async def test_insertion_session_mongo(self):
        """Bulk-inserted keys are present in every hash table and in the index."""
        def chunk(it, size):
            # Iterator of tuples holding up to `size` items each; it stops
            # at the first empty tuple.
            source = iter(it)
            return iter(lambda: tuple(islice(source, size)), ())

        random_letters = (random.choice(string.ascii_lowercase)
                          for _ in range(10000))
        random_words = (''.join(s) for s in chunk(random_letters, 4))
        fixed_words = ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd',
                       'yow', 'ppi', 'eer')
        seq = frozenset(chain(random_words, fixed_words))
        objs = [MinHash(16) for _ in seq]
        for word, sketch in zip(seq, objs):
            for letter in word:
                sketch.update(letter.encode('utf-8'))

        data = list(zip(seq, objs))

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            async with lsh.insertion_session(batch_size=1000) as session:
                inserts = (session.insert(key, minhash, check_duplication=False)
                           for key, minhash in data)
                await asyncio.gather(*inserts)

            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                items = []
                for H in await table.keys():
                    items.extend(await table.get(H))
                for key in ('aahhb', 'kld'):
                    self.assertTrue(key in items)
            for key in ('aahhb', 'kld'):
                self.assertTrue(await lsh.has_key(key))
            band_hashes = await lsh.keys.get('aahh')
            for i, H in enumerate(band_hashes):
                self.assertTrue('aahh' in await lsh.hashtables[i].get(H))

コード例 #29

0

ファイルを表示

ファイル: lsh_test.py プロジェクト: cytora/datasketch

    def test_insert(self):
        """Both inserted keys appear in every hash table.

        Inserting an 18-permutation MinHash into the 16-permutation index
        is expected to raise ValueError.
        """
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        entries = []
        for key in ("a", "b"):
            sketch = MinHash(16)
            sketch.update(key.encode("utf8"))
            entries.append((key, sketch))
        for key, sketch in entries:
            lsh.insert(key, sketch)
        for table in lsh.hashtables:
            self.assertTrue(len(table) >= 1)
            items = [item for H in table for item in table[H]]
            for key in ("a", "b"):
                self.assertTrue(key in items)

        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            lsh.insert("c", m3)

コード例 #30

0

ファイルを表示

    async def test_remove_mongo(self):
        """Removing "a" must not disturb "a1", which was built from the same input.

        Removing the unknown key "c" is expected to raise ValueError.
        """
        index = AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                threshold=0.5,
                                num_perm=16)
        async with index as lsh:
            entries = []
            for key, content in (("a", "a"), ("b", "b"), ("a1", "a")):
                sketch = MinHash(16)
                sketch.update(content.encode("utf8"))
                entries.append((key, sketch))
            for key, sketch in entries:
                await lsh.insert(key, sketch)

            await lsh.remove("a")
            self.assertTrue(not await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key('a1'))
            # Becomes True once some bucket is seen to still contain 'a1'.
            hashtable_correct = False
            for table in lsh.hashtables:
                for H in await table.keys():
                    bucket = await table.get(H)
                    self.assertGreater(len(bucket), 0)
                    self.assertTrue("a" not in bucket)
                    if 'a1' in bucket:
                        hashtable_correct = True
            self.assertTrue(hashtable_correct, 'Hashtable broken')

            with self.assertRaises(ValueError):
                await lsh.remove("c")

コード例 #31

0

ファイルを表示

    async def test_query_mongo(self):
        """Concurrent inserts (key "b" twice) still yield correct query results.

        Querying with an 18-permutation MinHash is expected to raise
        ValueError.
        """
        index = AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                threshold=0.5,
                                num_perm=16)
        async with index as lsh:
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            m3 = MinHash(16)
            m3.update("b".encode("utf8"))
            pending = tuple(
                lsh.insert(key, sketch, check_duplication=False)
                for key, sketch in (("a", m1), ("b", m2), ("b", m3)))
            await asyncio.gather(*pending)
            for key, sketch in (("a", m1), ("b", m2)):
                result = await lsh.query(sketch)
                self.assertTrue(key in result)

            m3 = MinHash(18)
            with self.assertRaises(ValueError):
                await lsh.query(m3)

コード例 #32

0

ファイルを表示

    result = lsh.query(m1)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1", result)

# Run both examples only when the file is executed as a script.
if __name__ == "__main__":
    eg1()
    eg2()

>>> 
>>> from datasketch import MinHash

# Two word lists that differ in two entries
# ('probabilistic' / 'probability' and 'datasets' / 'documents').
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents']

# Feed every UTF-8 encoded word of each list into its own MinHash.
m1, m2 = MinHash(), MinHash()
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

# Exact Jaccard similarity (|intersection| / |union|) for comparison.
s1 = set(data1)
s2 = set(data2)
actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))
print("Actual Jaccard for data1 and data2 is", actual_jaccard)
>>> 
>>> m = MinHash(num_perm=256)
>>> m.count()
0.0
>>> from sparselsh import LSH

コード例 #33

0

ファイルを表示

from datasketch.lsh import MinHashLSH
from preprocess import tokenize_sentence
"""
To find similar questions in O(1) we use MinHash together with Jaccard similarity.
Questions with similar MinHash values are candidates for being similar.
To check whether two candidate sentences are similar we compute their Jaccard similarity.
"""

# Load the question pairs and create the LSH index they will be stored in.
df = pd.read_csv("proccessed.csv")
total_questions = df.shape[0]
# Jaccard threshold handed to MinHashLSH.
threshold_jacard = 0.30
lsh = MinHashLSH(threshold=threshold_jacard)

# Compute a MinHash for each sentence in column question1 and insert it
# into the index, keyed by the row index converted to a string.
for index, row in df.iterrows():
    min_Hash = MinHash()
    question = tokenize_sentence(str(row['question1']))
    for word in question:
        min_Hash.update(word.encode('utf8'))
    lsh.insert(str(index), min_Hash)

# Counters, all starting at zero.
total = 0
return_result = 0
correct = 0
total_correct = 0
# For each sentence in column question2, build its MinHash in order to find
# similar questions.
for i in range(0, total_questions):
    question_minHash = MinHash()
    question = tokenize_sentence(str(df['question2'][i]))
    for word in question:
        question_minHash.update(word.encode('utf8'))

コード例 #34

0

ファイルを表示

ファイル: cardinality_benchmark.py プロジェクト: zjiaksmc/datasketch

def _run_minhash(data, seed, p):
    """Estimate the cardinality of ``data`` with a MinHash of ``2**p`` permutations.

    Each item is hashed with murmur3_32 (seeded with ``seed``) and the result
    is passed to ``update``; the MinHash is built with ``hashobj=Hash``.

    Returns:
        The value of ``m.count()``.
    """
    hasher = pyhash.murmur3_32()
    sketch = MinHash(num_perm=2**p, hashobj=Hash)
    for item in data:
        hashed = hasher(item, seed=seed)
        sketch.update(hashed)
    return sketch.count()

コード例 #35

0

ファイルを表示

ファイル: minhash.py プロジェクト: Avitalg/ML---Final-Project

" they were al anr oo the bark of the bark of the boos of the boos of the boos of the boos",
" afd the nererland thet thet whre hev back on the barkn of the bors.",
" and they were al anr oo the bark of the bark of the boos of the boos of the boos of the boos afd the nererland thet",
" thet whre hev back on the barkn of the bors. and they were al anr oo the bark of the bark of",
" the boos of the boos of the boos of the boos afd the nererland thet thet whre hev back on the ",
" barkn of the bors. and they were al anr oo the bark of the bark of the boos of the boos of the boos of the",
" boos afd the nererland thet thet whre hev back on the barkn of the bors. and they were al anr oo the bark of the bark ",
" of the boos of the boos of the boos of the boos afd the nererland thet thet whre hev back on the barkn of the bors."
]

# Best exact Jaccard similarity found so far for each entry of seq.
array = [0] * len(seq)
for i in range(len(dataX)):
    # Tokenise and sketch the dataX entry once per outer iteration rather
    # than once per (dataX, seq) pair.
    data1 = dataX[i].split()
    s1 = set(data1)
    m1 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for j in range(len(seq)):
        data2 = seq[j].split()
        # NOTE(review): m1 and m2 are built but never read in this script;
        # only the exact Jaccard below is used. Kept in case later code
        # relies on them - confirm before removing.
        m2 = MinHash()
        for d in data2:
            m2.update(d.encode('utf8'))
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2))) /\
            float(len(s1.union(s2)))
        if array[j] < actual_jaccard:
            array[j] = actual_jaccard
            # print() with one argument produces the same output on
            # Python 2 and 3; the print statement is a syntax error on 3.
            print(array[j])


print(array)

コード例 #36

0

ファイルを表示

def eg1():
    """Index the sketches of data2 and data3 in a MinHashLSH and query with data1."""
    m1, m2, m3 = MinHash(), MinHash(), MinHash()
    for sketch, words in ((m1, data1), (m2, data2), (m3, data3)):
        for word in words:
            sketch.update(word.encode('utf8'))

    # Create LSH index
    index = MinHashLSH(threshold=0.5)
    for key, sketch in (("m2", m2), ("m3", m3)):
        index.insert(key, sketch)
    neighbours = index.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", neighbours)

コード例 #37

0

ファイルを表示

def mh(dd):
    """Return a MinHash updated with the UTF-8 encoding of every item in ``dd``."""
    sketch = MinHash()
    for item in dd:
        encoded = item.encode('utf8')
        sketch.update(encoded)
    return sketch

コード例 #38

0

ファイルを表示

ファイル: cardinality_benchmark.py プロジェクト: GitManager/datasketch

def _run_minhash(data, seed, p):
    """Estimate the cardinality of ``data`` with a MinHash of ``2**p`` permutations.

    Each item is hashed with murmur3_32 (seeded with ``seed``), wrapped in
    ``Hash`` and digested.

    Returns:
        The value of ``m.count()``.
    """
    hasher = pyhash.murmur3_32()
    sketch = MinHash(num_perm=2**p)
    for item in data:
        hashed = hasher(item, seed=seed)
        sketch.digest(Hash(hashed))
    return sketch.count()

コード例 #39

0

ファイルを表示

ファイル: lsh_my_example.py プロジェクト: GregoryDS/DataScienceProject

 def min_hash_text(self, sm_text):
     """Return a MinHash updated with the UTF-8 encoding of every item in ``sm_text``."""
     sketch = MinHash()
     for piece in sm_text:
         encoded = piece.encode('utf8')
         sketch.update(encoded)
     return sketch