Esempio n. 1
0
    async def test_insert_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=4) as lsh:
            mg = WeightedMinHashGenerator(10, 4)
            m1 = mg.minhash(np.random.uniform(1, 10, 10))
            m2 = mg.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)
            for t in lsh.hashtables:
                self.assertTrue(await t.size() >= 1)
                items = []
                for H in await t.keys():
                    items.extend(await t.get(H))
                self.assertTrue("a" in items)
                self.assertTrue("b" in items)
            self.assertTrue(await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key("b"))
            for i, H in enumerate(await lsh.keys.get("a")):
                self.assertTrue("a" in await lsh.hashtables[i].get(H))

            mg = WeightedMinHashGenerator(10, 5)
            m3 = mg.minhash(np.random.uniform(1, 10, 10))
            with self.assertRaises(ValueError):
                await lsh.insert("c", m3)
Esempio n. 2
0
def eg2():
    mg = WeightedMinHashGenerator(10, 5)
    m1 = mg.minhash(v1)
    m2 = mg.minhash(v2)
    m3 = mg.minhash(v3)
    print("Estimated Jaccard m1, m2", m1.jaccard(m2))
    print("Estimated Jaccard m1, m3", m1.jaccard(m3))
    # Create LSH index
    lsh = WeightedMinHashLSH(threshold=0.1, sample_size=5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1", result)
Esempio n. 3
0
def eg2():
    mg = WeightedMinHashGenerator(10, 5)
    m1 = mg.minhash(v1)
    m2 = mg.minhash(v2)
    m3 = mg.minhash(v3)
    print("Estimated Jaccard m1, m2", m1.jaccard(m2))
    print("Estimated Jaccard m1, m3", m1.jaccard(m3))
    # Create LSH index
    lsh = WeightedMinHashLSH(threshold=0.1, sample_size=5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1", result)
Esempio n. 4
0
    def test_query(self):
        lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        result = lsh.query(m1)
        self.assertTrue("a" in result)
        result = lsh.query(m2)
        self.assertTrue("b" in result)

        mg = WeightedMinHashGenerator(10, 5)
        m3 = mg.minhash(np.random.uniform(1, 10, 10))
        self.assertRaises(ValueError, lsh.query, m3)
Esempio n. 5
0
    def test_query(self):
        lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        result = lsh.query(m1)
        self.assertTrue("a" in result)
        result = lsh.query(m2)
        self.assertTrue("b" in result)

        mg = WeightedMinHashGenerator(10, 5)
        m3 = mg.minhash(np.random.uniform(1, 10, 10))
        self.assertRaises(ValueError, lsh.query, m3)
Esempio n. 6
0
 def test_minhash(self):
     mg = WeightedMinHashGenerator(2, 4, 1)
     m = mg.minhash([1, 3])
     self.assertIsInstance(m, WeightedMinHash)
     self.assertEqual(len(m.hashvalues), 4)
     self.assertEqual(len(m), 4)
     self.assertTrue(m.hashvalues.dtype == np.int)
 def test_minhash(self):
     mg = WeightedMinHashGenerator(2, 4, 1)
     m = mg.minhash([1,3])
     self.assertIsInstance(m, WeightedMinHash)
     self.assertEqual(len(m.hashvalues), 4)
     self.assertEqual(len(m), 4)
     self.assertTrue(m.hashvalues.dtype == np.int)
Esempio n. 8
0
    async def test_query_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=4) as lsh:
            mg = WeightedMinHashGenerator(10, 4)
            m1 = mg.minhash(np.random.uniform(1, 10, 10))
            m2 = mg.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)
            result = await lsh.query(m1)
            self.assertTrue("a" in result)
            result = await lsh.query(m2)
            self.assertTrue("b" in result)

            mg = WeightedMinHashGenerator(10, 5)
            m3 = mg.minhash(np.random.uniform(1, 10, 10))

            with self.assertRaises(ValueError):
                await lsh.query(m3)
Esempio n. 9
0
    def test_insert(self):
        lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)

        mg = WeightedMinHashGenerator(10, 5)
        m3 = mg.minhash(np.random.uniform(1, 10, 10))
        self.assertRaises(ValueError, lsh.insert, "c", m3)
Esempio n. 10
0
 def test_pickle(self):
     lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
     mg = WeightedMinHashGenerator(10, 4)
     m1 = mg.minhash(np.random.uniform(1, 10, 10))
     m2 = mg.minhash(np.random.uniform(1, 10, 10))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     lsh2 = pickle.loads(pickle.dumps(lsh))
     result = lsh.query(m1)
     self.assertTrue("a" in result)
     result = lsh.query(m2)
     self.assertTrue("b" in result)
Esempio n. 11
0
 def test_pickle(self):
     lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
     mg = WeightedMinHashGenerator(10, 4)
     m1 = mg.minhash(np.random.uniform(1, 10, 10))
     m2 = mg.minhash(np.random.uniform(1, 10, 10))
     lsh.insert("a", m1)
     lsh.insert("b", m2)
     lsh2 = pickle.loads(pickle.dumps(lsh))
     result = lsh.query(m1)
     self.assertTrue("a" in result)
     result = lsh.query(m2)
     self.assertTrue("b" in result)
Esempio n. 12
0
 def test__H(self):
     '''
     Check _H output consistent bytes length given
     the same concatenated hash value size
     '''
     mg = WeightedMinHashGenerator(100, sample_size=128)
     for l in range(2, mg.sample_size + 1, 16):
         m = mg.minhash(np.random.randint(1, 99999999, 100))
         lsh = MinHashLSH(num_perm=128)
         lsh.insert("m", m)
         sizes = [len(H) for ht in lsh.hashtables for H in ht]
         self.assertTrue(all(sizes[0] == s for s in sizes))
Esempio n. 13
0
    def test_pickle(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)

        result = lsh.query(m1)
        self.assertTrue("a" in result)
        result = lsh.query(m2)
        self.assertTrue("b" in result)
Esempio n. 14
0
    def test_insert(self):
        lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, H in enumerate(lsh.keys["a"]):
            self.assertTrue("a" in lsh.hashtables[i][H])

        mg = WeightedMinHashGenerator(10, 5)
        m3 = mg.minhash(np.random.uniform(1, 10, 10))
        self.assertRaises(ValueError, lsh.insert, "c", m3)
Esempio n. 15
0
 async def test__H_mongo(self):
     """
     Check _H output consistent bytes length given
     the same concatenated hash value size
     """
     mg = WeightedMinHashGenerator(100, sample_size=128)
     for l in range(2, mg.sample_size + 1, 16):
         m = mg.minhash(np.random.randint(1, 99999999, 100))
         async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                    num_perm=128) as lsh:
             await lsh.insert("m", m)
             fs = (ht.keys() for ht in lsh.hashtables)
             hashtables = await asyncio.gather(*fs)
             sizes = [len(H) for H in hashtables]
             self.assertTrue(all(sizes[0] == s for s in sizes))
Esempio n. 16
0
    async def test_pickle_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=4) as lsh:
            mg = WeightedMinHashGenerator(10, 4)
            m1 = mg.minhash(np.random.uniform(1, 10, 10))
            m2 = mg.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)
            pickled = pickle.dumps(lsh)

        async with pickle.loads(pickled) as lsh2:
            result = await lsh2.query(m1)
            self.assertTrue("a" in result)
            result = await lsh2.query(m2)
            self.assertTrue("b" in result)
Esempio n. 17
0
    def test_remove(self):
        lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for H in table:
                self.assertGreater(len(table[H]), 0)
                self.assertTrue("a" not in table[H])

        self.assertRaises(ValueError, lsh.remove, "c")
Esempio n. 18
0
    def test_remove(self):
        lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        
        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for H in table:
                self.assertGreater(len(table[H]), 0)
                self.assertTrue("a" not in table[H])

        self.assertRaises(ValueError, lsh.remove, "c")
Esempio n. 19
0
    async def test_remove_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=4) as lsh:
            mg = WeightedMinHashGenerator(10, 4)
            m1 = mg.minhash(np.random.uniform(1, 10, 10))
            m2 = mg.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)

            await lsh.remove("a")
            self.assertTrue(not await lsh.has_key("a"))
            for table in lsh.hashtables:
                for H in await table.keys():
                    self.assertGreater(len(await table.get(H)), 0)
                    self.assertTrue("a" not in await table.get(H))

            with self.assertRaises(ValueError):
                await lsh.remove("c")
Esempio n. 20
0
 def test_pickle(self):
     mg = WeightedMinHashGenerator(4, 10, 1)
     m = mg.minhash([1, 2, 3, 4])
     p = pickle.loads(pickle.dumps(m))
     self.assertEqual(p.seed, m.seed)
     self.assertTrue(np.array_equal(p.hashvalues, m.hashvalues))
 def test_pickle(self):
     mg = WeightedMinHashGenerator(4, 10, 1)
     m = mg.minhash([1,2,3,4])
     p = pickle.loads(pickle.dumps(m))
     self.assertEqual(p.seed, m.seed)
     self.assertTrue(np.array_equal(p.hashvalues, m.hashvalues))