Ejemplo n.º 1
0
 async def test_init_mongo(self):
     """Check a fresh Mongo-backed index and the effect of ``weights``.

     Opens one index with only ``threshold=0.8`` and a second one that also
     passes ``weights=(0.2, 0.8)``, then compares the ``b`` and ``r``
     attributes the two indexes report.
     """
     config = self._storage_config_mongo

     async with AsyncMinHashLSH(storage_config=config,
                                threshold=0.8) as plain_lsh:
         # A newly opened index is expected to hold nothing yet.
         self.assertTrue(await plain_lsh.is_empty())
         plain_b = plain_lsh.b
         plain_r = plain_lsh.r

     async with AsyncMinHashLSH(storage_config=config,
                                threshold=0.8,
                                weights=(0.2, 0.8)) as weighted_lsh:
         weighted_b = weighted_lsh.b
         weighted_r = weighted_lsh.r

     # The weighted index must report a larger ``b`` and a smaller ``r``.
     self.assertTrue(plain_b < weighted_b)
     self.assertTrue(plain_r > weighted_r)
Ejemplo n.º 2
0
    async def test_insert_redis(self):
        """Insert two MinHashes into a Redis-backed index and inspect storage.

        Checks that every hashtable holds both pickled keys, that ``has_key``
        reports both keys, that each hash stored under pickled ``"a"`` maps
        back to pickled ``"a"`` in the hashtable with the same position, and
        that inserting ``MinHash(18)`` into the ``num_perm=16`` index raises
        ``ValueError``.
        """
        # The assertions below compare against the pickled form of the keys.
        pickled_a = pickle.dumps("a")
        pickled_b = pickle.dumps("b")

        async with AsyncMinHashLSH(storage_config=self._storage_config_redis,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            first = MinHash(16)
            first.update("a".encode("utf8"))
            second = MinHash(16)
            second.update("b".encode("utf8"))
            await lsh.insert("a", first)
            await lsh.insert("b", second)

            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                stored = [
                    value
                    for bucket in await table.keys()
                    for value in await table.get(bucket)
                ]
                self.assertTrue(pickled_a in stored)
                self.assertTrue(pickled_b in stored)

            self.assertTrue(await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key("b"))

            # The i-th hash recorded for "a" must resolve to "a" in table i.
            hashes_for_a = await lsh.keys.get(pickled_a)
            for index, bucket in enumerate(hashes_for_a):
                members = await lsh.hashtables[index].get(bucket)
                self.assertTrue(pickled_a in members)

            mismatched = MinHash(18)
            with self.assertRaises(ValueError):
                await lsh.insert("c", mismatched)
Ejemplo n.º 3
0
    async def test_insert_mongo(self):
        """Insert ten short strings into a Mongo-backed index and inspect it.

        Each string is sketched character by character. The test then checks
        that every hashtable contains ``'aahh'`` and ``'bhg'``, that
        ``has_key`` reports both, that every hash stored for ``'aahhb'`` maps
        back to it in the hashtable with the same position, and that
        inserting ``MinHash(18)`` into the ``num_perm=16`` index raises
        ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            words = [
                'aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow',
                'ppi', 'eer'
            ]
            sketches = [MinHash(16) for _ in words]
            for word, sketch in zip(words, sketches):
                for char in word:
                    sketch.update(char.encode('utf-8'))

            for word, sketch in zip(words, sketches):
                await lsh.insert(word, sketch)

            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                stored = []
                for bucket in await table.keys():
                    stored += await table.get(bucket)
                self.assertTrue('aahh' in stored)
                self.assertTrue('bhg' in stored)

            self.assertTrue(await lsh.has_key('aahh'))
            self.assertTrue(await lsh.has_key('bhg'))

            # The i-th hash recorded for 'aahhb' must resolve to it in table i.
            hashes = await lsh.keys.get('aahhb')
            for index, bucket in enumerate(hashes):
                self.assertTrue(
                    'aahhb' in await lsh.hashtables[index].get(bucket))

            mismatched = MinHash(18)
            with self.assertRaises(ValueError):
                await lsh.insert("c", mismatched)
Ejemplo n.º 4
0
    async def test_remove_mongo(self):
        """Remove one key and make sure a key with identical content survives.

        ``"a"`` and ``"a1"`` are sketched from the same input, so the test
        verifies that removing ``"a"`` leaves ``"a1"`` in the index, that no
        bucket is left empty or still lists ``"a"``, and that removing the
        unknown key ``"c"`` raises ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            sketch_a = MinHash(16)
            sketch_a.update("a".encode("utf8"))
            sketch_b = MinHash(16)
            sketch_b.update("b".encode("utf8"))
            sketch_a1 = MinHash(16)
            sketch_a1.update("a".encode("utf8"))
            await lsh.insert("a", sketch_a)
            await lsh.insert("b", sketch_b)
            await lsh.insert("a1", sketch_a1)

            await lsh.remove("a")
            self.assertTrue(not await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key('a1'))

            # Becomes True once any bucket is seen that still lists 'a1'.
            a1_still_stored = False
            for table in lsh.hashtables:
                for bucket in await table.keys():
                    members = await table.get(bucket)
                    self.assertGreater(len(members), 0)
                    self.assertTrue("a" not in members)
                    a1_still_stored = a1_still_stored or 'a1' in members
            self.assertTrue(a1_still_stored, 'Hashtable broken')

            with self.assertRaises(ValueError):
                await lsh.remove("c")
Ejemplo n.º 5
0
    async def test_insertion_session_mongo(self):
        """Bulk-insert random and fixed keys through an insertion session.

        Builds a set of 4-letter random strings plus ten fixed strings,
        sketches each one character by character, and inserts them all
        concurrently through ``insertion_session(batch_size=1000)`` without
        duplicate checks. It then verifies a few of the fixed keys are
        present in every hashtable and in the key store.
        """
        def chunk(it, size):
            # Yield successive ``size``-tuples until the source is exhausted.
            source = iter(it)
            while True:
                piece = tuple(islice(source, size))
                if piece == ():
                    return
                yield piece

        letters = (random.choice(string.ascii_lowercase) for _ in range(10000))
        fixed_keys = ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd',
                      'yow', 'ppi', 'eer')
        seq = frozenset(
            chain((''.join(group) for group in chunk(letters, 4)), fixed_keys))

        data = []
        for word in seq:
            sketch = MinHash(16)
            for char in word:
                sketch.update(char.encode('utf-8'))
            data.append((word, sketch))

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=16) as lsh:
            async with lsh.insertion_session(batch_size=1000) as session:
                await asyncio.gather(*(
                    session.insert(key, minhash, check_duplication=False)
                    for key, minhash in data
                ))

            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                stored = []
                for bucket in await table.keys():
                    stored += await table.get(bucket)
                self.assertTrue('aahhb' in stored)
                self.assertTrue('kld' in stored)

            self.assertTrue(await lsh.has_key('aahhb'))
            self.assertTrue(await lsh.has_key('kld'))

            # The i-th hash recorded for 'aahh' must resolve to it in table i.
            hashes = await lsh.keys.get('aahh')
            for index, bucket in enumerate(hashes):
                self.assertTrue(
                    'aahh' in await lsh.hashtables[index].get(bucket))
Ejemplo n.º 6
0
    async def test_insert_mongo(self):
        """Insert two weighted MinHashes into a Mongo-backed index.

        Uses ``WeightedMinHashGenerator(10, 4)`` on random vectors, checks
        that both keys appear in every hashtable and in the key store, and
        that a sketch from ``WeightedMinHashGenerator(10, 5)`` is rejected by
        the ``num_perm=4`` index with ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=4) as lsh:
            generator = WeightedMinHashGenerator(10, 4)
            sketch_a = generator.minhash(np.random.uniform(1, 10, 10))
            sketch_b = generator.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", sketch_a)
            await lsh.insert("b", sketch_b)

            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                stored = []
                for bucket in await table.keys():
                    stored += await table.get(bucket)
                self.assertTrue("a" in stored)
                self.assertTrue("b" in stored)

            self.assertTrue(await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key("b"))

            # The i-th hash recorded for "a" must resolve to "a" in table i.
            hashes_for_a = await lsh.keys.get("a")
            for index, bucket in enumerate(hashes_for_a):
                self.assertTrue(
                    "a" in await lsh.hashtables[index].get(bucket))

            other_generator = WeightedMinHashGenerator(10, 5)
            mismatched = other_generator.minhash(np.random.uniform(1, 10, 10))
            with self.assertRaises(ValueError):
                await lsh.insert("c", mismatched)
Ejemplo n.º 7
0
Archivo: lsh.py Proyecto: Fatalll/KBQA
async def queries():
    """Bulk-load ``(key, minhash)`` pairs into a Mongo-backed LSH index.

    Keys come from ``next_key()`` and are consumed 1000 at a time. Each batch
    is mapped through ``next_real_key`` on ``Pool(6)``, and the resulting
    pairs are inserted through an insertion session without duplicate
    checks. The loop stops when a batch comes back empty.

    The pool is closed and joined even when an insert or a map call raises,
    so its workers are not left running after a failure.
    """
    async with AsyncMinHashLSH(threshold=0.01,
                               num_perm=256,
                               storage_config={
                                   'type': 'aiomongo',
                                   'basename': 'k'.encode('utf8'),
                                   'mongo': {
                                       'host': 'localhost',
                                       'port': 27017,
                                       'db': 'lsh'
                                   }
                               }) as lsh:

        async with lsh.insertion_session(batch_size=1000) as session:
            pool = Pool(6)
            try:
                keys = next_key()

                def next_batch():
                    # Materialise the next (up to) 1000 mapped pairs.
                    return list(
                        pool.map(next_real_key, itertools.islice(keys, 1000)))

                batch = next_batch()
                while batch:
                    # Compute the following batch before awaiting the inserts
                    # of the current one, as the original loop did.
                    upcoming = next_batch()
                    await asyncio.gather(*(
                        session.insert(key, minhash, check_duplication=False)
                        for key, minhash in batch
                    ))
                    batch = upcoming
            finally:
                pool.close()
                pool.join()
Ejemplo n.º 8
0
    async def test_remove_session_mongo(self):
        """Bulk-insert keys, bulk-delete ten of them, and verify the remainder.

        Random 4-letter strings plus ten fixed strings are inserted through
        ``insertion_session(batch_size=1000)``. The ten fixed strings are then
        removed through ``delete_session(batch_size=3)``. Every hashtable and
        the key store must afterwards contain exactly the keys that were not
        removed.
        """
        def chunk(it, size):
            # Yield successive ``size``-tuples until the source is exhausted.
            source = iter(it)
            while True:
                piece = tuple(islice(source, size))
                if piece == ():
                    return
                yield piece

        keys_to_remove = ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd',
                          'yow', 'ppi', 'eer')
        letters = (random.choice(string.ascii_lowercase) for _ in range(10000))
        seq = frozenset(
            chain((''.join(group) for group in chunk(letters, 4)),
                  keys_to_remove))

        data = []
        for word in seq:
            sketch = MinHash(16)
            for char in word:
                sketch.update(char.encode('utf-8'))
            data.append((word, sketch))

        keys_left = seq - frozenset(keys_to_remove)

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            async with lsh.insertion_session(batch_size=1000) as session:
                await asyncio.gather(*(
                    session.insert(key, minhash, check_duplication=False)
                    for key, minhash in data
                ))

            async with lsh.delete_session(batch_size=3) as session:
                await asyncio.gather(*(
                    session.remove(key) for key in keys_to_remove
                ))

            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                stored = []
                for bucket in await table.keys():
                    stored += await table.get(bucket)
                for key in keys_to_remove:
                    self.assertTrue(
                        key not in stored,
                        '{0} in items, but should not be'.format(key))
                for key in keys_left:
                    self.assertTrue(
                        key in stored,
                        '{0} not in items, but should be'.format(key))

            for key in keys_to_remove:
                still_present = await lsh.has_key(key)
                self.assertTrue(
                    not still_present,
                    '<{0}> key should not be in LSH index'.format(key))
            for key in keys_left:
                self.assertTrue(await lsh.has_key(key),
                                '<{0}> key should be in LSH index'.format(key))
Ejemplo n.º 9
0
 async def test_get_counts_mongo(self):
     """``get_counts`` returns ``lsh.b`` mappings whose values each sum to 2.

     Two keys are inserted first, so every per-table count mapping is
     expected to account for exactly two entries.
     """
     async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                threshold=0.5, num_perm=16) as lsh:
         sketch_a = MinHash(16)
         sketch_a.update("a".encode("utf8"))
         sketch_b = MinHash(16)
         sketch_b.update("b".encode("utf8"))
         for key, sketch in (("a", sketch_a), ("b", sketch_b)):
             await lsh.insert(key, sketch)

         counts = await lsh.get_counts()
         self.assertEqual(len(counts), lsh.b)
         for table_counts in counts:
             total = sum(table_counts.values())
             self.assertEqual(total, 2)
Ejemplo n.º 10
0
 async def test__H_mongo(self):
     """
     Check _H output consistent bytes length given
     the same concatenated hash value size

     A weighted MinHash of a random vector is inserted into a fresh
     ``num_perm=128`` index on each iteration. The keys of every hashtable
     are then fetched concurrently and the length of each individual key is
     compared, so the assertion measures key size rather than the number of
     keys stored per table.
     """
     mg = WeightedMinHashGenerator(100, sample_size=128)
     # The loop value itself is not used; only the number of rounds matters.
     for _ in range(2, mg.sample_size + 1, 16):
         m = mg.minhash(np.random.randint(1, 99999999, 100))
         async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                    num_perm=128) as lsh:
             await lsh.insert("m", m)
             fs = (ht.keys() for ht in lsh.hashtables)
             keys_per_table = await asyncio.gather(*fs)
             # Flatten to one length per stored key across all hashtables.
             sizes = [len(H) for keys in keys_per_table for H in keys]
             self.assertTrue(all(sizes[0] == s for s in sizes))
Ejemplo n.º 11
0
    async def test_pickle_mongo(self):
        """A pickled weighted index can be restored and queried.

        Two weighted MinHashes are inserted, the index object is pickled
        while still open, and the unpickled copy is opened with ``async with``
        and must return each key when queried with its own sketch.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=4) as lsh:
            generator = WeightedMinHashGenerator(10, 4)
            sketch_a = generator.minhash(np.random.uniform(1, 10, 10))
            sketch_b = generator.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", sketch_a)
            await lsh.insert("b", sketch_b)
            pickled = pickle.dumps(lsh)

        # ``pickled`` was produced just above, so unpickling it is trusted.
        async with pickle.loads(pickled) as restored:
            matches = await restored.query(sketch_a)
            self.assertTrue("a" in matches)
            matches = await restored.query(sketch_b)
            self.assertTrue("b" in matches)
async def run_async_test(data: list, batch_size: int):
    """Run the sync-Redis and aio-Redis insert/query scenarios over *data*.

    Steps, in order:

    1. Insert and query through a synchronous ``MinHashLSH`` using a thread
       pool executor (helpers defined elsewhere in this file).
    2. Insert through an ``AsyncMinHashLSH`` with plain concurrent inserts.
    3. Insert through an ``AsyncMinHashLSH`` insertion session using
       *batch_size*, then query it.
    4. Flush the Redis database named by ``aioSTORAGE_CONFIG_REDIS``.

    Args:
        data: list of ``(key, minhash)`` pairs passed to the helpers.
        batch_size: batch size forwarded to the insertion-session helper.

    The executor is shut down once the synchronous scenario finishes, and the
    Redis connection used for the final flush is closed even if the flush
    itself fails.
    """
    lsh = MinHashLSH(storage_config=syncSTORAGE_CONFIG_REDIS,
                     threshold=0.5,
                     num_perm=16)
    # The context manager shuts the pool down so worker threads do not linger.
    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        await aioinsert_syncredis_with_executor(lsh, data, executor)
        await aioquery_syncredis(lsh, data, executor)

    async with AsyncMinHashLSH(storage_config=aioSTORAGE_CONFIG_REDIS,
                               threshold=0.5,
                               num_perm=16) as lsh2:
        await insert_aioredis(lsh2, data)

    async with AsyncMinHashLSH(storage_config=aioSTORAGE_CONFIG_REDIS,
                               threshold=0.5,
                               num_perm=16) as lsh3:
        await insertion_session_aioredis(lsh3, data, batch_size)
        await query_aioredis(lsh3, data)

    dsn = 'redis://{host}:{port}'.format(**aioSTORAGE_CONFIG_REDIS['redis'])
    redis = await aioredis.create_redis(dsn)
    try:
        await redis.flushdb()
    finally:
        redis.close()
        await redis.wait_closed()
Ejemplo n.º 13
0
    async def test_pickle_mongo(self):
        """A pickled index can be restored and queried.

        Two MinHashes are inserted, the index object is pickled while still
        open, and the unpickled copy must return each key when queried with
        its own sketch.
        """
        async with AsyncMinHashLSH(
                storage_config=self._storage_config_mongo,
                threshold=0.5,
                num_perm=16) as lsh:
            sketch_a = MinHash(16)
            sketch_a.update("a".encode("utf8"))
            sketch_b = MinHash(16)
            sketch_b.update("b".encode("utf8"))
            await lsh.insert("a", sketch_a)
            await lsh.insert("b", sketch_b)
            pickled = pickle.dumps(lsh)

        # ``pickled`` was produced just above, so unpickling it is trusted.
        async with pickle.loads(pickled) as restored:
            matches = await restored.query(sketch_a)
            self.assertTrue("a" in matches)
            matches = await restored.query(sketch_b)
            self.assertTrue("b" in matches)
            # NOTE(review): the enclosing ``async with`` presumably closes the
            # index on exit as well - confirm whether this call is needed.
            await restored.close()
Ejemplo n.º 14
0
 async def test__H_redis(self):
     """
     Check that every hashtable key produced for the same MinHash has the
     same length.

     The same two-update MinHash is inserted into a fresh ``num_perm=128``
     Redis-backed index on each of several rounds, and the length of every
     key in every hashtable is compared against the first one.
     """
     for _ in range(2, 128 + 1, 16):
         sketch = MinHash()
         sketch.update("abcdefg".encode("utf8"))
         sketch.update("1234567".encode("utf8"))
         async with AsyncMinHashLSH(
                 storage_config=self._storage_config_redis,
                 num_perm=128) as lsh:
             await lsh.insert("m", sketch)
             key_lengths = []
             for table in lsh.hashtables:
                 for bucket in await table.keys():
                     key_lengths.append(len(bucket))
             self.assertTrue(
                 all(key_lengths[0] == length for length in key_lengths))
Ejemplo n.º 15
0
    async def test_query_mongo(self):
        """Query a Mongo-backed index with weighted MinHashes.

        Each inserted sketch must find its own key, and querying the
        ``num_perm=4`` index with a sketch from
        ``WeightedMinHashGenerator(10, 5)`` must raise ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=4) as lsh:
            generator = WeightedMinHashGenerator(10, 4)
            sketch_a = generator.minhash(np.random.uniform(1, 10, 10))
            sketch_b = generator.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", sketch_a)
            await lsh.insert("b", sketch_b)

            matches = await lsh.query(sketch_a)
            self.assertTrue("a" in matches)
            matches = await lsh.query(sketch_b)
            self.assertTrue("b" in matches)

            other_generator = WeightedMinHashGenerator(10, 5)
            mismatched = other_generator.minhash(np.random.uniform(1, 10, 10))

            with self.assertRaises(ValueError):
                await lsh.query(mismatched)
Ejemplo n.º 16
0
    async def test_arbitrary_collection(self):
        """Data is written to the collection named by ``collection_name``.

        Temporarily sets ``collection_name`` to ``"unit_test_collection"`` in
        the shared Mongo storage config, inserts one key, then connects with
        Motor and checks that collection holds at least one document.

        The temporary setting is removed in a ``finally`` block so that a
        failure here cannot leak the custom collection name into other tests
        that share ``self._storage_config_mongo``.
        """
        mongo_config = self._storage_config_mongo["mongo"]
        mongo_config["collection_name"] = "unit_test_collection"
        try:
            async with AsyncMinHashLSH(
                    storage_config=self._storage_config_mongo,
                    threshold=0.5,
                    num_perm=16) as lsh:
                m1 = MinHash(16)
                m1.update(b"a")
                await lsh.insert("a", m1)

            # ``MONGO_URL`` takes precedence over the host/port/db settings.
            dsn = MONGO_URL or "mongodb://{host}:{port}/{db}".format(
                **mongo_config)
            collection = AsyncIOMotorClient(dsn).get_default_database(
                "lsh_test").get_collection("unit_test_collection")
            count = await collection.count_documents({})

            self.assertGreaterEqual(count, 1)
        finally:
            # Always restore the shared config, even when the test fails.
            mongo_config.pop("collection_name", None)
Ejemplo n.º 17
0
    async def test_remove_mongo(self):
        """Remove a weighted-MinHash key from a Mongo-backed index.

        After removing ``"a"`` the key store must not report it, no bucket
        may be empty or still list ``"a"``, and removing the unknown key
        ``"c"`` must raise ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=4) as lsh:
            generator = WeightedMinHashGenerator(10, 4)
            sketch_a = generator.minhash(np.random.uniform(1, 10, 10))
            sketch_b = generator.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", sketch_a)
            await lsh.insert("b", sketch_b)

            await lsh.remove("a")
            a_present = await lsh.has_key("a")
            self.assertTrue(not a_present)

            for table in lsh.hashtables:
                for bucket in await table.keys():
                    # The bucket is fetched once per assertion, as before.
                    self.assertGreater(len(await table.get(bucket)), 0)
                    self.assertTrue("a" not in await table.get(bucket))

            with self.assertRaises(ValueError):
                await lsh.remove("c")
Ejemplo n.º 18
0
    async def test_arbitrary_url(self):
        """An index configured only with a Mongo ``url`` writes to that server.

        Uses ``MONGO_URL`` when set, otherwise
        ``mongodb://localhost/lsh_test``. After inserting one key, the test
        connects with Motor, checks the database has at least one collection,
        and drops the database.
        """
        mongo_url = MONGO_URL or "mongodb://localhost/lsh_test"
        config = {"type": "aiomongo", "mongo": {"url": mongo_url}}

        async with AsyncMinHashLSH(storage_config=config,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            sketch = MinHash(16)
            sketch.update(b"a")
            await lsh.insert("a", sketch)

        motor_client = AsyncIOMotorClient(config["mongo"]["url"])
        database = motor_client.get_default_database("lsh_test")
        names = await database.list_collection_names()
        self.assertGreater(len(names), 0)
        await database.client.drop_database(database.name)
Ejemplo n.º 19
0
    async def test_query_mongo(self):
        """Query a Mongo-backed index after concurrent, unchecked inserts.

        Key ``"b"`` is inserted twice with ``check_duplication=False``. Each
        sketch must still find its own key, and querying with ``MinHash(18)``
        against the ``num_perm=16`` index must raise ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=16) as lsh:
            sketch_a = MinHash(16)
            sketch_a.update("a".encode("utf8"))
            sketch_b = MinHash(16)
            sketch_b.update("b".encode("utf8"))
            sketch_b_again = MinHash(16)
            sketch_b_again.update("b".encode("utf8"))

            await asyncio.gather(
                lsh.insert("a", sketch_a, check_duplication=False),
                lsh.insert("b", sketch_b, check_duplication=False),
                lsh.insert("b", sketch_b_again, check_duplication=False),
            )

            matches = await lsh.query(sketch_a)
            self.assertTrue("a" in matches)
            matches = await lsh.query(sketch_b)
            self.assertTrue("b" in matches)

            mismatched = MinHash(18)
            with self.assertRaises(ValueError):
                await lsh.query(mismatched)
Ejemplo n.º 20
0
    async def test_remove_mongo(self):
        """Remove a key from a Mongo-backed index.

        After removing ``"a"`` the key store must not report it, no bucket
        may be empty or still list ``"a"``, and removing the unknown key
        ``"c"`` must raise ``ValueError``.
        """
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5,
                                   num_perm=16) as lsh:
            sketch_a = MinHash(16)
            sketch_a.update("a".encode("utf8"))
            sketch_b = MinHash(16)
            sketch_b.update("b".encode("utf8"))
            await lsh.insert("a", sketch_a)
            await lsh.insert("b", sketch_b)

            await lsh.remove("a")
            a_present = await lsh.has_key("a")
            self.assertTrue(not a_present)

            for table in lsh.hashtables:
                for bucket in await table.keys():
                    # The bucket is fetched once per assertion, as before.
                    self.assertGreater(len(await table.get(bucket)), 0)
                    self.assertTrue("a" not in await table.get(bucket))

            with self.assertRaises(ValueError):
                await lsh.remove("c")
Ejemplo n.º 21
0
Archivo: lsh.py Proyecto: Fatalll/KBQA
async def func():
    """Query the Mongo-backed LSH index for every row of ``dataset.csv``.

    Each line is split at its first comma. The text before the comma is
    lower-cased and split on whitespace, every token is fed into a
    ``MinHash(num_perm=256)``, and the query result is printed together with
    both parts of the line.

    A line that contains no comma is treated as having an empty second field
    instead of raising ``IndexError``.
    """
    async with AsyncMinHashLSH(threshold=0.3,
                               num_perm=256,
                               storage_config={
                                   'type': 'aiomongo',
                                   'basename': 'k'.encode('utf8'),
                                   'mongo': {
                                       'host': 'localhost',
                                       'port': 27017,
                                       'db': 'lsh'
                                   }
                               }) as lsh:
        with codecs.open('dataset.csv', 'r', "utf-8") as file:
            for line in file:
                # ``partition`` yields the same two fields as ``split(',', 1)``
                # when a comma exists and an empty tail when it does not.
                head, _, tail = line.partition(',')
                mh = MinHash(num_perm=256)
                for token in head.lower().split():
                    mh.update(token.encode('utf8'))
                result = await lsh.query(mh)
                print(head, tail, result)
Ejemplo n.º 22
0
 async def test_insertion_session_redis(self):
     """Insert two keys through an insertion session on a Redis-backed index.

     Afterwards every hashtable must hold both pickled keys, ``has_key`` must
     report both, and each hash stored under pickled ``"a"`` must map back to
     pickled ``"a"`` in the hashtable with the same position.
     """
     # The assertions below compare against the pickled form of the keys.
     pickled_a = pickle.dumps("a")
     pickled_b = pickle.dumps("b")

     async with AsyncMinHashLSH(storage_config=self._storage_config_redis,
                                threshold=0.5,
                                num_perm=16) as lsh:
         sketch_a = MinHash(16)
         sketch_a.update("a".encode("utf8"))
         sketch_b = MinHash(16)
         sketch_b.update("b".encode("utf8"))
         pairs = [("a", sketch_a), ("b", sketch_b)]

         async with lsh.insertion_session() as session:
             for key, sketch in pairs:
                 await session.insert(key, sketch)

         for table in lsh.hashtables:
             self.assertTrue(await table.size() >= 1)
             stored = [
                 value
                 for bucket in await table.keys()
                 for value in await table.get(bucket)
             ]
             self.assertTrue(pickled_a in stored)
             self.assertTrue(pickled_b in stored)

         self.assertTrue(await lsh.has_key("a"))
         self.assertTrue(await lsh.has_key("b"))

         hashes_for_a = await lsh.keys.get(pickled_a)
         for index, bucket in enumerate(hashes_for_a):
             members = await lsh.hashtables[index].get(bucket)
             self.assertTrue(pickled_a in members)
async def insert_aioredis(aiolsh: AsyncMinHashLSH, data: list):
    """Insert every ``(key, minhash)`` pair of *data* into *aiolsh* concurrently.

    Duplicate checking is disabled for each insert. Nothing is returned.
    """
    pending = [
        aiolsh.insert(key, minhash, check_duplication=False)
        for key, minhash in data
    ]
    await asyncio.gather(*pending)
async def query_aioredis(aiolsh: AsyncMinHashLSH, data: list):
    """Query *aiolsh* concurrently with the MinHash of every pair in *data*.

    Returns the gathered query results, in the same order as *data*.
    """
    # Only the MinHash half of each pair is needed for querying.
    lookups = [aiolsh.query(minhash) for _, minhash in data]
    results = await asyncio.gather(*lookups)
    return results
async def insertion_session_aioredis(aiolsh: AsyncMinHashLSH, data: list,
                                     batch_size: int):
    """Insert all pairs of *data* through one insertion session on *aiolsh*.

    The session is opened with the given *batch_size*. Inserts are issued
    concurrently with duplicate checking disabled. Nothing is returned.
    """
    async with aiolsh.insertion_session(batch_size=batch_size) as session:
        pending = [
            session.insert(key, minhash, check_duplication=False)
            for key, minhash in data
        ]
        await asyncio.gather(*pending)