Exemple #1
0
def assert_dump_data(dump_path, docs, shards, pea_id):
    """Check that the dump read back for one pea matches that pea's slice of ``docs``.

    ``docs`` is divided into ``shards`` contiguous slices of equal size; the
    last pea (``pea_id == shards - 1``) additionally takes the remainder. The
    ids, vectors and metas imported from ``dump_path`` for ``pea_id`` are
    compared against that slice. Afterwards the query indexers are loaded with
    ``dump_path`` in their metas and their ``size`` is checked against the
    number of expected docs.

    :param dump_path: the path of the dump
    :param docs: the full sequence of documents the dump is compared against
    :param shards: the number of slices ``docs`` is divided into
    :param pea_id: the id of the pea whose part of the dump is checked
    """
    per_shard, remainder = divmod(len(docs), shards)
    dumped_ids, dumped_vectors = import_vectors(dump_path, str(pea_id))

    # every pea owns `per_shard` docs; the last one also takes the leftover
    first = pea_id * per_shard
    last = (pea_id + 1) * per_shard
    if pea_id == shards - 1:
        last += remainder
    expected = docs[first:last]
    expected_count = len(expected)
    print(f'### pea {pea_id} has {expected_count} docs')

    # TODO these might fail if we implement any ordering of elements on dumping / reloading
    dumped_ids = list(dumped_ids)
    dumped_vectors = list(dumped_vectors)
    np.testing.assert_equal(dumped_ids, [doc.id for doc in expected])
    np.testing.assert_allclose(dumped_vectors, [doc.embedding for doc in expected])

    # only the metas are needed here, the ids were already checked above
    _, dumped_metas = import_metas(dump_path, str(pea_id))
    dumped_metas = list(dumped_metas)
    expected_metas = [
        DBMSIndexDriver._doc_without_embedding(doc).SerializeToString()
        for doc in expected
    ]
    np.testing.assert_equal(dumped_metas, expected_metas)

    # assert with Indexers
    # TODO currently metas are only passed to the parent Compound, not to the inner components
    with TimeContext(f'### reloading {expected_count}'):
        compound_metas = {
            'workspace': os.path.join(dump_path, 'new_ws'),
            'dump_path': dump_path,
        }
        # noinspection PyTypeChecker
        compound: CompoundQueryExecutor = BaseQueryIndexer.load_config(
            'indexer_query.yml',
            pea_id=pea_id,
            metas=compound_metas,
        )
    for component in compound.components:
        assert component.size == expected_count

    # test with the inner indexers separate from the Compound
    for idx, config_file in enumerate(['query_np.yml', 'query_kv.yml']):
        # each standalone indexer gets its own workspace below the dump path
        workspace = os.path.realpath(os.path.join(dump_path, f'new_ws-{idx}'))
        standalone = BaseQueryIndexer.load_config(
            config_file,
            pea_id=pea_id,
            metas={'workspace': workspace, 'dump_path': dump_path},
        )
        assert standalone.size == expected_count
Exemple #2
0
    def _load_dump(self, dump_path):
        """Fill this indexer with the entries stored in a dump

        The ids and metas are read with ``import_metas`` for ``self.pea_id`` and
        added through the handler obtained from ``get_create_handler``. A query
        for a placeholder id is issued afterwards as warm-up.

        :param dump_path: the path of the dump"""
        dumped_ids, dumped_metas = import_metas(dump_path, str(self.pea_id))
        with self.get_create_handler() as handler:
            # materialise only once the handler is available
            id_list = list(dumped_ids)
            meta_list = list(dumped_metas)
            self._add(id_list, meta_list, handler)
        # warming up
        self.query(['someid'])
Exemple #3
0
    def _load_dump(self, dump_path):
        """Fill this indexer with the entries stored in a dump

        The ids and metas are read with ``import_metas`` for ``self.pea_id`` and
        added while ``self.write_handler`` is used as a context manager. The
        write handler attribute is then deleted and the handler flags are
        reset before a placeholder id is queried as warm-up.

        :param dump_path: the path of the dump"""
        dumped_ids, dumped_metas = import_metas(dump_path, str(self.pea_id))
        with self.write_handler as handler:
            id_list = list(dumped_ids)
            meta_list = list(dumped_metas)
            self._add(id_list, meta_list, handler)
        # discard the write handler and reset the handler state flags
        del self.write_handler
        self.handler_mutex = False
        self.is_handler_loaded = False
        # warming up
        self._query('someid')
Exemple #4
0
    def _load_dump(self, dump_path):
        """Fill this indexer with the entries stored in a dump

        The ids and metas are read with ``import_metas`` for ``self.pea_id`` and
        passed to ``_add``. The write handler is then flushed and closed, the
        handler flags are reset and the write handler attribute is deleted
        before a placeholder id is queried as warm-up.

        :param dump_path: the path of the dump"""
        dumped_ids, dumped_metas = import_metas(dump_path, str(self.pea_id))
        id_list = list(dumped_ids)
        meta_list = list(dumped_metas)
        self._add(id_list, meta_list)
        # push everything that was added out and release the handler
        self.write_handler.flush()
        self.write_handler.close()
        # reset the handler state flags, then discard the write handler
        self.handler_mutex = False
        self.is_handler_loaded = False
        del self.write_handler
        # warming up
        self._query('someid')