Example #1
File: vq.py  Project: xdTin/hdidx
def __init__(self):
    Indexer.__init__(self)      # initialize the shared Indexer state
    self.encoder = PQEncoder()  # product-quantization encoder
    self.storage = None         # storage backend is created lazily by set_storage()
Example #2
File: vq.py  Project: xdTin/hdidx
class PQIndexer(Indexer):
    def __init__(self):
        Indexer.__init__(self)
        self.encoder = PQEncoder()
        self.storage = None

    def __del__(self):
        pass

    def build(self, pardic=None):
        # Train the underlying PQ encoder with the given parameter dictionary.
        self.encoder.build(pardic)

    def set_storage(self, storage_type="mem", storage_parm=None):
        self.storage = createStorage(storage_type, storage_parm)

    def add(self, vals, keys=None):
        # Encode the vectors block by block and append them to storage,
        # creating a default in-memory backend on first use.
        if self.storage is None:
            self.set_storage()

        num_vals = vals.shape[0]
        if keys is None:
            num_base_items = self.storage.get_num_items()
            keys = np.arange(num_base_items, num_base_items + num_vals, dtype=np.int32)
        else:
            keys = np.array(keys, dtype=np.int32).reshape(-1)

        for start_id in range(0, num_vals, self.BLKSIZE):
            cur_num = min(self.BLKSIZE, num_vals - start_id)
            logging.info("%8d/%d: %d" % (start_id, num_vals, cur_num))

            codes = self.encoder.encode(vals[start_id : start_id + cur_num])
            self.storage.add(codes, keys[start_id : start_id + cur_num])

    def remove(self, keys):
        raise Exception(self.ERR_UNIMPL)

    def search(self, queries, topk=None, **kwargs):
        """Return (ids, distances) of the `topk` nearest items for each query."""
        nq = queries.shape[0]

        dsub = self.encoder.ecdat["dsub"]            # dimension of each subvector
        nsubq = self.encoder.ecdat["nsubq"]          # number of subquantizers
        ksub = self.encoder.ecdat["ksub"]            # centroids per subquantizer
        centroids = self.encoder.ecdat["centroids"]  # learned sub-codebooks

        distab = np.zeros((nsubq, ksub), np.single)
        dis = np.ones((nq, topk), np.single) * np.inf
        ids = np.ones((nq, topk), np.int32) * -1

        profiler = Profiler()
        interval = 100 if nq >= 100 else 10
        time_total = 0.0  # total time for all queries
        logging.info("Start Querying ...")
        for qry_id in xrange(nq):
            profiler.start("distab")  # time for computing distance table
            # pre-compute the table of squared distance to centroids
            for qnt_id in range(nsubq):
                vsub = queries[qry_id : qry_id + 1, qnt_id * dsub : (qnt_id + 1) * dsub]
                distab[qnt_id : qnt_id + 1, :] = distFunc["euclidean"](centroids[qnt_id], vsub)
            profiler.end()

            profiler.start("distance")  # time for computing the distances
            # add the tabulated distances to construct the distance estimators
            idsquerybase, disquerybase = self.sumidxtab(distab)
            profiler.end()

            profiler.start("knn")  # time for finding the kNN
            realk = min(disquerybase.shape[0], topk)
            cur_ids = pq_knn(disquerybase, realk)
            profiler.end()

            profiler.start("result")  # time for getting final result
            ids[qry_id, :] = idsquerybase[cur_ids]
            dis[qry_id, :] = disquerybase[cur_ids]
            profiler.end()

            if (qry_id + 1) % interval == 0:
                time_total += profiler.sum_overall()
                logging.info("\t%d/%d: %.3fms per query" % (qry_id + 1, nq, profiler.sum_average() * 1000))
                logging.info("\t\t%s" % profiler.str_average())
                profiler.reset()
        logging.info("Querying Finished!")
        time_total += profiler.sum_overall()
        logging.info("Average querying time: %.3fms" % (time_total * 1000 / nq))

        return ids, dis

    def sumidxtab(self, D):
        """
        Compute distance to database items based on distances to centroids.
            D: nsubq x ksub
        """

        ids = self.storage.get_keys()
        dis = cext.sumidxtab_core(D, self.storage.get_codes())

        return np.array(ids), np.array(dis)

    @classmethod
    def sumidxtab_core(cls, D, blk):
        # Pure-Python fallback for cext.sumidxtab_core: for each encoded item,
        # sum the tabulated sub-distances selected by its PQ codes.
        return [sum(D[j, blk[i, j]] for j in range(D.shape[0]))
                for i in range(blk.shape[0])]
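For context, here is a minimal usage sketch of the PQIndexer class shown above. It is not taken from the hdidx sources: the key names passed to build() ('vals', 'nsubq') are assumptions inferred from the attributes the encoder exposes in search(), so the exact parameter dictionary expected by PQEncoder.build() should be checked against the project itself.

import numpy as np

# Hypothetical data; any float32 matrix with one vector per row works.
base = np.random.rand(10000, 64).astype(np.float32)   # database vectors
queries = np.random.rand(5, 64).astype(np.float32)    # query vectors

idx = PQIndexer()
idx.build({'vals': base, 'nsubq': 8})    # train the product quantizer (assumed keys)
idx.add(base)                            # encode and store the database vectors
ids, dis = idx.search(queries, topk=10)  # approximate 10 nearest neighbors per query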