Example #1
 def cluster(self):
     self.pca_reduction = PCA(n_components=self.n_components)
     self.pca_reduction.fit(self.data)
     self.data = self.pca_reduction.transform(self.data)
     self.P, self.mu = self.pca()
     self.data = self.data - self.mu
     self.data = np.dot(self.data, self.P)
     train, test = train_test_split(self.data, test_size=0.2)
     self.model = LOPQModel(V=self.v,
                            M=self.m,
                            subquantizer_clusters=self.sub)
     self.model.fit(train, n_init=1)
     for i, e in enumerate(
             self.entries):  # avoid doing this twice again in searcher
         r = self.model.predict(self.data[i])
         e['coarse'] = r.coarse
         e['fine'] = r.fine
         e['index'] = i
     self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
     if self.test_mode:
         self.searcher.add_data(train)
         nns = compute_all_neighbors(test, train)
         recall, _ = get_recall(self.searcher, test, nns)
         print 'Recall (V=%d, M=%d, subquants=%d): %s' % (
             self.model.V, self.model.M, self.model.subquantizer_clusters,
             str(recall))
     else:
         self.searcher.add_data(self.data)
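The cluster() method above chains PCA reduction, a variance-balancing rotation, LOPQ training and an LMDB-backed searcher through instance state, and reports recall in test mode. Below is a minimal standalone sketch of the same pipeline on synthetic data; the dimensions, parameters and LMDB path are placeholders, and the import paths for eigenvalue_allocation, compute_all_neighbors and get_recall (lopq.model and lopq.eval) are assumptions.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from lopq import LOPQModel
from lopq.search import LOPQSearcherLMDB
from lopq.model import eigenvalue_allocation              # assumed import path
from lopq.eval import compute_all_neighbors, get_recall   # assumed import path

# Placeholder data: 10,000 vectors of dimension 128.
data = np.random.randn(10000, 128)

# PCA to 64 dimensions, then centre the projected data.
pca = PCA(n_components=64)
reduced = pca.fit_transform(data)
reduced = reduced - reduced.mean(axis=0)

# Variance-balancing rotation, mirroring the pca() helper used by cluster().
eigenvalues, P = np.linalg.eigh(np.cov(reduced, rowvar=False))
reduced = np.dot(reduced, P[:, eigenvalue_allocation(2, eigenvalues)])

train, test = train_test_split(reduced, test_size=0.2)

# Train LOPQ and index the training split in an LMDB-backed searcher.
model = LOPQModel(V=16, M=8, subquantizer_clusters=256)
model.fit(train, n_init=1)
searcher = LOPQSearcherLMDB(model, './lopq_demo_lmdb')    # placeholder path
searcher.add_data(train)

# Evaluate recall of the quantized index against exact nearest neighbours.
nns = compute_all_neighbors(test, train)
recall, _ = get_recall(searcher, test, nns)
print('Recall (V=16, M=8, subquants=256): {}'.format(recall))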
Example #2
 def load(self):
     self.model = LOPQModel.load_proto(self.model_proto_filename)
     self.pca_reduction = pickle.load(file(self.pca_filename))
     self.P = np.load(file(self.P_filename))
     self.mu = np.load(file(self.mu_filename))
     self.permuted_inds = np.load(file(self.permuted_inds_filename))
     self.searcher = LOPQSearcherLMDB(model=self.model,
                                      lmdb_path=self.model_lmdb_filename)
Example #3
File: tests.py Project: CVML/lopq
def test_searcher_lmdb():
    import shutil

    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl')))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))

    lmbd_test_path = './test_lopq_lmbd'
    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcherLMDB(m, lmbd_test_path)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Clean up
    shutil.rmtree(lmbd_test_path)

    # Test add_codes
    searcher = LOPQSearcherLMDB(m, lmbd_test_path)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)

    # Clean up
    shutil.rmtree(lmbd_test_path)
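The test above populates LOPQSearcherLMDB two ways: add_data() encodes raw vectors with the model internally, while add_codes() accepts codes already produced by model.predict(). A hedged sketch of exploiting that to encode once and rebuild the LMDB index later without re-encoding; the file names, paths and data below are placeholders.

import pickle as pkl
import numpy as np
from lopq import LOPQModel
from lopq.search import LOPQSearcherLMDB

# Placeholder model and data; the 8-dimensional data matches the query
# q = np.ones(8) used in the test above.
model = LOPQModel.load_proto('random_test_model.lopq')     # placeholder path
data = np.random.rand(1000, 8)

# Encode once; predict() is the step worth not repeating.
codes = [model.predict(x) for x in data]
with open('codes.pkl', 'wb') as f:                         # placeholder file
    pkl.dump(codes, f)

# Later (possibly in another process), rebuild the index from the stored codes
# with add_codes() instead of re-encoding raw vectors with add_data().
with open('codes.pkl', 'rb') as f:
    stored_codes = pkl.load(f)
searcher = LOPQSearcherLMDB(model, './rebuilt_lopq_lmdb')  # placeholder path
searcher.add_codes(stored_codes)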
Example #4
 def load(self):
     self.model = LOPQModel.load_proto(self.model_proto_filename)
     self.pca_reduction = pickle.load(file(self.pca_filename))
     self.P = np.load(file(self.P_filename))
     self.mu = np.load(file(self.mu_filename))
     self.permuted_inds = np.load(file(self.permuted_inds_filename))
     self.searcher = LOPQSearcherLMDB(model=self.model,lmdb_path=self.model_lmdb_filename)
Example #5
 def cluster(self):
     self.data = self.index
     self.pca_reduction = PCA(n_components=self.n_components)
     self.pca_reduction.fit(self.data)
     self.data = self.pca_reduction.transform(self.data)
     self.P, self.mu = self.pca()
     self.data = self.data - self.mu
     self.data = np.dot(self.data, self.P)
     self.model = LOPQModel(V=self.v,
                            M=self.m,
                            subquantizer_clusters=self.sub)
     self.model.fit(self.data, n_init=1)  # replace self.data by train
     for i, e in enumerate(
             self.entries):  # avoid doing this twice again in searcher
         r = self.model.predict(self.data[i])
         e['coarse'] = r.coarse
         e['fine'] = r.fine
         e['index'] = i
     self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
     self.searcher.add_data(self.data)
Example #7
 def cluster(self):
     self.pca_reduction = PCA(n_components=self.n_components)
     self.pca_reduction.fit(self.data)
     self.data = self.pca_reduction.transform(self.data)
     self.P, self.mu = self.pca()
     self.data = self.data - self.mu
     self.data = np.dot(self.data,self.P)
     train, test = train_test_split(self.data, test_size=0.2)
     self.model = LOPQModel(V=self.v, M=self.m, subquantizer_clusters=self.sub)
     self.model.fit(train, n_init=1)
     for i,e in enumerate(self.entries): # avoid doing this twice again in searcher
         r = self.model.predict(self.data[i])
         e['coarse'] = r.coarse
         e['fine'] = r.fine
         e['index'] = i
     self.searcher = LOPQSearcherLMDB(self.model,self.model_lmdb_filename)
     if self.test_mode:
         self.searcher.add_data(train)
         nns = compute_all_neighbors(test, train)
         recall, _ = get_recall(self.searcher, test, nns)
         print 'Recall (V=%d, M=%d, subquants=%d): %s' % (self.model.V, self.model.M, self.model.subquantizer_clusters, str(recall))
     else:
         self.searcher.add_data(self.data)
Example #8
    def init_searcher(self):
        """ Initialize LOPQ model and searcher from `global_conf` value.
        """
        try:
            # Try to load pretrained model from storer
            lopq_model = self.storer.load(self.build_model_str())
            if lopq_model is None:
                raise ValueError("Could not load model from storer.")
            # if self.verbose > 1:
            #   print("pca_mu.shape: {}".format(lopq_model.pca_mu.shape))
            #   print("pca_P.shape: {}".format(lopq_model.pca_P.shape))
        except Exception as inst:
            if type(inst) != ValueError:
                full_trace_error(inst)
            print("[{}: log] Looks like model was not trained yet ({})".format(
                self.pp, inst))

            self.loaded_pretrain_model = False
            # Try to get it from public bucket e.g.:
            # https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000
            if self.get_pretrained_model:
                log_msg = "[{}: log] Trying to retrieve pre-trained model {} from s3"
                print(log_msg.format(self.pp, self.build_model_str()))
                from ..common.dl import download_file
                import pickle
                try:
                    base_model_path = "https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/"
                    # This can fail with a "retrieval incomplete: got only" ...
                    download_file(base_model_path + self.build_model_str(),
                                  self.build_model_str())
                    lopq_model = pickle.load(open(self.build_model_str(),
                                                  'rb'))
                    # Avoid overwriting the model in s3 with s3storer using dig-cu-imagesearchindex bucket
                    is_s3_storer = isinstance(self.storer, S3Storer)
                    if is_s3_storer and self.storer.bucket_name == "dig-cu-imagesearchindex":
                        log_msg = "[{}: log] Skipping saving model {} back to s3"
                        print(log_msg.format(self.pp, self.build_model_str()))
                    else:
                        log_msg = "[{}: log] Saving model {} to storer"
                        print(log_msg.format(self.pp, self.build_model_str()))
                        self.storer.save(self.build_model_str(), lopq_model)
                    log_msg = "[{}: log] Loaded pretrained model {} from s3"
                    print(log_msg.format(self.pp, self.build_model_str()))
                    self.loaded_pretrain_model = True
                except Exception as inst:
                    log_msg = "[{}: log] Could not loaded pretrained model {} from s3: {}"
                    #print(log_msg.format(self.pp, self.build_model_str(), inst))
                    full_trace_error(
                        log_msg.format(self.pp, self.build_model_str(), inst))
                    sys.stdout.flush()
            else:
                log_msg = "[{}: log] Skipped retrieving pre-trained model from s3 as requested."
                print(log_msg.format(self.pp, self.build_model_str()))

            if not self.loaded_pretrain_model:
                # This is from our modified LOPQ package...
                # https://github.com/ColumbiaDVMM/ColumbiaImageSearch/tree/master/workflows/build-lopq-index/lopq/python
                # 'LOPQModelPCA' could be the type of the model loaded from pickle file
                # from lopq.model import LOPQModel, LOPQModelPCA
                # Size of DB should depend on nb_train... How should we properly set the size of this?
                # It should be nb_train_pca * size_feat + nb_train * size_feat_pca
                feat_size = get_feat_size(self.featurizer_type)
                if self.model_type == "lopq_pca":
                    map_size = self.nb_train_pca * feat_size * 4 * 8
                    map_size += self.nb_train * self.model_params['pca'] * 4 * 8
                else:
                    map_size = self.nb_train * feat_size * 4 * 8
                self.save_feat_env = lmdb.open('/data/lmdb_feats_' +
                                               self.build_model_str(),
                                               map_size=int(1.1 * map_size),
                                               writemap=True,
                                               map_async=True,
                                               max_dbs=2)

                # Train and save model in save_path folder
                lopq_model = self.train_index()
                # TODO: we could build a more unique model identifier
                # (using domain information? sha1/md5 of model parameters? using date of training?)
                # that would also mean we should list from the storer and guess
                # (based on date of creation) the correct model above...
                self.storer.save(self.build_model_str(), lopq_model)

        # Setup searcher with LOPQ model
        if lopq_model:
            # LOPQSearcherLMDB is now the default, as it makes the index more persistent
            # and potentially more easily usable with multiple processes.
            if self.lopq_searcher == "LOPQSearcherLMDB":
                from lopq.search import LOPQSearcherLMDB
                # TODO: should we get path from a parameter? and/or add model_str to it?
                # self.searcher = LOPQSearcherLMDB(lopq_model, lmdb_path='./lmdb_index/', id_lambda=str)
                # self.updates_env = lmdb.open('./lmdb_updates/', map_size=1024 * 1000000 * 1, writemap=True, map_async=True, max_dbs=1)
                self.searcher = LOPQSearcherLMDB(
                    lopq_model,
                    lmdb_path='/data/lmdb_index_' + self.build_model_str(),
                    id_lambda=str)
                # How could we properly set the size of this?
                self.updates_env = lmdb.open('/data/lmdb_updates_' +
                                             self.build_model_str(),
                                             map_size=1024 * 1000000 * 1,
                                             writemap=True,
                                             map_async=True,
                                             max_dbs=1)
                self.updates_index_db = self.updates_env.open_db("updates")
            elif self.lopq_searcher == "LOPQSearcher":
                from lopq.search import LOPQSearcher
                self.searcher = LOPQSearcher(lopq_model)
            else:
                raise ValueError("Unknown 'lopq_searcher' type: {}".format(
                    self.lopq_searcher))
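The comments in init_searcher() describe the sizing heuristic for the feature-cache LMDB: raw features for PCA training plus PCA-projected features for LOPQ training, scaled by 4 * 8 and padded by 10%. A small worked example of that arithmetic follows; every number is hypothetical and would normally come from the indexer configuration (featurizer_type, nb_train, nb_train_pca, model_params['pca']).

# Hypothetical values for illustration only.
nb_train_pca = 100000   # vectors used to train PCA
nb_train = 100000       # vectors used to train LOPQ
feat_size = 2048        # raw feature dimension
pca_dim = 256           # model_params['pca']

# Same formula as the "lopq_pca" branch above, plus 10% headroom.
map_size = nb_train_pca * feat_size * 4 * 8
map_size += nb_train * pca_dim * 4 * 8
map_size = int(1.1 * map_size)
print('LMDB map_size: {} bytes (~{:.1f} GB)'.format(map_size, map_size / 1e9))
# -> LMDB map_size: 8110080000 bytes (~8.1 GB)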
Example #9
class Clustering(object):
    def __init__(self,
                 fnames,
                 n_components,
                 model_proto_filename,
                 m,
                 v,
                 sub,
                 test_mode=False,
                 dc=None):
        """
        Simplify this mess by having a separate create vs load/init
        """
        data = []
        self.dc = dc
        self.fnames = fnames
        self.entries = []
        for fname in fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                nmat = nmat.squeeze()
            data.append(nmat)
            for e in json.load(file(fname.replace('npy', 'json'))):
                self.entries.append(e)
        if data:
            if len(data) > 1:
                self.data = np.concatenate(data)
            else:
                self.data = data[0]
            logging.info(self.data.shape)
        self.test_mode = test_mode
        self.n_components = n_components
        self.m = m
        self.v = v
        self.sub = sub
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.model_proto_filename = model_proto_filename
        self.P_filename = model_proto_filename.replace('.proto', '.P.npy')
        self.mu_filename = model_proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = model_proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = model_proto_filename.replace(
            '.proto', '_lmdb')
        self.permuted_inds_filename = model_proto_filename.replace(
            '.proto', '.permuted_inds.pkl')
        self.permuted_inds = None

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a variance-balancing
        dimension permutation.
        """
        count, D = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), self.data,
                              np.zeros((D, D)))
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v,
                               M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(train, n_init=1)
        for i, e in enumerate(
                self.entries):  # avoid doing this twice again in searcher
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        if self.test_mode:
            self.searcher.add_data(train)
            nns = compute_all_neighbors(test, train)
            recall, _ = get_recall(self.searcher, test, nns)
            print 'Recall (V=%d, M=%d, subquants=%d): %s' % (
                self.model.V, self.model.M, self.model.subquantizer_clusters,
                str(recall))
        else:
            self.searcher.add_data(self.data)

    def find(self):
        i, selected = random.choice([k for k in enumerate(self.entries)])
        print selected
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print k

    def save(self):
        self.model.export_proto(self.model_proto_filename)
        with open(self.pca_filename, 'w') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'w') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'w') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'w') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        self.pca_reduction = pickle.load(file(self.pca_filename))
        self.P = np.load(file(self.P_filename))
        self.mu = np.load(file(self.mu_filename))
        self.permuted_inds = np.load(file(self.permuted_inds_filename))
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu),
                        self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector, quota=count)
        else:
            results = None
        return codes.coarse, codes.fine, results
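Example #9 defines the full train/save/load/query lifecycle of the Clustering class. A hedged usage sketch, meant to run in the module that defines Clustering; the .npy feature files (each with a sibling .json entries file, as the constructor expects), the output file names and the LOPQ parameters are all placeholders.

fnames = ['features_0.npy', 'features_1.npy']   # placeholder feature files

c = Clustering(fnames,
               n_components=64,                  # placeholder PCA size
               model_proto_filename='lopq_model.proto',
               m=8, v=16, sub=256)               # placeholder LOPQ parameters
c.cluster()   # fit PCA + rotation + LOPQ and fill the LMDB searcher
c.save()      # writes .proto, .P.npy, .mu.npy, .pca.pkl, .permuted_inds.pkl

# Later: restore the artifacts and query with one raw feature vector.
c2 = Clustering(fnames, 64, 'lopq_model.proto', m=8, v=16, sub=256)
c2.load()
query = np.load(fnames[0])[0].reshape(1, -1)
coarse, fine, results = c2.apply(query, count=10)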
Example #10
class Clustering(object):

    def __init__(self,fnames,n_components,model_proto_filename,m,v,sub,test_mode=False,dc=None):
        """
        Simplify this mess by having a separate create vs load/init
        """
        data = []
        self.dc = dc
        self.fnames = fnames
        self.entries = []
        for fname in fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                nmat = nmat.squeeze()
            data.append(nmat)
            for e in json.load(file(fname.replace('npy','json'))):
                self.entries.append(e)
        if data:
            if len(data) > 1:
                self.data = np.concatenate(data)
            else:
                self.data = data[0]
            logging.info(self.data.shape)
        self.test_mode = test_mode
        self.n_components = n_components
        self.m = m
        self.v = v
        self.sub = sub
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.model_proto_filename = model_proto_filename
        self.P_filename = model_proto_filename.replace('.proto','.P.npy')
        self.mu_filename = model_proto_filename.replace('.proto','.mu.npy')
        self.pca_filename = model_proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = model_proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = model_proto_filename.replace('.proto', '.permuted_inds.pkl')
        self.permuted_inds = None

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a variance-balancing
        dimension permutation.
        """
        count, D = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), self.data, np.zeros((D, D)))
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data,self.P)
        train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v, M=self.m, subquantizer_clusters=self.sub)
        self.model.fit(train, n_init=1)
        for i,e in enumerate(self.entries): # avoid doing this twice again in searcher
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model,self.model_lmdb_filename)
        if self.test_mode:
            self.searcher.add_data(train)
            nns = compute_all_neighbors(test, train)
            recall, _ = get_recall(self.searcher, test, nns)
            print 'Recall (V=%d, M=%d, subquants=%d): %s' % (self.model.V, self.model.M, self.model.subquantizer_clusters, str(recall))
        else:
            self.searcher.add_data(self.data)

    def find(self):
        i,selected = random.choice([k for k in enumerate(self.entries)])
        print selected
        for k in self.searcher.get_result_quota(self.data[i],10):
            print k

    def save(self):
        self.model.export_proto(self.model_proto_filename)
        with open(self.pca_filename,'w') as out:
            pickle.dump(self.pca_reduction,out)
        with open(self.P_filename, 'w') as out:
            np.save(out,self.P)
        with open(self.mu_filename, 'w') as out:
            np.save(out,self.mu)
        with open(self.permuted_inds_filename, 'w') as out:
            pickle.dump(self.permuted_inds,out)
        self.searcher.env.close()

    def load(self):
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        self.pca_reduction = pickle.load(file(self.pca_filename))
        self.P = np.load(file(self.P_filename))
        self.mu = np.load(file(self.mu_filename))
        self.permuted_inds = np.load(file(self.permuted_inds_filename))
        self.searcher = LOPQSearcherLMDB(model=self.model,lmdb_path=self.model_lmdb_filename)

    def apply(self,vector,count=None):
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu), self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector,quota=count)
        else:
            results = None
        return codes.coarse,codes.fine,results
Example #11
class LOPQRetriever(BaseRetriever):
    def __init__(self, name, args):
        data = []
        self.name = name
        self.fnames = args.get('fnames', [])
        self.entries = []
        for fname in self.fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                logging.info("squeezing  shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
                nmat = nmat.squeeze(axis=1)
            elif nmat.ndim == 1:
                logging.info("expanding  shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
                nmat = np.expand_dims(nmat, axis=0)
            else:
                logging.info(
                    "keeping same  shape {} with dimensions {}".format(
                        nmat.shape, nmat.ndim))
            data.append(nmat)
            for e in json.load(file(fname.replace('npy', 'json'))):
                self.entries.append(e)
        if data:
            if len(data) > 1:
                self.data = np.concatenate(data)
            else:
                self.data = data[0]
            logging.info(self.data.shape)
        self.test_mode = args.get('test_mode', False)
        self.n_components = int(args['components'])
        self.m = int(args['m'])
        self.v = int(args['v'])
        self.sub = int(args['sub'])
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.model_proto_filename = args['proto_filename']
        self.P_filename = args['proto_filename'].replace('.proto', '.P.npy')
        self.mu_filename = args['proto_filename'].replace('.proto', '.mu.npy')
        self.pca_filename = args['proto_filename'].replace(
            '.proto', '.pca.pkl')
        self.model_lmdb_filename = args['proto_filename'].replace(
            '.proto', '_lmdb')
        self.permuted_inds_filename = args['proto_filename'].replace(
            '.proto', '.permuted_inds.pkl')
        self.permuted_inds = None

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a variance-balancing
        dimension permutation.
        """
        count, D = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), self.data,
                              np.zeros((D, D)))
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        # train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v,
                               M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(self.data, n_init=1)  # replace self.data by train
        for i, e in enumerate(
                self.entries):  # avoid doing this twice again in searcher
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        # if self.test_mode:
        #     self.searcher.add_data(train)
        #     nns = compute_all_neighbors(test, train)
        #     recall, _ = get_recall(self.searcher, test, nns)
        #     print 'Recall (V=%d, M=%d, subquants=%d): %s' % (self.model.V, self.model.M, self.model.subquantizer_clusters, str(recall))
        # else:
        self.searcher.add_data(self.data)

    def find(self):
        i, selected = random.choice([k for k in enumerate(self.entries)])
        print selected
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print k

    def save(self):
        with open(self.model_proto_filename, 'w') as f:
            self.model.export_proto(f)
            with open(self.pca_filename, 'w') as out:
                pickle.dump(self.pca_reduction, out)
            with open(self.P_filename, 'w') as out:
                np.save(out, self.P)
            with open(self.mu_filename, 'w') as out:
                np.save(out, self.mu)
            with open(self.permuted_inds_filename, 'w') as out:
                pickle.dump(self.permuted_inds, out)
            self.searcher.env.close()

    def load(self):
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        self.pca_reduction = pickle.load(file(self.pca_filename))
        self.P = np.load(file(self.P_filename))
        self.mu = np.load(file(self.mu_filename))
        self.permuted_inds = np.load(file(self.permuted_inds_filename))
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu),
                        self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector, quota=count)
        else:
            results = None
        return codes.coarse, codes.fine, results

    def nearest(self, vector=None, n=12, retriever_pk=None, entry_getter=None):
        results = []
        coarse, fine, results_indexes = self.apply(vector, n)
        for i, k in enumerate(results_indexes[0]):
            e = entry_getter(k.id, retriever_pk)
            if e.detection_id:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'detection_primary_key': e.detection_id,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'detection',
                })
            else:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'frame',
                })
        return results
Example #12
class LOPQRetriever(BaseRetriever):
    def __init__(self, name, proto_filename, args, test_mode=False):
        super(BaseRetriever, self).__init__()
        self.name = name
        self.proto_filename = proto_filename
        self.entries = []
        self.test_mode = test_mode
        self.n_components = int(args['components'])
        self.m = int(args['m'])
        self.v = int(args['v'])
        self.sub = int(args['sub'])
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.permuted_inds = None
        self.model_proto_filename = proto_filename
        self.P_filename = proto_filename.replace('.proto', '.P.npy')
        self.entries_filename = proto_filename.replace('.proto', '.json')
        self.mu_filename = proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = proto_filename.replace(
            '.proto', '.permuted_inds.pkl')

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a variance-balancing
        dimension permutation.
        """
        count, D = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), self.data,
                              np.zeros((D, D)))
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        self.data = self.index
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        self.model = LOPQModel(V=self.v,
                               M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(self.data, n_init=1)  # replace self.data by train
        for i, e in enumerate(
                self.entries):  # avoid doing this twice again in searcher
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        self.searcher.add_data(self.data)
        # cluster_codes = []
        # for e in c.entries:
        #     cc.video_id = e['video_primary_key']
        #     if 'detection_primary_key' in e:
        #         cc.detection_id = e['detection_primary_key']
        #         cc.frame_id = Region.objects.get(pk=cc.detection_id).frame_id
        #     else:
        #         cc.frame_id = e['frame_primary_key']
        #     cc.clusters = dc
        #     cc.coarse = e['coarse']
        #     cc.fine = e['fine']
        #     cc.coarse_text = " ".join(map(str, e['coarse']))
        #     cc.fine_text = " ".join(map(str, e['fine']))
        #     cc.searcher_index = e['index']
        #     cluster_codes.append(cc)

    def find(self):
        i, selected = random.choice([k for k in enumerate(self.entries)])
        print selected
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print k

    def save(self):
        with open(self.model_proto_filename, 'w') as f:
            self.model.export_proto(f)
            with open(self.pca_filename, 'w') as out:
                pickle.dump(self.pca_reduction, out)
            with open(self.P_filename, 'w') as out:
                np.save(out, self.P)
            with open(self.mu_filename, 'w') as out:
                np.save(out, self.mu)
            with open(self.entries_filename, 'w') as out:
                json.dump(self.entries, out)
            with open(self.permuted_inds_filename, 'w') as out:
                pickle.dump(self.permuted_inds, out)
            self.searcher.env.close()

    def load(self):
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        self.pca_reduction = pickle.load(file(self.pca_filename))
        self.P = np.load(file(self.P_filename))
        self.mu = np.load(file(self.mu_filename))
        self.permuted_inds = np.load(file(self.permuted_inds_filename))
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu),
                        self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector, quota=count)
        else:
            results = None
        return codes.coarse, codes.fine, results

    def nearest(self, vector=None, n=12, retriever_pk=None, entry_getter=None):
        results = []
        coarse, fine, results_indexes = self.apply(vector, n)
        for i, k in enumerate(results_indexes[0]):
            e = entry_getter(k.id, retriever_pk)
            if e.detection_id:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'detection_primary_key': e.detection_id,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'detection',
                })
            else:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'frame',
                })
        return results
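nearest() above delegates entry lookup to an injected entry_getter callback and only formats its results. A minimal sketch of that contract with hypothetical stand-in classes; a real entry_getter would query the application's database, and retriever/vector are assumed to already exist (a loaded LOPQRetriever and a raw feature row).

class _Frame(object):
    def __init__(self, frame_index):
        self.frame_index = frame_index


class _Entry(object):
    """Hypothetical stand-in for the record nearest() expects back."""
    def __init__(self, entry_id):
        self.detection_id = None             # falsy -> reported as a 'frame' hit
        self.frame_id = entry_id
        self.video_id = 1
        self.frame = _Frame(frame_index=entry_id)


def entry_getter(entry_id, retriever_pk):
    # A real implementation looks the entry up by id for the given retriever.
    return _Entry(entry_id)

# With a loaded retriever and a query feature vector:
# results = retriever.nearest(vector=vector, n=12, retriever_pk=0,
#                             entry_getter=entry_getter)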