# Shared imports assumed by the LOPQ indexing snippets in this section.
import json
import logging
import pickle
import random
import numpy as np
from functools import reduce
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from lopq import LOPQModel
from lopq.search import LOPQSearcherLMDB
from lopq.eval import compute_all_neighbors, get_recall
from lopq.model import eigenvalue_allocation


def cluster(self):
    # Reduce with PCA, then center and rotate with the variance-balancing
    # permutation computed by self.pca().
    self.pca_reduction = PCA(n_components=self.n_components)
    self.pca_reduction.fit(self.data)
    self.data = self.pca_reduction.transform(self.data)
    self.P, self.mu = self.pca()
    self.data = self.data - self.mu
    self.data = np.dot(self.data, self.P)
    train, test = train_test_split(self.data, test_size=0.2)
    self.model = LOPQModel(V=self.v, M=self.m, subquantizer_clusters=self.sub)
    self.model.fit(train, n_init=1)
    # Cache the codes on each entry to avoid recomputing them in the searcher.
    for i, e in enumerate(self.entries):
        r = self.model.predict(self.data[i])
        e['coarse'] = r.coarse
        e['fine'] = r.fine
        e['index'] = i
    self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
    if self.test_mode:
        self.searcher.add_data(train)
        nns = compute_all_neighbors(test, train)
        recall, _ = get_recall(self.searcher, test, nns)
        print('Recall (V=%d, M=%d, subquants=%d): %s' % (
            self.model.V, self.model.M, self.model.subquantizer_clusters,
            str(recall)))
    else:
        self.searcher.add_data(self.data)
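# --- Hedged usage sketch (not from the original source): the evaluation path
# used by the test branch above, as a self-contained example on random data.
# Assumes the yahoo/lopq package APIs; LOPQSearcher is the in-memory
# counterpart of LOPQSearcherLMDB.
import numpy as np
from sklearn.model_selection import train_test_split
from lopq import LOPQModel, LOPQSearcher
from lopq.eval import compute_all_neighbors, get_recall

data = np.random.randn(2000, 32)
train, test = train_test_split(data, test_size=0.2)
model = LOPQModel(V=8, M=4, subquantizer_clusters=32)
model.fit(train, n_init=1)

searcher = LOPQSearcher(model)
searcher.add_data(train)
nns = compute_all_neighbors(test, train)  # brute-force ground-truth neighbors
recall, _ = get_recall(searcher, test, nns)
print('Recall (V=%d, M=%d, subquants=%d): %s' % (
    model.V, model.M, model.subquantizer_clusters, str(recall)))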
def load(self):
    self.model = LOPQModel.load_proto(self.model_proto_filename)
    self.pca_reduction = pickle.load(open(self.pca_filename, 'rb'))
    self.P = np.load(open(self.P_filename, 'rb'))
    self.mu = np.load(open(self.mu_filename, 'rb'))
    # permuted_inds is written with pickle.dump() in save(), so read it back
    # with pickle rather than np.load.
    self.permuted_inds = pickle.load(open(self.permuted_inds_filename, 'rb'))
    self.searcher = LOPQSearcherLMDB(model=self.model,
                                     lmdb_path=self.model_lmdb_filename)
def test_searcher_lmdb():
    import shutil
    import pickle as pkl
    # `relpath` resolves paths relative to this test file (helper from the
    # lopq test suite).
    data = pkl.load(open(relpath('./testdata/test_searcher_data.pkl'), 'rb'))
    m = LOPQModel.load_proto(relpath('./testdata/random_test_model.lopq'))
    lmdb_test_path = './test_lopq_lmdb'
    q = np.ones(8)

    # Test add_data
    searcher = LOPQSearcherLMDB(m, lmdb_test_path)
    searcher.add_data(data)
    searcher_instance_battery(searcher, q)

    # Clean up
    shutil.rmtree(lmdb_test_path)

    # Test add_codes
    searcher = LOPQSearcherLMDB(m, lmdb_test_path)
    codes = [m.predict(x) for x in data]
    searcher.add_codes(codes)
    searcher_instance_battery(searcher, q)

    # Clean up
    shutil.rmtree(lmdb_test_path)
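# --- Hedged sketch (not from the original source): `searcher_instance_battery`
# is a helper from the lopq test suite that is not shown here. The hypothetical
# stand-in below illustrates the kind of checks it performs, using only the
# public searcher methods get_result_quota() and search().
def searcher_instance_battery_sketch(searcher, q):
    # get_result_quota() returns candidate items plus the number of coarse
    # cells visited while gathering the quota.
    retrieved, visited = searcher.get_result_quota(q, quota=10)
    assert len(retrieved) >= 10
    # search() reranks the retrieved quota by LOPQ distance estimates.
    results, visited = searcher.search(q, quota=10, with_dists=True)
    assert len(list(results)) <= 10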
def cluster(self):
    self.data = self.index
    self.pca_reduction = PCA(n_components=self.n_components)
    self.pca_reduction.fit(self.data)
    self.data = self.pca_reduction.transform(self.data)
    self.P, self.mu = self.pca()
    self.data = self.data - self.mu
    self.data = np.dot(self.data, self.P)
    self.model = LOPQModel(V=self.v, M=self.m, subquantizer_clusters=self.sub)
    self.model.fit(self.data, n_init=1)  # TODO: replace self.data by a train split
    # Cache the codes on each entry to avoid recomputing them in the searcher.
    for i, e in enumerate(self.entries):
        r = self.model.predict(self.data[i])
        e['coarse'] = r.coarse
        e['fine'] = r.fine
        e['index'] = i
    self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
    self.searcher.add_data(self.data)
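# --- Hedged helper sketch (not from the original source). The query-side
# transform must mirror the index-time transform in cluster() above: reduce
# with the same fitted sklearn PCA, subtract mu, rotate by P. The apply()
# methods later in this section inline exactly these steps; this hypothetical
# helper just names them.
import numpy as np

def project_query(vector, pca_reduction, mu, P):
    """Project one raw feature vector into the space the LOPQ model was fit in."""
    reduced = pca_reduction.transform(np.atleast_2d(vector))  # PCA reduction
    return np.dot(reduced - mu, P).squeeze()                  # center + rotate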
# Method from the ColumbiaImageSearch searcher class; it assumes module-level
# names from that code base (sys, lmdb, S3Storer, full_trace_error,
# get_feat_size).
def init_searcher(self):
    """ Initialize LOPQ model and searcher from `global_conf` value. """
    try:
        # Try to load pretrained model from storer
        lopq_model = self.storer.load(self.build_model_str())
        if lopq_model is None:
            raise ValueError("Could not load model from storer.")
        # if self.verbose > 1:
        #     print("pca_mu.shape: {}".format(lopq_model.pca_mu.shape))
        #     print("pca_P.shape: {}".format(lopq_model.pca_P.shape))
    except Exception as inst:
        if type(inst) != ValueError:
            full_trace_error(inst)
        print("[{}: log] Looks like model was not trained yet ({})".format(self.pp, inst))
        self.loaded_pretrain_model = False

        # Try to get it from the public bucket, e.g.:
        # https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000
        if self.get_pretrained_model:
            log_msg = "[{}: log] Trying to retrieve pre-trained model {} from s3"
            print(log_msg.format(self.pp, self.build_model_str()))
            from ..common.dl import download_file
            import pickle
            try:
                base_model_path = "https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/"
                # This can fail with a "retrieval incomplete: got only ..." error
                download_file(base_model_path + self.build_model_str(),
                              self.build_model_str())
                lopq_model = pickle.load(open(self.build_model_str(), 'rb'))
                # Avoid overwriting the model in s3 when the s3storer uses the
                # dig-cu-imagesearchindex bucket
                is_s3_storer = isinstance(self.storer, S3Storer)
                if is_s3_storer and self.storer.bucket_name == "dig-cu-imagesearchindex":
                    log_msg = "[{}: log] Skipping saving model {} back to s3"
                    print(log_msg.format(self.pp, self.build_model_str()))
                else:
                    log_msg = "[{}: log] Saving model {} to storer"
                    print(log_msg.format(self.pp, self.build_model_str()))
                    self.storer.save(self.build_model_str(), lopq_model)
                log_msg = "[{}: log] Loaded pretrained model {} from s3"
                print(log_msg.format(self.pp, self.build_model_str()))
                self.loaded_pretrain_model = True
            except Exception as inst:
                log_msg = "[{}: log] Could not load pretrained model {} from s3: {}"
                full_trace_error(log_msg.format(self.pp, self.build_model_str(), inst))
                sys.stdout.flush()
        else:
            log_msg = "[{}: log] Skipped retrieving pre-trained model from s3 as requested."
            print(log_msg.format(self.pp, self.build_model_str()))

        if not self.loaded_pretrain_model:
            # This is from our modified LOPQ package:
            # https://github.com/ColumbiaDVMM/ColumbiaImageSearch/tree/master/workflows/build-lopq-index/lopq/python
            # 'LOPQModelPCA' could be the type of the model loaded from a pickle file:
            # from lopq.model import LOPQModel, LOPQModelPCA

            # The size of the DB should depend on nb_train:
            # nb_train_pca * size_feat + nb_train * size_feat_pca
            feat_size = get_feat_size(self.featurizer_type)
            if self.model_type == "lopq_pca":
                map_size = self.nb_train_pca * feat_size * 4 * 8
                map_size += self.nb_train * self.model_params['pca'] * 4 * 8
            else:
                map_size = self.nb_train * feat_size * 4 * 8
            self.save_feat_env = lmdb.open('/data/lmdb_feats_' + self.build_model_str(),
                                           map_size=int(1.1 * map_size),
                                           writemap=True, map_async=True, max_dbs=2)

            # Train and save model in the save_path folder
            lopq_model = self.train_index()
            # TODO: we could build a more unique model identifier
            # (using domain information? sha1/md5 of model parameters? date of training?)
            # That would also mean we should list models from the storer and guess
            # (based on date of creation) the correct model above...
            self.storer.save(self.build_model_str(), lopq_model)

    # Setup searcher with the LOPQ model
    if lopq_model:
        # LOPQSearcherLMDB is now the default, as it makes the index more
        # persistent and potentially more easily usable by multiple processes.
        if self.lopq_searcher == "LOPQSearcherLMDB":
            from lopq.search import LOPQSearcherLMDB
            # TODO: should we get the paths from a parameter? and/or add model_str to them?
            # self.searcher = LOPQSearcherLMDB(lopq_model, lmdb_path='./lmdb_index/', id_lambda=str)
            # self.updates_env = lmdb.open('./lmdb_updates/', map_size=1024 * 1000000 * 1,
            #                              writemap=True, map_async=True, max_dbs=1)
            self.searcher = LOPQSearcherLMDB(lopq_model,
                                             lmdb_path='/data/lmdb_index_' + self.build_model_str(),
                                             id_lambda=str)
            # How could we properly set the size of this?
            self.updates_env = lmdb.open('/data/lmdb_updates_' + self.build_model_str(),
                                         map_size=1024 * 1000000 * 1,
                                         writemap=True, map_async=True, max_dbs=1)
            self.updates_index_db = self.updates_env.open_db("updates")
        elif self.lopq_searcher == "LOPQSearcher":
            from lopq.search import LOPQSearcher
            self.searcher = LOPQSearcher(lopq_model)
        else:
            raise ValueError("Unknown 'lopq_searcher' type: {}".format(self.lopq_searcher))
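# --- Hedged worked example (not from the original source). The map_size
# arithmetic above answers the "how should we properly set the size" question
# as: bytes per float32 value (4) times a slack factor (8) for each stored
# value, plus 10% headroom at lmdb.open(). With hypothetical values matching
# the pretrained-model name above (100k training vectors, 256-dim PCA from
# 4096-dim raw features):
nb_train_pca, feat_size = 100000, 4096   # raw features used to fit the PCA
nb_train, pca_dim = 100000, 256          # PCA-reduced features used for LOPQ
map_size = nb_train_pca * feat_size * 4 * 8
map_size += nb_train * pca_dim * 4 * 8
print(int(1.1 * map_size))               # ~15.3e9 bytes reserved for the LMDB map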
class Clustering(object):
    def __init__(self, fnames, n_components, model_proto_filename, m, v, sub,
                 test_mode=False, dc=None):
        """
        TODO: simplify this mess by having separate create vs. load/init paths.
        """
        data = []
        self.dc = dc
        self.fnames = fnames
        self.entries = []
        for fname in fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                nmat = nmat.squeeze()
            data.append(nmat)
            for e in json.load(open(fname.replace('npy', 'json'))):
                self.entries.append(e)
        if data:
            if len(data) > 1:
                self.data = np.concatenate(data)
            else:
                self.data = data[0]
            logging.info(self.data.shape)
        self.test_mode = test_mode
        self.n_components = n_components
        self.m = m
        self.v = v
        self.sub = sub
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.model_proto_filename = model_proto_filename
        self.P_filename = model_proto_filename.replace('.proto', '.P.npy')
        self.mu_filename = model_proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = model_proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = model_proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = model_proto_filename.replace('.proto', '.permuted_inds.pkl')
        self.permuted_inds = None

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a
        variance-balancing dimension permutation.
        """
        count, D = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), self.data,
                              np.zeros((D, D)))
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v, M=self.m, subquantizer_clusters=self.sub)
        self.model.fit(train, n_init=1)
        # Cache the codes on each entry to avoid recomputing them in the searcher.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        if self.test_mode:
            self.searcher.add_data(train)
            nns = compute_all_neighbors(test, train)
            recall, _ = get_recall(self.searcher, test, nns)
            print('Recall (V=%d, M=%d, subquants=%d): %s' % (
                self.model.V, self.model.M, self.model.subquantizer_clusters,
                str(recall)))
        else:
            self.searcher.add_data(self.data)

    def find(self):
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        self.model.export_proto(self.model_proto_filename)
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        self.pca_reduction = pickle.load(open(self.pca_filename, 'rb'))
        self.P = np.load(open(self.P_filename, 'rb'))
        self.mu = np.load(open(self.mu_filename, 'rb'))
        # Written with pickle.dump() in save(), so read back with pickle.
        self.permuted_inds = pickle.load(open(self.permuted_inds_filename, 'rb'))
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        # Mirror the index-time transform, then quantize and (optionally) search.
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu),
                        self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector, quota=count)
        else:
            results = None
        return codes.coarse, codes.fine, results
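# --- Hedged usage sketch (not from the original source). Driving the
# Clustering class above end to end; the .npy/.json pairs and the .proto path
# are hypothetical placeholders.
fnames = ['/data/feats_0.npy', '/data/feats_1.npy']  # each with a sibling .json of entries
c = Clustering(fnames, n_components=64, model_proto_filename='/data/model.proto',
               m=8, v=16, sub=256)
c.cluster()   # fit PCA + rotation + LOPQ model, build the LMDB index
c.save()      # persist model/P/mu/PCA/permuted_inds next to the .proto

c2 = Clustering(fnames, 64, '/data/model.proto', m=8, v=16, sub=256)
c2.load()     # restore the trained model and reopen the LMDB searcher
coarse, fine, results = c2.apply(c2.data[0].reshape(1, -1), count=10)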
class LOPQRetriever(BaseRetriever):
    def __init__(self, name, args):
        data = []
        self.name = name
        self.fnames = args.get('fnames', [])
        self.entries = []
        for fname in self.fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                logging.info("squeezing shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
                nmat = nmat.squeeze(axis=1)
            elif nmat.ndim == 1:
                logging.info("expanding shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
                nmat = np.expand_dims(nmat, axis=0)
            else:
                logging.info("keeping same shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
            data.append(nmat)
            for e in json.load(open(fname.replace('npy', 'json'))):
                self.entries.append(e)
        if data:
            if len(data) > 1:
                self.data = np.concatenate(data)
            else:
                self.data = data[0]
            logging.info(self.data.shape)
        self.test_mode = args.get('test_mode', False)
        self.n_components = int(args['components'])
        self.m = int(args['m'])
        self.v = int(args['v'])
        self.sub = int(args['sub'])
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.model_proto_filename = args['proto_filename']
        self.P_filename = args['proto_filename'].replace('.proto', '.P.npy')
        self.mu_filename = args['proto_filename'].replace('.proto', '.mu.npy')
        self.pca_filename = args['proto_filename'].replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = args['proto_filename'].replace('.proto', '_lmdb')
        self.permuted_inds_filename = args['proto_filename'].replace('.proto', '.permuted_inds.pkl')
        self.permuted_inds = None

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a
        variance-balancing dimension permutation.
        """
        count, D = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), self.data,
                              np.zeros((D, D)))
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        # train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v, M=self.m, subquantizer_clusters=self.sub)
        self.model.fit(self.data, n_init=1)  # TODO: replace self.data by a train split
        # Cache the codes on each entry to avoid recomputing them in the searcher.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        # if self.test_mode:
        #     self.searcher.add_data(train)
        #     nns = compute_all_neighbors(test, train)
        #     recall, _ = get_recall(self.searcher, test, nns)
        #     print('Recall (V=%d, M=%d, subquants=%d): %s' % (
        #         self.model.V, self.model.M, self.model.subquantizer_clusters, str(recall)))
        # else:
        self.searcher.add_data(self.data)

    def find(self):
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        with open(self.model_proto_filename, 'wb') as f:
            self.model.export_proto(f)
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        self.pca_reduction = pickle.load(open(self.pca_filename, 'rb'))
        self.P = np.load(open(self.P_filename, 'rb'))
        self.mu = np.load(open(self.mu_filename, 'rb'))
        # Written with pickle.dump() in save(), so read back with pickle.
        self.permuted_inds = pickle.load(open(self.permuted_inds_filename, 'rb'))
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        # Mirror the index-time transform, then quantize and (optionally) search.
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu),
                        self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector, quota=count)
        else:
            results = None
        return codes.coarse, codes.fine, results

    def nearest(self, vector=None, n=12, retriever_pk=None, entry_getter=None):
        results = []
        coarse, fine, results_indexes = self.apply(vector, n)
        for i, k in enumerate(results_indexes[0]):
            e = entry_getter(k.id, retriever_pk)
            if e.detection_id:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'detection_primary_key': e.detection_id,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'detection',
                })
            else:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'frame',
                })
        return results
class LOPQRetriever(BaseRetriever):
    def __init__(self, name, proto_filename, args, test_mode=False):
        super(LOPQRetriever, self).__init__()
        self.name = name
        self.proto_filename = proto_filename
        self.entries = []
        self.test_mode = test_mode
        self.n_components = int(args['components'])
        self.m = int(args['m'])
        self.v = int(args['v'])
        self.sub = int(args['sub'])
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.permuted_inds = None
        self.model_proto_filename = proto_filename
        self.P_filename = proto_filename.replace('.proto', '.P.npy')
        self.entries_filename = proto_filename.replace('.proto', '.json')
        self.mu_filename = proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = proto_filename.replace('.proto', '.permuted_inds.pkl')

    def pca(self):
        """
        A simple PCA implementation that demonstrates how eigenvalue allocation
        is used to permute dimensions in order to balance the variance across
        subvectors. There are plenty of PCA implementations elsewhere. What is
        important is that the eigenvalues can be used to compute a
        variance-balancing dimension permutation.
        """
        count, D = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        summed_covar = reduce(lambda acc, x: acc + np.outer(x, x), self.data,
                              np.zeros((D, D)))
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        self.data = self.index
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = self.data - self.mu
        self.data = np.dot(self.data, self.P)
        self.model = LOPQModel(V=self.v, M=self.m, subquantizer_clusters=self.sub)
        self.model.fit(self.data, n_init=1)  # TODO: replace self.data by a train split
        # Cache the codes on each entry to avoid recomputing them in the searcher.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        self.searcher.add_data(self.data)
        # cluster_codes = []
        # for e in c.entries:
        #     cc.video_id = e['video_primary_key']
        #     if 'detection_primary_key' in e:
        #         cc.detection_id = e['detection_primary_key']
        #         cc.frame_id = Region.objects.get(pk=cc.detection_id).frame_id
        #     else:
        #         cc.frame_id = e['frame_primary_key']
        #     cc.clusters = dc
        #     cc.coarse = e['coarse']
        #     cc.fine = e['fine']
        #     cc.coarse_text = " ".join(map(str, e['coarse']))
        #     cc.fine_text = " ".join(map(str, e['fine']))
        #     cc.searcher_index = e['index']
        #     cluster_codes.append(cc)

    def find(self):
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        with open(self.model_proto_filename, 'wb') as f:
            self.model.export_proto(f)
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.entries_filename, 'w') as out:
            json.dump(self.entries, out)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        self.pca_reduction = pickle.load(open(self.pca_filename, 'rb'))
        self.P = np.load(open(self.P_filename, 'rb'))
        self.mu = np.load(open(self.mu_filename, 'rb'))
        # Written with pickle.dump() in save(), so read back with pickle.
        self.permuted_inds = pickle.load(open(self.permuted_inds_filename, 'rb'))
        # NOTE: the entries written by save() are not reloaded here.
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        # Mirror the index-time transform, then quantize and (optionally) search.
        vector = np.dot((self.pca_reduction.transform(vector) - self.mu),
                        self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        if count:
            results = self.searcher.search(vector, quota=count)
        else:
            results = None
        return codes.coarse, codes.fine, results

    def nearest(self, vector=None, n=12, retriever_pk=None, entry_getter=None):
        results = []
        coarse, fine, results_indexes = self.apply(vector, n)
        for i, k in enumerate(results_indexes[0]):
            e = entry_getter(k.id, retriever_pk)
            if e.detection_id:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'detection_primary_key': e.detection_id,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'detection',
                })
            else:
                results.append({
                    'rank': i + 1,
                    'dist': i,
                    'frame_index': e.frame.frame_index,
                    'frame_primary_key': e.frame_id,
                    'video_primary_key': e.video_id,
                    'type': 'frame',
                })
        return results