class ApproximateIndexer(object):
    """Approximate nearest-neighbour index built around an LOPQ model.

    Vectors are reduced with a PCA, then centred and rotated with the
    ``P``/``mu`` pair returned by ``pca`` before they reach the LOPQ model
    and searcher.
    """

    def __init__(self, index_name, model_path, lmdb_path, V=16, M=16,
                 n_components=256):
        """Create an unfitted indexer.

        :param index_name: label stored on the instance.
        :param model_path: path the model proto is exported to / loaded from.
        :param lmdb_path: stored on the instance; the searcher created by
            ``fit_model`` is in-memory and does not use it.
        :param V: first positional argument given to ``LOPQModel``.
        :param M: second positional argument given to ``LOPQModel``.
        :param n_components: number of PCA components used by ``fit``
            (previously hard-coded to 256).
        """
        self.model = LOPQModel(V, M)
        self.index_name = index_name
        self.searcher = None
        self.model_path = model_path
        self.lmdb_path = lmdb_path
        self.n_components = n_components
        # Populated by fit().
        self.pca_reduction = None
        self.P = None
        self.mu = None

    def load(self):
        """Replace ``self.model`` with the model stored at ``model_path``.

        Only the LOPQ model is restored here; ``pca_reduction``, ``P``,
        ``mu`` and ``searcher`` are left untouched.
        """
        # load_proto returns the reconstituted model, so the result must be
        # kept; calling it and dropping the return value loads nothing.
        self.model = LOPQModel.load_proto(self.model_path)

    def fit(self, train):
        """Fit the PCA, the rotation and the LOPQ model on ``train``."""
        print(train.shape)
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(train)
        train = self.pca_reduction.transform(train)
        self.P, self.mu = pca(train)
        # Centre before rotating so the model is trained in the same space
        # that transform() maps queries and indexed vectors into.
        train = np.dot(train - self.mu, self.P)
        print(train.shape)
        self.model.fit(train, n_init=1)

    def transform(self, test):
        """Map raw vectors into the space the LOPQ model was fitted in."""
        print(test.shape)
        test = self.pca_reduction.transform(test)
        test = test - self.mu
        test = np.dot(test, self.P)
        print(test.shape)
        return test

    def fit_model(self, train):
        """Fit on ``train``, export the model and create a fresh searcher."""
        self.fit(train)
        self.model.export_proto(self.model_path)
        self.searcher = LOPQSearcher(self.model)

    def experiment(self, data):
        """Fit on 90% of ``data`` and print the recall on the other 10%."""
        train, test = train_test_split(data, test_size=0.1)
        print('{} {} {}'.format(data.shape, train.shape, test.shape))
        nns = compute_all_neighbors(test, train)
        self.fit_model(train)
        self.searcher.add_data(self.transform(train))
        recall, _ = get_recall(self.searcher, self.transform(test), nns)
        print('Recall (V={}, M={}, subquants={}): {}'.format(
            self.model.V, self.model.M, self.model.subquantizer_clusters,
            str(recall)))

    def add_data(self, data):
        """Add already-transformed vectors to the searcher."""
        self.searcher.add_data(data)

    def search(self, x):
        """Search for ``x`` (already transformed) with a quota of 100."""
        return self.searcher.search(x, quota=100)
class Clustering(object):
    """Train, persist and query an LOPQ model over vectors stored in .npy files."""

    def __init__(self, fnames, n_components, model_proto_filename, m, v, sub,
                 test_mode=False, dc=None):
        """Load the vectors and their JSON entries and derive artefact paths.

        TODO: simplify this by having separate create vs. load/init paths.

        :param fnames: paths of ``.npy`` files; for each one, the path with
            ``'npy'`` replaced by ``'json'`` is read for its entries.
        :param n_components: number of PCA components used by ``cluster``.
        :param model_proto_filename: model proto path; the other artefact
            paths are derived from it by replacing ``'.proto'``.
        :param m: ``M`` argument for ``LOPQModel``.
        :param v: ``V`` argument for ``LOPQModel``.
        :param sub: ``subquantizer_clusters`` argument for ``LOPQModel``.
        :param test_mode: when true, ``cluster`` indexes only the training
            split and prints the recall on the held-out split.
        :param dc: stored on the instance as ``dc``.
        """
        data = []
        self.dc = dc
        self.fnames = fnames
        self.entries = []
        for fname in fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                nmat = nmat.squeeze()
            data.append(nmat)
            with open(fname.replace('npy', 'json')) as entries_file:
                self.entries.extend(json.load(entries_file))
        # Always define the attribute so an empty ``fnames`` (e.g. when the
        # instance is only used for load()/apply()) does not leave it missing.
        self.data = None
        if data:
            self.data = np.concatenate(data) if len(data) > 1 else data[0]
            logging.info(self.data.shape)
        self.test_mode = test_mode
        self.n_components = n_components
        self.m = m
        self.v = v
        self.sub = sub
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.permuted_inds = None
        self.model_proto_filename = model_proto_filename
        self.P_filename = model_proto_filename.replace('.proto', '.P.npy')
        self.mu_filename = model_proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = model_proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = model_proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = model_proto_filename.replace(
            '.proto', '.permuted_inds.pkl')

    def pca(self):
        """Return ``(P, mu)`` for ``self.data``.

        A simple PCA implementation that demonstrates how eigenvalue
        allocation is used to permute dimensions in order to balance the
        variance across subvectors. What matters is that the eigenvalues are
        used to compute a variance-balancing dimension permutation, which is
        stored in ``self.permuted_inds`` and applied to the columns of ``P``.
        """
        count, _ = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        # Sum of row outer products == X^T X; one matrix product replaces the
        # per-row Python loop. Accumulate in float64 as the loop did.
        rows = np.asarray(self.data, dtype=np.float64)
        summed_covar = np.dot(rows.T, rows)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        """Fit PCA, rotation and LOPQ model, annotate entries, fill the searcher.

        ``self.data`` is replaced by its projected form.
        """
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = np.dot(self.data - self.mu, self.P)
        train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v, M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(train, n_init=1)
        # Store the codes on the entries here to avoid computing them twice.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        if self.test_mode:
            self.searcher.add_data(train)
            nns = compute_all_neighbors(test, train)
            recall, _ = get_recall(self.searcher, test, nns)
            print('Recall (V=%d, M=%d, subquants=%d): %s' % (
                self.model.V, self.model.M,
                self.model.subquantizer_clusters, str(recall)))
        else:
            self.searcher.add_data(self.data)

    def find(self):
        """Print a random entry followed by its 10-quota search results."""
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        """Write model, PCA, ``P``, ``mu`` and permutation; close the LMDB env."""
        self.model.export_proto(self.model_proto_filename)
        # Pickle and .npy payloads are binary, so open the files in 'wb'.
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        """Restore everything written by ``save`` and reopen the searcher."""
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        # NOTE: unpickling runs arbitrary code; only load trusted artefacts.
        with open(self.pca_filename, 'rb') as src:
            self.pca_reduction = pickle.load(src)
        self.P = np.load(self.P_filename)
        self.mu = np.load(self.mu_filename)
        # save() writes this file with pickle, so read it back with pickle
        # rather than np.load (which rejects pickles unless allow_pickle=True).
        with open(self.permuted_inds_filename, 'rb') as src:
            self.permuted_inds = pickle.load(src)
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        """Project ``vector`` and return ``(coarse, fine, results)``.

        ``results`` is the searcher output for quota ``count``, or ``None``
        when ``count`` is falsy.
        """
        reduced = self.pca_reduction.transform(vector) - self.mu
        vector = np.dot(reduced, self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        results = self.searcher.search(vector, quota=count) if count else None
        return codes.coarse, codes.fine, results
class Clustering(object):
    """Train, persist and query an LOPQ model over vectors stored in .npy files."""

    def __init__(self, fnames, n_components, model_proto_filename, m, v, sub,
                 test_mode=False, dc=None):
        """Load the vectors and their JSON entries and derive artefact paths.

        TODO: simplify this by having separate create vs. load/init paths.

        :param fnames: paths of ``.npy`` files; for each one, the path with
            ``'npy'`` replaced by ``'json'`` is read for its entries.
        :param n_components: number of PCA components used by ``cluster``.
        :param model_proto_filename: model proto path; the other artefact
            paths are derived from it by replacing ``'.proto'``.
        :param m: ``M`` argument for ``LOPQModel``.
        :param v: ``V`` argument for ``LOPQModel``.
        :param sub: ``subquantizer_clusters`` argument for ``LOPQModel``.
        :param test_mode: when true, ``cluster`` indexes only the training
            split and prints the recall on the held-out split.
        :param dc: stored on the instance as ``dc``.
        """
        loaded = []
        self.dc = dc
        self.fnames = fnames
        self.entries = []
        for fname in fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                nmat = nmat.squeeze()
            loaded.append(nmat)
            with open(fname.replace('npy', 'json')) as entries_file:
                self.entries.extend(json.load(entries_file))
        # Always define the attribute so an empty ``fnames`` (e.g. when the
        # instance is only used for load()/apply()) does not leave it missing.
        self.data = None
        if loaded:
            self.data = np.concatenate(loaded) if len(loaded) > 1 else loaded[0]
            logging.info(self.data.shape)
        self.test_mode = test_mode
        self.n_components = n_components
        self.m = m
        self.v = v
        self.sub = sub
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.permuted_inds = None
        self.model_proto_filename = model_proto_filename
        self.P_filename = model_proto_filename.replace('.proto', '.P.npy')
        self.mu_filename = model_proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = model_proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = model_proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = model_proto_filename.replace(
            '.proto', '.permuted_inds.pkl')

    def pca(self):
        """Return ``(P, mu)`` for ``self.data``.

        A simple PCA implementation that demonstrates how eigenvalue
        allocation is used to permute dimensions in order to balance the
        variance across subvectors. What matters is that the eigenvalues are
        used to compute a variance-balancing dimension permutation, which is
        stored in ``self.permuted_inds`` and applied to the columns of ``P``.
        """
        count, _ = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        # Sum of row outer products == X^T X; one matrix product replaces the
        # per-row Python loop. Accumulate in float64 as the loop did.
        rows = np.asarray(self.data, dtype=np.float64)
        summed_covar = np.dot(rows.T, rows)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        """Fit PCA, rotation and LOPQ model, annotate entries, fill the searcher.

        ``self.data`` is replaced by its projected form.
        """
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = np.dot(self.data - self.mu, self.P)
        train, test = train_test_split(self.data, test_size=0.2)
        self.model = LOPQModel(V=self.v, M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(train, n_init=1)
        # Store the codes on the entries here to avoid computing them twice.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        if self.test_mode:
            self.searcher.add_data(train)
            nns = compute_all_neighbors(test, train)
            recall, _ = get_recall(self.searcher, test, nns)
            print('Recall (V=%d, M=%d, subquants=%d): %s' % (
                self.model.V, self.model.M,
                self.model.subquantizer_clusters, str(recall)))
        else:
            self.searcher.add_data(self.data)

    def find(self):
        """Print a random entry followed by its 10-quota search results."""
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        """Write model, PCA, ``P``, ``mu`` and permutation; close the LMDB env."""
        self.model.export_proto(self.model_proto_filename)
        # Pickle and .npy payloads are binary, so open the files in 'wb'.
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        """Restore everything written by ``save`` and reopen the searcher."""
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        # NOTE: unpickling runs arbitrary code; only load trusted artefacts.
        with open(self.pca_filename, 'rb') as src:
            self.pca_reduction = pickle.load(src)
        self.P = np.load(self.P_filename)
        self.mu = np.load(self.mu_filename)
        # save() writes this file with pickle, so read it back with pickle
        # rather than np.load (which rejects pickles unless allow_pickle=True).
        with open(self.permuted_inds_filename, 'rb') as src:
            self.permuted_inds = pickle.load(src)
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        """Project ``vector`` and return ``(coarse, fine, results)``.

        ``results`` is the searcher output for quota ``count``, or ``None``
        when ``count`` is falsy.
        """
        reduced = self.pca_reduction.transform(vector) - self.mu
        vector = np.dot(reduced, self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        results = self.searcher.search(vector, quota=count) if count else None
        return codes.coarse, codes.fine, results
class LOPQTrainer(object):
    """Train an LOPQ model on a data matrix and write it out with a manifest."""

    # File names written by save(), relative to ``dirname``.
    _ARTEFACTS = ("model.proto", "model.P.npy", "model.mu.npy",
                  "model.pca.pkl", "model.permind.pkl")

    def __init__(self, name, components, m, v, sub, dirname,
                 source_indexer_shasum):
        """Store the training configuration.

        :param name: model name reported in the manifest.
        :param components: number of PCA components (coerced with ``int``).
        :param m: ``M`` argument for ``LOPQModel`` (coerced with ``int``).
        :param v: ``V`` argument for ``LOPQModel`` (coerced with ``int``).
        :param sub: ``subquantizer_clusters`` (coerced with ``int``).
        :param dirname: directory the artefacts are written into.
        :param source_indexer_shasum: reported as ``indexer_shasum`` in the
            manifest arguments.
        """
        self.name = name
        self.n_components = int(components)
        self.m = int(m)
        self.v = int(v)
        self.dirname = dirname
        self.sub = int(sub)
        self.model = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.permuted_inds = None
        self.source_indexer_shasum = source_indexer_shasum

    def pca(self, training_data):
        """Return ``(P, mu)`` for ``training_data``.

        A simple PCA implementation that demonstrates how eigenvalue
        allocation is used to permute dimensions in order to balance the
        variance across sub vectors. What matters is that the eigenvalues are
        used to compute a variance-balancing dimension permutation, which is
        stored in ``self.permuted_inds`` and applied to the columns of ``P``.
        """
        count, _ = training_data.shape
        mu = training_data.sum(axis=0) / float(count)
        # Sum of row outer products == X^T X; one matrix product replaces the
        # per-row Python loop. Accumulate in float64 as the loop did.
        rows = np.asarray(training_data, dtype=np.float64)
        summed_covar = np.dot(rows.T, rows)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def train(self, training_data):
        """Fit the PCA, the rotation and the LOPQ model on ``training_data``."""
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(training_data)
        training_data = self.pca_reduction.transform(training_data)
        self.P, self.mu = self.pca(training_data)
        training_data = np.dot(training_data - self.mu, self.P)
        self.model = LOPQModel(V=self.v, M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(training_data, n_init=1)

    def save(self):
        """Write all artefacts into ``dirname`` and return the manifest dict.

        The manifest carries the model name, the sha1 of the written proto
        file, the training arguments and one ``filename``/``url`` record per
        artefact.
        """
        model_proto_filename = "{}/model.proto".format(self.dirname)
        P_filename = "{}/model.P.npy".format(self.dirname)
        mu_filename = "{}/model.mu.npy".format(self.dirname)
        pca_filename = "{}/model.pca.pkl".format(self.dirname)
        permind_filename = "{}/model.permind.pkl".format(self.dirname)
        # Every artefact is a binary payload, so all files are opened 'wb'.
        with open(model_proto_filename, 'wb') as f:
            self.model.export_proto(f)
        with open(pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(permind_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        # Hash the bytes exactly as written and release the handle promptly.
        with open(model_proto_filename, 'rb') as f:
            shasum = hashlib.sha1(f.read()).hexdigest()
        return {
            "name": self.name,
            "algorithm": "LOPQ",
            "shasum": shasum,
            "model_type": "P",
            "arguments": {
                'm': self.m,
                'v': self.v,
                'sub': self.sub,
                'components': self.n_components,
                'indexer_shasum': self.source_indexer_shasum,
            },
            "files": [
                {"filename": fname,
                 "url": "{}/{}".format(self.dirname, fname)}
                for fname in self._ARTEFACTS
            ],
        }
class LOPQRetriever(BaseRetriever):
    """LOPQ-backed retriever configured from an ``args`` dictionary."""

    def __init__(self, name, args):
        """Load vectors/entries listed in ``args`` and derive artefact paths.

        :param name: retriever name stored on the instance.
        :param args: dict with required keys ``components``, ``m``, ``v``,
            ``sub`` and ``proto_filename``, and optional ``fnames`` (list of
            ``.npy`` paths, default empty) and ``test_mode`` (default False).
        """
        data = []
        self.name = name
        self.fnames = args.get('fnames', [])
        self.entries = []
        for fname in self.fnames:
            nmat = np.load(fname)
            if nmat.ndim > 2:
                logging.info("squeezing shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
                nmat = nmat.squeeze(axis=1)
            elif nmat.ndim == 1:
                logging.info("expanding shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
                nmat = np.expand_dims(nmat, axis=0)
            else:
                logging.info("keeping same shape {} with dimensions {}".format(
                    nmat.shape, nmat.ndim))
            data.append(nmat)
            with open(fname.replace('npy', 'json')) as entries_file:
                self.entries.extend(json.load(entries_file))
        # Always define the attribute so a retriever created without files
        # (e.g. only for load()/apply()) does not leave it missing.
        self.data = None
        if data:
            self.data = np.concatenate(data) if len(data) > 1 else data[0]
            logging.info(self.data.shape)
        # Stored for callers; cluster() does not consult it.
        self.test_mode = args.get('test_mode', False)
        self.n_components = int(args['components'])
        self.m = int(args['m'])
        self.v = int(args['v'])
        self.sub = int(args['sub'])
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.permuted_inds = None
        proto_filename = args['proto_filename']
        self.model_proto_filename = proto_filename
        self.P_filename = proto_filename.replace('.proto', '.P.npy')
        self.mu_filename = proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = proto_filename.replace(
            '.proto', '.permuted_inds.pkl')

    def pca(self):
        """Return ``(P, mu)`` for ``self.data``.

        A simple PCA implementation that demonstrates how eigenvalue
        allocation is used to permute dimensions in order to balance the
        variance across subvectors. What matters is that the eigenvalues are
        used to compute a variance-balancing dimension permutation, which is
        stored in ``self.permuted_inds`` and applied to the columns of ``P``.
        """
        count, _ = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        # Sum of row outer products == X^T X; one matrix product replaces the
        # per-row Python loop. Accumulate in float64 as the loop did.
        rows = np.asarray(self.data, dtype=np.float64)
        summed_covar = np.dot(rows.T, rows)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        """Fit PCA, rotation and LOPQ model on all data and fill the searcher.

        ``self.data`` is replaced by its projected form, and every entry gets
        ``coarse``, ``fine`` and ``index`` keys.
        """
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = np.dot(self.data - self.mu, self.P)
        self.model = LOPQModel(V=self.v, M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(self.data, n_init=1)
        # Store the codes on the entries here to avoid computing them twice.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        self.searcher.add_data(self.data)

    def find(self):
        """Print a random entry followed by its 10-quota search results."""
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        """Write model, PCA, ``P``, ``mu`` and permutation; close the LMDB env."""
        # Every artefact is a binary payload, so all files are opened 'wb'.
        with open(self.model_proto_filename, 'wb') as f:
            self.model.export_proto(f)
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        """Restore everything written by ``save`` and reopen the searcher."""
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        # NOTE: unpickling runs arbitrary code; only load trusted artefacts.
        with open(self.pca_filename, 'rb') as src:
            self.pca_reduction = pickle.load(src)
        self.P = np.load(self.P_filename)
        self.mu = np.load(self.mu_filename)
        # save() writes this file with pickle, so read it back with pickle
        # rather than np.load (which rejects pickles unless allow_pickle=True).
        with open(self.permuted_inds_filename, 'rb') as src:
            self.permuted_inds = pickle.load(src)
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        """Project ``vector`` and return ``(coarse, fine, results)``.

        ``results`` is the searcher output for quota ``count``, or ``None``
        when ``count`` is falsy.
        """
        reduced = self.pca_reduction.transform(vector) - self.mu
        vector = np.dot(reduced, self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        results = self.searcher.search(vector, quota=count) if count else None
        return codes.coarse, codes.fine, results

    def nearest(self, vector=None, n=12, retriever_pk=None, entry_getter=None):
        """Return a list of result dicts for the hits closest to ``vector``.

        :param vector: query vector handed to ``apply``.
        :param n: search quota; a falsy value yields an empty list.
        :param retriever_pk: passed through to ``entry_getter``.
        :param entry_getter: callable ``(hit_id, retriever_pk)`` returning an
            object with ``detection_id``, ``frame``, ``frame_id`` and
            ``video_id`` attributes.
        :return: one dict per hit. ``rank`` is 1-based; ``dist`` holds the
            0-based position of the hit, not a metric distance.
        """
        _, _, results_indexes = self.apply(vector, n)
        if results_indexes is None:
            # apply() skips the search when the quota is falsy.
            return []
        results = []
        # The first element of the search output is iterated as the hits.
        for i, k in enumerate(results_indexes[0]):
            e = entry_getter(k.id, retriever_pk)
            hit = {
                'rank': i + 1,
                'dist': i,
                'frame_index': e.frame.frame_index,
                'frame_primary_key': e.frame_id,
                'video_primary_key': e.video_id,
            }
            if e.detection_id:
                hit['detection_primary_key'] = e.detection_id
                hit['type'] = 'detection'
            else:
                hit['type'] = 'frame'
            results.append(hit)
        return results
class LOPQRetriever(BaseRetriever):
    """LOPQ-backed retriever whose artefact paths derive from one proto path."""

    def __init__(self, name, proto_filename, args, test_mode=False):
        """Store the configuration and derive artefact paths.

        :param name: retriever name stored on the instance.
        :param proto_filename: model proto path; the other artefact paths
            are derived from it by replacing ``'.proto'``.
        :param args: dict with keys ``components``, ``m``, ``v`` and ``sub``.
        :param test_mode: stored on the instance; not consulted by
            ``cluster``.
        """
        # NOTE(review): super(BaseRetriever, self) starts the lookup *after*
        # BaseRetriever, so BaseRetriever.__init__ is not run. Left as-is
        # because the base signature is not visible here -- confirm whether
        # super(LOPQRetriever, self).__init__(...) was intended.
        super(BaseRetriever, self).__init__()
        self.name = name
        self.proto_filename = proto_filename
        self.entries = []
        self.test_mode = test_mode
        self.n_components = int(args['components'])
        self.m = int(args['m'])
        self.v = int(args['v'])
        self.sub = int(args['sub'])
        self.model = None
        self.searcher = None
        self.pca_reduction = None
        self.P = None
        self.mu = None
        self.permuted_inds = None
        # Populated by cluster(); defined here so the attribute always exists.
        self.data = None
        self.model_proto_filename = proto_filename
        self.P_filename = proto_filename.replace('.proto', '.P.npy')
        self.entries_filename = proto_filename.replace('.proto', '.json')
        self.mu_filename = proto_filename.replace('.proto', '.mu.npy')
        self.pca_filename = proto_filename.replace('.proto', '.pca.pkl')
        self.model_lmdb_filename = proto_filename.replace('.proto', '_lmdb')
        self.permuted_inds_filename = proto_filename.replace(
            '.proto', '.permuted_inds.pkl')

    def pca(self):
        """Return ``(P, mu)`` for ``self.data``.

        A simple PCA implementation that demonstrates how eigenvalue
        allocation is used to permute dimensions in order to balance the
        variance across sub vectors. What matters is that the eigenvalues are
        used to compute a variance-balancing dimension permutation, which is
        stored in ``self.permuted_inds`` and applied to the columns of ``P``.
        """
        count, _ = self.data.shape
        mu = self.data.sum(axis=0) / float(count)
        # Sum of row outer products == X^T X; one matrix product replaces the
        # per-row Python loop. Accumulate in float64 as the loop did.
        rows = np.asarray(self.data, dtype=np.float64)
        summed_covar = np.dot(rows.T, rows)
        A = summed_covar / (count - 1) - np.outer(mu, mu)
        eigenvalues, P = np.linalg.eigh(A)
        self.permuted_inds = eigenvalue_allocation(2, eigenvalues)
        P = P[:, self.permuted_inds]
        return P, mu

    def cluster(self):
        """Fit PCA, rotation and LOPQ model on ``self.index`` and fill the searcher.

        ``self.data`` becomes the projected form of ``self.index``, and every
        entry gets ``coarse``, ``fine`` and ``index`` keys.
        """
        # self.index is not assigned anywhere in this class; presumably it is
        # provided by BaseRetriever -- verify against the base class.
        self.data = self.index
        self.pca_reduction = PCA(n_components=self.n_components)
        self.pca_reduction.fit(self.data)
        self.data = self.pca_reduction.transform(self.data)
        self.P, self.mu = self.pca()
        self.data = np.dot(self.data - self.mu, self.P)
        self.model = LOPQModel(V=self.v, M=self.m,
                               subquantizer_clusters=self.sub)
        self.model.fit(self.data, n_init=1)
        # Store the codes on the entries here to avoid computing them twice.
        for i, e in enumerate(self.entries):
            r = self.model.predict(self.data[i])
            e['coarse'] = r.coarse
            e['fine'] = r.fine
            e['index'] = i
        self.searcher = LOPQSearcherLMDB(self.model, self.model_lmdb_filename)
        self.searcher.add_data(self.data)

    def find(self):
        """Print a random entry followed by its 10-quota search results."""
        i, selected = random.choice(list(enumerate(self.entries)))
        print(selected)
        for k in self.searcher.get_result_quota(self.data[i], 10):
            print(k)

    def save(self):
        """Write model, PCA, ``P``, ``mu``, entries and permutation; close the LMDB env."""
        # Binary payloads are written in 'wb'; the JSON entries in text mode.
        with open(self.model_proto_filename, 'wb') as f:
            self.model.export_proto(f)
        with open(self.pca_filename, 'wb') as out:
            pickle.dump(self.pca_reduction, out)
        with open(self.P_filename, 'wb') as out:
            np.save(out, self.P)
        with open(self.mu_filename, 'wb') as out:
            np.save(out, self.mu)
        with open(self.entries_filename, 'w') as out:
            # json.dump takes the object first and the file second.
            json.dump(self.entries, out)
        with open(self.permuted_inds_filename, 'wb') as out:
            pickle.dump(self.permuted_inds, out)
        self.searcher.env.close()

    def load(self):
        """Restore model, PCA, ``P``, ``mu`` and permutation; reopen the searcher.

        The entries file written by ``save`` is not read back here.
        """
        self.model = LOPQModel.load_proto(self.model_proto_filename)
        # NOTE: unpickling runs arbitrary code; only load trusted artefacts.
        with open(self.pca_filename, 'rb') as src:
            self.pca_reduction = pickle.load(src)
        self.P = np.load(self.P_filename)
        self.mu = np.load(self.mu_filename)
        # save() writes this file with pickle, so read it back with pickle
        # rather than np.load (which rejects pickles unless allow_pickle=True).
        with open(self.permuted_inds_filename, 'rb') as src:
            self.permuted_inds = pickle.load(src)
        self.searcher = LOPQSearcherLMDB(model=self.model,
                                         lmdb_path=self.model_lmdb_filename)

    def apply(self, vector, count=None):
        """Project ``vector`` and return ``(coarse, fine, results)``.

        ``results`` is the searcher output for quota ``count``, or ``None``
        when ``count`` is falsy.
        """
        reduced = self.pca_reduction.transform(vector) - self.mu
        vector = np.dot(reduced, self.P).transpose().squeeze()
        codes = self.model.predict(vector)
        results = self.searcher.search(vector, quota=count) if count else None
        return codes.coarse, codes.fine, results

    def nearest(self, vector=None, n=12, retriever_pk=None, entry_getter=None):
        """Return a list of result dicts for the hits closest to ``vector``.

        :param vector: query vector handed to ``apply``.
        :param n: search quota; a falsy value yields an empty list.
        :param retriever_pk: passed through to ``entry_getter``.
        :param entry_getter: callable ``(hit_id, retriever_pk)`` returning an
            object with ``detection_id``, ``frame``, ``frame_id`` and
            ``video_id`` attributes.
        :return: one dict per hit. ``rank`` is 1-based; ``dist`` holds the
            0-based position of the hit, not a metric distance.
        """
        _, _, results_indexes = self.apply(vector, n)
        if results_indexes is None:
            # apply() skips the search when the quota is falsy.
            return []
        results = []
        # The first element of the search output is iterated as the hits.
        for i, k in enumerate(results_indexes[0]):
            e = entry_getter(k.id, retriever_pk)
            hit = {
                'rank': i + 1,
                'dist': i,
                'frame_index': e.frame.frame_index,
                'frame_primary_key': e.frame_id,
                'video_primary_key': e.video_id,
            }
            if e.detection_id:
                hit['detection_primary_key'] = e.detection_id
                hit['type'] = 'detection'
            else:
                hit['type'] = 'frame'
            results.append(hit)
        return results