def vladPure(data, means, assignments, parallel, components,
             normalize=['l2c'], covars=None, skew=None):
    """ compute plain VLAD (one residual block per cluster, hard assignment) """
    def encode(k):
        possible = data[assignments[:, k] > 0]
        clustermass = len(possible)
        if clustermass > 0:
            agg = np.sum(possible, axis=0)
            uk_ = agg - clustermass * means[k]
        else:
            uk_ = np.zeros(data.shape[1], dtype=data.dtype)
        # per-cluster l2 normalization
        if 'l2c' in normalize:
            n = max(math.sqrt(uk_.dot(uk_)), 1e-12)
            uk_ /= n
        return uk_

    if parallel:
        uk = pc.parmap(encode, range(components))
    else:
        uk = map(encode, range(components))

    uk = np.concatenate(uk).reshape(1, -1)
    return uk  # * assignments.sum()
def computeStats(name, dist_matrix, labels, parallel=True):
    """ compute TOP-1 and mAP of dist_matrix via given labels """
    num_descr = dist_matrix.shape[0]

    if parallel:
        def sortdist(split):
            return split.argsort()
        splits = np.array_split(dist_matrix, 8)  # todo assume 8 threads
        indices = pc.parmap(sortdist, splits)
        indices = np.concatenate(indices, axis=0)
    else:
        indices = dist_matrix.argsort()

    def loop_descr(r):
        # compute TOP-1 accuracy
        correct = 0
        for k in xrange(1):
            if labels[indices[r, k]] == labels[r]:
                correct += 1
        # compute mAP
        rel = 0
        avg_precision = []
        for k in range(0, num_descr - 1):
            # don't take the last one, since this is the element itself
            if labels[indices[r, k]] == labels[r]:
                rel += 1
                avg_precision.append(rel / float(k + 1))
        return correct, np.mean(np.array(avg_precision))

    if parallel:
        top1_correct, query_precisions = zip(
            *pc.parmap(loop_descr, range(num_descr)))
    else:
        top1_correct, query_precisions = zip(
            *map(loop_descr, range(num_descr)))

    top1 = float(np.array(top1_correct).sum()) / float(num_descr)
    mAP = np.mean(np.array(query_precisions))
    print "NN {:10} TOP-1: {:7} mAP: {:12}".format(name, top1, mAP)
    return top1, mAP
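# A minimal, self-contained sketch of the TOP-1 / mAP computation above on a
# toy 4x4 distance matrix (diagonal already set to a large value so the query
# itself ranks last, as computeDistances() does). Illustrative only; it skips
# pc.parmap and the toy values are made up for this example.
import numpy as np

labels = np.array([0, 0, 1, 1])
dist = np.array([[9., 1., 2., 3.],
                 [1., 9., 3., 2.],
                 [2., 3., 9., 1.],
                 [3., 2., 1., 9.]])
order = dist.argsort()          # ranked gallery indices per query row
top1, aps = 0, []
for r in range(len(labels)):
    ranked = order[r, :-1]      # drop the last entry: the query itself
    rel = (labels[ranked] == labels[r]).astype(float)
    top1 += rel[0]
    prec_at_hit = [rel[:k + 1].sum() / (k + 1)
                   for k in range(len(rel)) if rel[k]]
    aps.append(np.mean(prec_at_hit))
print('TOP-1: {}  mAP: {}'.format(top1 / len(labels), np.mean(aps)))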
def computeExCls(descr, the_cls, n_cls, outputfolder=None, labels=None,
                 suffix='_ecls.pkl.gz', parallel=True, nprocs=None,
                 use_labels=False, files=None, load=False,
                 return_none=False):
    if use_labels:
        assert (labels is not None)
        assert (len(descr) == len(labels))
        labels = np.array(labels)  # make sure we have a numpy array

    print 'computeExCls: shape', descr.shape, 'take ', n_cls
    widgets = [
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=n_cls)

    def createEx(i):
        if use_labels:
            neg = descr[labels != labels[i]]
        else:
            neg = descr[np.arange(len(descr)) != i]

        fname = ''
        if outputfolder is not None and files is not None:
            if files[i].endswith('.pkl.gz'):
                fname = files[i].replace('.pkl.gz', suffix)
            else:
                fname = os.path.splitext(files[i])[0] + suffix
            fname = os.path.join(outputfolder, os.path.basename(fname))

        if load and fname != '' and os.path.exists(fname):
            cls = pc.load(fname)
            progress.update(i + 1)
            if return_none:
                return None
            return cls

        cls = exemplar_cls.createExemplarCls(descr[i].reshape(1, -1), neg,
                                             the_cls)
        if fname != '':
            pc.dump(fname, cls, verbose=False)
        progress.update(i + 1)
        if return_none:
            return None
        return cls

    progress.start()
    if parallel:
        ex_cls = pc.parmap(createEx, range(n_cls), nprocs=nprocs)
    else:
        ex_cls = map(createEx, range(n_cls))
    progress.finish()
    return ex_cls
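# Sketch of what a single exemplar classifier amounts to: one positive
# descriptor against all remaining descriptors as negatives (Exemplar-SVM
# style). exemplar_cls.createExemplarCls() is not shown in this file, so the
# sketch uses sklearn's LinearSVC directly; the C and class_weight values are
# assumptions, not the repository's settings.
import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
descr = rng.randn(50, 16)                    # toy descriptors, row-wise
i = 0                                        # index of the exemplar
pos = descr[i].reshape(1, -1)
neg = descr[np.arange(len(descr)) != i]      # everything else is negative
X = np.vstack([pos, neg])
y = np.array([1] + [-1] * len(neg))
cls = LinearSVC(C=1.0, class_weight={1: 2.0, -1: 0.01}).fit(X, y)
print(cls.decision_function(descr[:3]))      # higher score = more exemplar-like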
def predictLoadECLS(descr_probe, folder, files, suffix='_ecls.pkl.gz',
                    parallel=False, nprocs=None):
    print '=> predict by loading E-CLS'
    if np.isnan(descr_probe).any():
        print 'WARNING have a nan in the descr_probe'
    if np.isinf(descr_probe).any():
        print 'WARNING have an inf in the descr_probe'

    widgets = [
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(files))

    def compute(i):
        if files[i].endswith('.pkl.gz'):
            fname = files[i].replace('.pkl.gz', suffix)
        else:
            fname = os.path.splitext(files[i])[0] + suffix
        fname = os.path.join(folder, os.path.basename(fname))
        cls = pc.load(fname)

        if isinstance(cls, OneClassSVM):
            coef = cls.coef_
            mag = np.sqrt(coef.flatten().dot(coef.flatten()))
            # sc = descr_probe.dot( (coef / mag).reshape(-1,1)) +\
            #     (cls.intercept_ / mag).reshape(-1,1)
            sc = descr_probe.dot((coef / mag).reshape(1, -1))
        else:
            sc = cls.decision_function(descr_probe).reshape(1, -1)

        if np.isnan(sc).any():
            print 'WARNING have a nan in sc'
        if np.isinf(sc).any():
            print 'WARNING have an inf in sc'
        if sc.shape[1] != descr_probe.shape[0]:
            print '{}x{} dot {}x{}'.format(descr_probe.shape[0],
                                           descr_probe.shape[1],
                                           cls.coef_.shape[0],
                                           cls.coef_.shape[1])
            raise ValueError('sc.shape[1] {} != descr_probe.shape[0]'
                             ' {}'.format(sc.shape[1], descr_probe.shape[0]))
        progress.update(i + 1)
        return sc

    progress.start()
    if parallel:
        score = pc.parmap(compute, range(len(files)), nprocs=nprocs)
    else:
        score = map(compute, range(len(files)))
    all_scores = np.concatenate(score, axis=0)
    progress.finish()
    return all_scores
def fisherFull(data, means, covars, weights, posteriors, parallel, accumulate=True): d = covars.shape[1] indices = np.triu(np.ones((d, d))).flatten().astype(np.bool) def encode(i): inv_cov = np.linalg.inv(covars[i]) diff = data - means[i] # compute means z = diff.dot(inv_cov) # compute covars covs = np.zeros((len(data), d * d), data.dtype) for a in range(d): # tmp = - z * np.roll(z, a, axis=1) \ # - 0.5 * ( (2*np.pi)**(-d) ) * \ # np.diag( np.roll(inv_cov, -a, axis=0) ) # print tmp.shape, covs.shape covs[ :, a*d:(a+1)*d ] = - z * np.roll(z, a, axis=1) \ - 0.5 * ( (2*np.pi)**(-d) ) * \ np.diag( np.roll(inv_cov, -a, axis=0) ) # just take the upper triangle matrix covs = covs[:, indices] # dub in posteriors if accumulate: weights_ = np.sum(posteriors[:, i] - weights[i]) means_ = posteriors[:, i].T.dot(z) covs_ = posteriors[:, i].T.dot(covs) else: weights_ = posteriors[:, i] - weights[i] means_ = posteriors[:, i].reshape(-1, 1) * z covs_ = posteriors[:, i].reshape(-1, 1) * covs weights_ /= (len(data) * math.sqrt(weights[i])) # TODO: Fisher information return weights_, means_, covs_ if parallel: wk_, uk_, vk_ = zip(*pc.parmap(encode, range(means.shape[0]))) else: wk_, uk_, vk_ = zip(*map(encode, range(means.shape[0]))) return wk_, uk_, vk_
def run(args): print '> compute tv space' files, _ = pc.getFiles(args.inputfolder, args.suffix, args.labelfile, exact=args.exact) ubm = ubm_adaption.loadGMM(args.load_ubm) widgets = [ progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA() ] progress = progressbar.ProgressBar(widgets=widgets, maxval=len(files)) print 'extract stats' def extract(i): descr = pc.loadDescriptors(files[i]) of = os.path.join( args.outputfolder, os.path.basename(files[i]).split('.', 1)[0] + '_stat.pkl.gz') if args.load_stats and os.path.exists(of): N, F = pc.load(of) else: N, F = compute_bw_stats.compute_bw_stats(descr, ubm, None, args.nbest) pc.dump(of, [N, F], verbose=False) if i == 0: print N.shape, F.shape progress.update(i + 1) return N.reshape(1, -1), F.reshape(1, -1) progress.start() if args.parallel: Ns, Fs = zip( *pc.parmap(extract, range(len(files)), nprocs=args.nprocs)) else: Ns, Fs = zip(*map(extract, range(len(files)))) progress.finish() Ns = np.concatenate(Ns, axis=0) Fs = np.concatenate(Fs, axis=0) print 'train tv from {} stats'.format(len(Ns)) tv = train_tv_space(Ns, Fs, ubm, args.tv_dim, args.tv_niter, args.parallel, args.nprocs) folder = os.path.join(args.outputfolder, 'tv.pkl.gz') pc.dump(folder, tv) return folder
def predict(files_probe, ex_cls, prep=None, ex_cls_bg=None, parallel=False, nprocs=None): print '| evaluate all E-cls (predict)' widgets = [ progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA() ] progress = progressbar.ProgressBar(widgets=widgets, maxval=len(files_probe)) def predictProbe(i): probe_desc = pc.loadDescriptors(files_probe[i]) if prep: if i == 0: print 'pre descr[0]', probe_desc[0] probe_desc = prep.transform(probe_desc) if i == 0: print 'post descr[0]', probe_desc[0] if ex_cls_bg: # then use cls as attributes probe_desc = exemplar_cls.predictExemplarCls(probe_desc, ex_cls_bg) # probe_desc = convertToProbs(probe_desc, ab_list) df = exemplar_cls.predictExemplarCls(probe_desc, ex_cls) # df = convertToProbs(df, ab_list) # df = exemplar_cls.voteCls(df) progress.update(i + 1) return df progress.start() if parallel: scores = pc.parmap(predictProbe, range(len(files_probe)), nprocs=nprocs) else: scores = map(predictProbe, range(len(files_probe))) progress.finish() scores = np.concatenate(scores, axis=0) print '[Done]' return scores
def computeDistances(descriptors, method, parallel, nprocs,
                     distance_func=None):
    num_desc = len(descriptors)
    indices = [(y, x) for y in range(num_desc - 1)
               for x in range(y + 1, num_desc)]
    splits = np.array_split(np.array(indices), 8)

    def loop(inds):
        dists = []
        for ind in inds:
            if distance_func is None:
                try:
                    dist = computeDistance(descriptors[ind[0]],
                                           descriptors[ind[1]], method)
                except:
                    print 'method {} failed'.format(method)
                    raise
            else:
                dist = distance_func(descriptors[ind[0]], descriptors[ind[1]])
            dists.append(dist)
        return dists

    if parallel:
        dists = pc.parmap(loop, splits, nprocs)
    else:
        dists = map(loop, splits)

    # convert condensed vector-form to matrix
    dense_vector = np.concatenate(dists)
    if spdistance.is_valid_y(dense_vector, warning=True):
        dist_matrix = spdistance.squareform(dense_vector)
    else:
        print 'ERROR: not a valid condensed distance matrix!'
        n = dense_vector.shape[0]
        d = int(np.ceil(np.sqrt(n * 2)))
        should = d * (d - 1) / 2
        print '{} != {}, num: {}'.format(should, n, num_desc)
        sys.exit(1)

    # fill diagonal elements with max
    np.fill_diagonal(dist_matrix, np.finfo(float).max)
    return dist_matrix
def computeDistances2(descr_probe, descr_gallery, method, parallel=True,
                      distance_func=None, nprocs=4):
    if np.isnan(descr_probe).any():
        raise ValueError('nan in descr_probe!')
    if np.isinf(descr_probe).any():
        raise ValueError('inf in descr_probe!')
    if np.isnan(descr_gallery).any():
        raise ValueError('nan in descr_gallery!')
    if np.isinf(descr_gallery).any():
        raise ValueError('inf in descr_gallery!')

    n_probes = len(descr_probe)
    n_gallery = len(descr_gallery)
    indices = [(y, x) for y in range(n_probes) for x in range(n_gallery)]

    def loop(ind):
        if distance_func is None:
            try:
                dist = computeDistance(descr_probe[ind[0]],
                                       descr_gallery[ind[1]], method)
            except:
                print 'method {} failed'.format(method)
                raise
        else:
            dist = distance_func(descr_probe[ind[0]], descr_gallery[ind[1]])
        return dist

    if parallel:
        dists = pc.parmap(loop, indices, nprocs=nprocs)
    else:
        dists = map(loop, indices)

    dense_vector = np.array(dists).reshape(n_probes, -1)
    # do some checks
    if np.isnan(dense_vector).any():
        print 'WARNING have a nan in the dist-matrix'
    if np.isinf(dense_vector).any():
        print 'WARNING have an inf in the dist-matrix'
    return dense_vector
def vlad(data, means, assignments, parallel, components,
         normalize=['l2c', 'mass']):
    """ compute 'vector of locally aggregated descriptors';
        assignments are probabilistically computed """
    def encode(k):
        # diff = data - means[k]
        if 'rn' in normalize:
            diff = data - means[k]
            diff = preprocessing.normalize(diff, norm='l2', copy=False)
            uk_ = assignments[:, k].T.dot(diff)
        else:
            uk_ = assignments[:, k].T.dot(data)
            # this is equal to:
            # uk__ = np.zeros( (1, data.shape[1]), dtype=np.float32)
            # for i in range(len(data)):
            #     uk__ += assignments[i,k] * data[i]

        clustermass = assignments[:, k].sum()
        if clustermass > 0:
            if 'mass' in normalize:
                uk_ /= clustermass
                uk_ -= means[k]
            else:
                uk_ -= clustermass * means[k]

        if 'l2c' in normalize:
            n = max(math.sqrt(np.sum(uk_ * uk_)), 1e-12)
            uk_ /= n
        return uk_

    if parallel:
        uk = pc.parmap(encode, range(components))
    else:
        uk = map(encode, range(components))

    uk = np.concatenate(uk).reshape(1, -1)
    return uk  # * assignments.sum()
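# A compact, self-contained illustration of the soft-assignment VLAD
# aggregation performed by vlad() above ('mass' + 'l2c' normalization),
# written in plain NumPy without pc.parmap. The toy descriptors and the
# two-component "GMM" means are invented for this example.
import numpy as np

rng = np.random.RandomState(1)
data = rng.randn(100, 8).astype(np.float32)          # local descriptors
means = np.array([data[:50].mean(0), data[50:].mean(0)])
assignments = rng.rand(100, 2)                        # fake posteriors
assignments /= assignments.sum(axis=1, keepdims=True)  # rows sum to 1

blocks = []
for k in range(len(means)):
    uk = assignments[:, k].dot(data)                 # weighted sum of descriptors
    mass = assignments[:, k].sum()
    uk = uk / mass - means[k]                        # 'mass' normalization
    uk /= max(np.sqrt((uk * uk).sum()), 1e-12)       # 'l2c' per-cluster l2 norm
    blocks.append(uk)
enc = np.concatenate(blocks).reshape(1, -1)          # final VLAD encoding
print(enc.shape)                                     # (1, 2 * 8)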
def getPosteriors(gmm, data, parallel=None, theta=0.0, hard_assignment=False,
                  nprocs=None, ratio=1.0):
    """
    compute the posterior probability (assignment) for each sample
    parameters:
        gmm: scikit-learn computed gmm
        data: feature-vectors row-wise
        parallel: if true it will be computed in parallel
        theta: posterior threshold, i.e. if theta > 0.0 each posterior
            < theta will be set to 0; Sanchez et al. use 1e-4 here
        hard_assignment: if set to true, then 'getAssignment' is called with
            the gmm's means -> much faster than predicting the posteriors
    """
    if hard_assignment:
        return getAssignment(gmm.means_, data, ratio)

    if parallel:
        def predict(split):
            return gmm.predict_proba(split)
        splits = np.array_split(data, 8)
        posteriors = pc.parmap(predict, splits, nprocs)
        posteriors = np.concatenate(posteriors, axis=0)
    else:
        posteriors = gmm.predict_proba(data)

    if theta > 0.0:
        # set all posteriors smaller than theta to 0
        posteriors[posteriors < theta] = 0.0
        # re-normalize the posteriors such that they sum up to 1 again
        posteriors = preprocessing.normalize(posteriors, norm='l1', copy=False)

    return posteriors
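# Small demonstration of the theta-thresholding done in getPosteriors():
# posteriors below the threshold are zeroed and each row is l1-re-normalized
# so it sums to 1 again. The toy posterior matrix and threshold are made up.
import numpy as np
from sklearn import preprocessing

theta = 0.1
posteriors = np.array([[0.70, 0.25, 0.05],
                       [0.05, 0.05, 0.90]])
posteriors[posteriors < theta] = 0.0
posteriors = preprocessing.normalize(posteriors, norm='l1', copy=False)
print(posteriors)      # rows sum to 1 again, tiny assignments removed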
        print 'img {} is None, path correct? --> skip'.format(img_file)
        return

    kpts = fe.detect(img)
    _, descriptors = fe.extract(img, kpts)
    if descriptors is None or len(descriptors) == 0:
        print 'WARNING: no descriptors extracted, skip image', img_file
        sys.exit(1)

    # Hellinger normalization
    descriptors += np.finfo(np.float32).eps
    descriptors /= np.sum(descriptors, axis=1)[:, np.newaxis]
    descriptors = np.sqrt(descriptors)

    # output
    new_basename = os.path.join(
        args.outputfolder,
        os.path.basename(os.path.splitext(img_file)[0]))
    feat_filename = new_basename + '_' + args.detector \
        + '_' + args.feature + '.pkl.gz'
    with gzip.open(feat_filename, 'wb') as f:
        cPickle.dump(descriptors, f, -1)
    progress.update(i + 1)

if args.parallel:
    pc.parmap(compute, range(len(files)), args.nprocs)
else:
    map(compute, range(len(files)))
progress.finish()
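# The Hellinger normalization above (add eps, l1-normalize each row, take the
# square root) is the RootSIFT-style mapping, so that a dot product between
# normalized descriptors corresponds to the Hellinger kernel. Self-contained
# sketch on random non-negative toy descriptors.
import numpy as np

descr = np.abs(np.random.randn(5, 128)).astype(np.float32)
descr += np.finfo(np.float32).eps
descr /= np.sum(descr, axis=1)[:, np.newaxis]   # l1 normalization per row
descr = np.sqrt(descr)                          # Hellinger / RootSIFT mapping
print(np.allclose((descr ** 2).sum(axis=1), 1.0))   # rows are l2-normalized now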
def vladHard(data, means, assignments, parallel, components,
             normalize=['l2c'], covars=None, skew=None, lcs=None):
    """ compute 'vector of locally aggregated descriptors' for hard
        assignment only - this way it can be computed faster """
    cgmp = False
    for nm in normalize:
        if nm.startswith('cgmp'):
            alpha_str = nm.replace('cgmp', '')
            if alpha_str == '':
                raise ValueError('no alpha for cgmp given')
            alpha = float(alpha_str)
            cgmp = True
            break

    def encode(k):
        possible = data[assignments[:, k] > 0]
        clustermass = len(possible)
        if clustermass > 0:
            # 'rn': residual normalization, Delhumeau: Revisiting VLAD ...
            if 'rn' in normalize:
                diff = possible - means[k]
                diff = preprocessing.normalize(diff, norm='l2', copy=False)
                uk_ = np.sum(diff, axis=0)
            else:
                agg = np.sum(possible, axis=0)
                if 'mass' in normalize:
                    uk_ = agg / clustermass
                    uk_ -= means[k]
                else:
                    uk_ = agg - clustermass * means[k]
        else:
            uk_ = np.zeros(data.shape[1], dtype=data.dtype)

        enc = np.concatenate([uk_])
        if 'l2c' in normalize and clustermass > 0:
            enc = preprocessing.normalize(enc.reshape(1, -1), norm='l2',
                                          copy=False).flatten()
        return enc

    if parallel:
        uk = pc.parmap(encode, range(components))
    else:
        uk = map(encode, range(components))

    uk = np.concatenate(uk).reshape(1, -1)
    return uk  # * assignments.sum()
def computeIndependentExCls(descr, neg_desc, the_cls, outputfolder=None, suffix='_ecls.pkl.gz', parallel=True, nprocs=None, resampling=0, files=None, load=False, return_none=False, n_cls=-1): """ compute for each descr an exemplar classifier using the descr. of <neg_desc> as negatives, optionally save the classifiers """ print '=> compute independent e-cls' if files is not None: assert (len(files) == len(descr)) print outputfolder, len(files) if files else '', suffix, load if isinstance(the_cls, LDA): fname = os.path.join(outputfolder, 'covinv.pkl.gz') if load and os.path.exists(fname): cov_inv = pc.load(fname) else: # cc = covariance.GraphLassoCV() cc = covariance.ShrunkCovariance() # cc = covariance.LeoditWolf() # cc = covariance.OAS() # cc = covariance.MinCovDet() cc.fit(neg_desc) cov_inv = cc.precision_ # covar = np.cov(neg_desc.T, bias=1) # # regularize # covar[np.diag_indices(len(covar))] += 0.01 # cov_inv = np.linalg.inv(covar) pc.dump(fname, cov_inv, verbose=False) print '| elda: cov_inv.shape:', cov_inv.shape mean = np.mean(neg_desc, axis=0) zero_mean = descr - mean if n_cls is not None and n_cls > 0: indices = np.random.choice(len(neg_desc), min(len(neg_desc), n_cls), replace=False) neg_desc = neg_desc[indices] print 'choose to use {} neg-descr'.format(len(neg_desc)) widgets = [ progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA() ] progress = progressbar.ProgressBar(widgets=widgets, maxval=len(descr)) def createEx(i): # print 'all.shape:', descr.shape, 'one:', descr[i].shape fname = '' if outputfolder is not None and files is not None: if files[i].endswith('.pkl.gz'): fname = files[i].replace('.pkl.gz', suffix) else: fname = os.path.splitext(files[i])[0] + suffix fname = os.path.join(outputfolder, os.path.basename(fname)) if load and fname != '' and os.path.exists(fname): run = False try: cls = pc.load(fname) assert (cls.__class__.__name__ == the_cls.__class__.__name__) progress.update(i + 1) if return_none: return None return cls except: # e.g. EOFError most of the time print 'Warning: couldnt load {} -> recompute'.format(fname) # print 'compute cls for', os.path.basename(files[i]) if isinstance(the_cls, LDA): cls = copy.deepcopy(the_cls) w = cov_inv.dot(zero_mean[i].T) cls.coef_ = w.reshape(1, -1) cls.intercept_ = 0 #np.zeros( (cls.coef_.shape[0],1) ) else: cls = exemplar_cls.createExemplarCls(descr[i].reshape(1, -1), neg_desc, the_cls, resampling) if fname != '': pc.dump(fname, cls, verbose=False) progress.update(i + 1) if return_none: return None return cls progress.start() if parallel: ex_cls = pc.parmap(createEx, range(len(descr)), nprocs=nprocs) else: ex_cls = map(createEx, range(len(descr))) progress.finish() print '[Done]' return ex_cls
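# Minimal sketch of the exemplar-LDA branch above: the classifier weight is
# the (shrunk) inverse covariance of the negatives applied to the mean-centered
# exemplar, w = S^-1 (x - mu), with intercept 0 (Hariharan et al.,
# "Discriminative decorrelation"). Toy data only; the repository additionally
# caches cov_inv to disk via pc.dump, which is omitted here.
import numpy as np
from sklearn import covariance

rng = np.random.RandomState(2)
neg_desc = rng.randn(500, 32)                 # negative / background descriptors
x = rng.randn(32)                             # one exemplar descriptor

cc = covariance.ShrunkCovariance().fit(neg_desc)
cov_inv = cc.precision_                       # inverse (shrunk) covariance
mean = neg_desc.mean(axis=0)
w = cov_inv.dot(x - mean)                     # E-LDA weight vector
scores = rng.randn(10, 32).dot(w)             # decision values for 10 probes
print(w.shape, scores.shape)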
        # save encoding
        filepath = os.path.join(args.outputfolder,
                                base + identifier + '.pkl.gz')
        with gzip.open(filepath, 'w') as f:
            cPickle.dump(enc, f, -1)
        progress.update(i + 1)
        if args.no_eval:  # save some memory
            return None
        return enc

    progress.start()
    if args.parallel:
        all_enc = zip(*pc.parmap(encode, range(num_descr), args.nprocs))
    else:
        all_enc = zip(*map(encode, range(num_descr)))
    progress.finish()

    print 'got {} encodings'.format(len(all_enc))
    if args.no_eval:
        sys.exit(1)

    all_enc = np.concatenate(all_enc, axis=0).astype(np.float32)

    print 'Evaluation:'
    ret_matrix = evaluate.runNN(all_enc, labels,
                                parallel=args.parallel, nprocs=args.nprocs)
def fisherCPU(data_orig, means, weights, posteriors_orig, inv_sqrt_cov, parallel=False, accumulate=True, normalize=[], update='wmc'): components, fd = means.shape def encode(i): data = data_orig[posteriors_orig[:, i] > 0] posteriors = posteriors_orig[posteriors_orig[:, i] > 0, i].reshape(1, -1) clustermass = len(data) diff = (data - means[i]) * inv_sqrt_cov[i] if 'rn' in normalize: diff = preprocessing.normalize(diff, norm='l2', copy=False) if accumulate: #diff = data * inv_sqrt_cov[i] if 'w' in update and clustermass > 0: weights_ = np.sum(posteriors - weights[i]) weights_ /= (len(data) * math.sqrt(weights[i])) else: weights_ = 0 if 'm' in update and clustermass > 0: means_ = posteriors.dot(diff) means_ /= (len(data) * math.sqrt(weights[i])) else: means_ = np.zeros((1, fd), data.dtype) if 'c' in update and clustermass > 0: covs_ = posteriors.dot(diff * diff - 1) covs_ /= (len(data) * math.sqrt(2.0 * weights[i])) else: covs_ = np.zeros((1, fd), data.dtype) else: if 'w' in update: weights_ = posteriors.T - weights[i] weights_ /= math.sqrt(weights[i]) else: weights_ = None if 'm' in update and clustermass > 0: means_ = posteriors.T * diff means_ /= math.sqrt(weights[i]) else: means_ = np.zeros((len(data), fd), data.dtype) if 'c' in update and clustermass > 0: covs_ = posteriors.T * (diff * diff - 1) covs_ /= math.sqrt(2.0 * weights[i]) else: covs_ = np.zeros((len(data), fd), data.dtype) # print 'w:', weights_ # print 'm:', means_ # print 'c:', covs_ # print 'w:', weights_.shape # print 'm:', means_.shape # print 'c:', covs_.shape return weights_, means_, covs_ if parallel: wk_, uk_, vk_ = zip(*pc.parmap(encode, range(components))) else: wk_, uk_, vk_ = zip(*map(encode, range(components))) return wk_, uk_, vk_
def computeDistances(descriptors, method, distance=True, parallel=True, distance_func=None, nprocs=4): num_desc = len(descriptors) if np.isnan(descriptors).any(): raise ValueError('nan in descr!') if np.isinf(descriptors).any(): raise ValueError('inf in descr!') for i in range(len(descriptors)): if not descriptors[i].any(): # faster print 'WARNING: complete row {} is 0'.format(i) indices = [(y, x) for y in range(num_desc - 1) for x in range(y + 1, num_desc)] def loop(ind): if distance_func == None: try: dist = computeDistance(descriptors[ind[0]], descriptors[ind[1]], method) except: print 'method {} failed'.format(method) raise else: dist = distance_func(descriptors[ind[0]], descriptors[ind[1]]) return dist if parallel: dists = pc.parmap(loop, indices, nprocs=nprocs) else: dists = map(loop, indices) dense_vector = np.array(dists, dtype=float) if spdistance.is_valid_y(dense_vector, warning=True): dist_matrix = spdistance.squareform(dense_vector) else: print 'ERROR: not a valid condensed distance matrix!' n = dense_vector.shape[0] d = int(np.ceil(np.sqrt(n * 2))) should = d * (d - 1) / 2 print '{} != {}, num: {}'.format(should, n, num_desc) sys.exit(1) # do some checks if np.isnan(dist_matrix).any(): print 'WARNING have a nan in the dist-matrix' if np.isinf(dist_matrix).any(): print 'WARNING have a inf in the dist-matrix' if distance: if np.count_nonzero( dist_matrix == np.finfo(dist_matrix.dtype).max) > 0: raise ValueError('there is already a float-maximum') np.fill_diagonal(dist_matrix, np.finfo(dist_matrix.dtype).max) else: if np.count_nonzero( dist_matrix == np.finfo(dist_matrix.dtype).min) > 0: raise ValueError('there is already a float-min') np.fill_diagonal(dist_matrix, np.finfo(dist_matrix.dtype).min) return dist_matrix #, dist_m
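# The pairwise loop above produces the condensed (upper-triangle) distance
# vector that scipy expects; spdistance.squareform() turns it into the full
# symmetric matrix and is_valid_y() checks that its length is n*(n-1)/2.
# Self-contained check using scipy.spatial.distance on toy descriptors.
import numpy as np
from scipy.spatial import distance as spdistance

descr = np.random.rand(5, 3)
condensed = spdistance.pdist(descr, 'euclidean')      # length 5*4/2 = 10
print(spdistance.is_valid_y(condensed))               # True
dist_matrix = spdistance.squareform(condensed)
np.fill_diagonal(dist_matrix, np.finfo(float).max)    # as done above
print(dist_matrix.shape)                              # (5, 5)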
def computeStats(name, dist_matrix, labels_probe, labels_gallery=None, parallel=True, distance=True, nprocs=4, eval_method='cosine'): n_probe, n_gallery = dist_matrix.shape # often enough we make a leave-one-out-cross-validation # here we don't have a separation probe / gallery if labels_gallery is None: n_gallery -= 1 labels_gallery = labels_probe # assert not needed or? assert (dist_matrix.shape[0] == dist_matrix.shape[1]) assert (dist_matrix.shape[0] == len(labels_probe)) assert (dist_matrix.shape[1] == len(labels_gallery)) # TODO: make variables choosable # Tolias et al. 2014 / 2016 if 'poly' in eval_method: alpha = 3 tau = 0 sign = np.sign(dist_matrix) abso = np.abs(dist_matrix) abso = np.pow(abso[dist_matrix > tau], alpha) dist_matrix = sign * abso # Tao et al. 2014 elif 'expo' in eval_method: beta = 10 dist_matrix = np.exp(beta * dist_matrix) ind_probe = len(set(labels_probe)) ind_gall = len(set(labels_gallery)) labels_gallery = np.array(labels_gallery) labels_probe = np.array(labels_probe) print 'number of probes: {}, individuals: {}'.format(n_probe, ind_probe) print 'number of gallery: {}, individuals: {}'.format(n_gallery, ind_probe) if parallel: def sortdist(split): return split.argsort() splits = np.array_split(dist_matrix, 8) # todo assume 8 threads indices = pc.parmap(sortdist, splits, nprocs=nprocs) indices = np.concatenate(indices, axis=0) else: indices = dist_matrix.argsort() if not distance: indices = indices[:, ::-1] def loop_descr(r): rel_list = np.zeros((1, n_gallery)) not_correct = [] for k in range(0, n_gallery): if labels_gallery[indices[r, k]] == labels_probe[r]: rel_list[0, k] = 1 elif k == 1: not_correct.append((r, indices[r, k])) return rel_list, not_correct if parallel: all_rel, top1_fail = zip( *pc.parmap(loop_descr, range(n_probe), nprocs=nprocs)) else: all_rel, top1_fail = zip(*map(loop_descr, range(n_probe))) # make all computations with the rel-matrix rel_conc = np.concatenate(all_rel, 0) # are there any zero rows? z_rows = np.sum(rel_conc, 1) n_real2 = np.count_nonzero(z_rows) if n_real2 != rel_conc.shape[0]: print( 'WARNING: not for each query exist also a label in the gallery' '({} / {})'.format(n_real2, len(rel_conc.shape[0]))) rel_mat = rel_conc[z_rows > 0] print 'rel_mat.shape:', rel_mat.shape prec_mat = np.zeros(rel_mat.shape) soft2 = np.zeros(50) hard2 = np.zeros(4) for i in range(n_gallery): rel_sum = np.sum(rel_mat[:, :i + 1], 1) prec_mat[:, i] = rel_sum / (i + 1) if i < 50: soft2[i] = np.count_nonzero(rel_sum > 0) / float(n_real2) if i < 4: hh = rel_sum[np.isclose(rel_sum, (i + 1))] # print 'i: {} len(hh): {}'.format(i, len(hh)) hard2[i] = len(hh) / float(n_real2) map2 = np.mean(prec_mat[rel_mat == 1]) print 'correct: {} / {}'.format(np.sum(rel_mat[:, 0]), n_real2) print 'map:', map2 print 'top-k soft:', soft2[:10] print 'top-k hard:', hard2 # Average precisions ap = [] for i in range(n_real2): ap.append(np.mean(prec_mat[i][rel_mat[i] == 1])) print 'mean(ap):', np.mean(ap) print 'isclose(map2, mean(ap)): {}'.format(np.isclose(map2, np.mean(ap))) # precision@x scores p2 = np.sum(prec_mat[:, 1]) / n_real2 p3 = np.sum(prec_mat[:, 2]) / n_real2 p4 = np.sum(prec_mat[:, 3]) / n_real2 print 'mean P@2,P@3,P@4:', p2, p3, p4 stats = { 'topx_soft': soft2[:10], 'topx_hard': hard2, 'mAP': map2, 'top1_fail': top1_fail, 'ap': ap, 'p2': p2, 'p3': p3, 'p4': p4 } return stats
    # cPickle.dump(cls, fOut, -1)
    # print 'saved', filename
    progress.update(i + 1)
    return cls

filename = os.path.join(args.outputfolder, args.clsname + '_all.pkl.gz')
if args.load_cls:
    with gzip.open(filename, 'rb') as f:
        ex_cls = cPickle.load(f)
    print 'loaded', filename
else:
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(files))
    progress.start()
    if args.parallel:
        ex_cls = pc.parmap(exemplar_classify, range(len(files)),
                           nprocs=args.nprocs)
    else:
        ex_cls = map(exemplar_classify, range(len(files)))
    progress.finish()
    pc.dump(filename, ex_cls)

print 'progress predict'

# iteratively predict
def multi_predict(i):
    if args.pq:
        ex_desc = prep.uncompress(pos_desc[i])
    else:
        ex_desc = pc.loadDescriptors(files[i])
def run(args, prep=None): if prep is None: prep = preprocess.Preprocess() if not args.labelfile or not args.inputfolder \ or not args.outputfolder: print('WARNING: no labelfile or no inputfolder' ' or no outputfolder specified') print 'accumulate features:', args.accumulate if args.outputfolder and not os.path.exists(args.outputfolder): print 'outputfolder doesnt exist -> create' pc.mkdir_p(args.outputfolder) if args.load_scores: print 'try to load computed encodings' ##### # UBM / loading print 'load gmm from', args.load_ubm ubm_gmm = None if args.load_ubm: ubm_gmm = loadGMM(args.load_ubm, args.lib) ##### # Enrollment # now for each feature-set adapt a gmm ##### if args.labelfile is None: print 'WARNING: no label-file' if args.concat_later: args.concat = True if args.concat: groups = None if args.group_word: descriptor_files = pc.getFilesGrouped(args.inputfolder, args.suffix) labels = None else: descriptor_files, labels = pc.getFiles(args.inputfolder, args.suffix, args.labelfile, exact=False, concat=True) print 'labels:', labels[0] if len(descriptor_files) != len(labels): raise ValueError('len(descriptor_files) {} !=' 'len(labels) {}'.format(len(descriptor_files), len(labels))) print 'num descr-files of first:', len(descriptor_files[0]) else: descriptor_files, labels = pc.getFiles(args.inputfolder, args.suffix, args.labelfile) if args.maskfolder: maskfiles = pc.getMaskFiles(descriptor_files, args.suffix, args.maskfolder, args.masksuffix) if len(descriptor_files) == 0: print 'no descriptor_files' sys.exit(1) if labels: num_scribes = len(list(set(labels))) else: num_scribes = 'unknown' num_descr = len(descriptor_files) print 'number of classes:', num_scribes print 'number of descriptor_files:', num_descr print 'adapt training-features to create individual scribe-gmms (or load saved ones)' widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()] progress = progressbar.ProgressBar(widgets=widgets, maxval=len(descriptor_files)) if 'supervector' in args.encoding: identifier = '_sv' elif 'fisher' in args.encoding: identifier = '_fv' else: identifier = '_' + args.encoding identifier += '_' + args.update if len(args.normalize_enc) > 0: identifier += '_' + '_'.join(args.normalize_enc) encoder = Encoding(args.encoding, ubm_gmm, parallel=False, normalize=args.normalize_enc, update=args.update, relevance=args.relevance, nbest=args.nbest, ratio=args.ratio, accumulate=args.accumulate, nprocs=args.nprocs) if args.posteriors_dir: posterior_files, _ = pc.getFiles(args.posteriors_dir, args.posteriors_suffix, args.labelfile) print len(posterior_files), len(descriptor_files) assert(len(posterior_files) == len(descriptor_files)) cp = os.path.commonprefix(descriptor_files) #print cp def encode(i): if isinstance(descriptor_files[i], basestring): fname = descriptor_files[i] if os.path.isdir(cp): base = os.path.relpath(fname, cp) if fname.endswith('.pkl.gz'): base = base.replace('.pkl.gz','') else: base = os.path.splitext(base)[0] if os.path.isdir(cp): folder = os.path.join(args.outputfolder, os.path.dirname(base)) # print 'should create: {} + {}'.format(args.outputfolder, base) pc.mkdir_p(folder,silent=True) else: base = os.path.basename(os.path.commonprefix(descriptor_files[i])) gmm_name = base + ('_gmm.pkl.gz' if not 'bob' in args.lib else '_gmm_bob.hdf5') gmm = ubm_gmm scribe_gmm = None # load gmm if possible if args.load_gmm: gmm_file = os.path.join(args.load_gmm, gmm_name) scribe_gmm = load_gmm(gmm_file, args.lib) # load encoding if args.load_scores: if args.load_scores == 
'outputfolder': load_f = args.outputfolder else: load_f = args.load_scores filepath = os.path.join(load_f, base + identifier + '.pkl.gz') if os.path.exists(filepath): with gzip.open(filepath, 'rb') as f: enc = cPickle.load(f) return enc, None # else: # print ('WARNING: encoding {} doesnt exist, compute' # 'it'.format(filepath )) if args.concat_later: enc = [] for k in range(len(descriptor_files[i])): # load data and preprocess features = pc.loadDescriptors( descriptor_files[i][k], min_descs_per_file=args.min_descs, show_progress=(False if\ args.concat else True)) if features is None: print 'features==None' continue features = prep.transform(feature) enc_ = encoder.encode(features) enc.append(enc_) enc = np.concatenate(enc, axis=0) else: # load data and preprocess features = pc.loadDescriptors( descriptor_files[i], min_descs_per_file=args.min_descs, show_progress=(False if\ args.concat else True)#, ) posteriors = None if args.posteriors_dir: posteriors = pc.loadDescriptors( posterior_files[i] ) assert(len(posteriors) == len(features)) if not isinstance(features, np.ndarray) and not features: print 'features==None?' progress.update(i+1) return 0.0, None if i == 0: print '0-shape:',features.shape features = prep.transform(features) if i == 0: print '0-shape (possibly after pca):',features.shape if args.maskfolder: sample_weights = pc.loadDescriptors(maskfiles[i]) else: sample_weights = None enc, scribe_gmm = encoder.encode(features, return_gmm=True, sample_weights=sample_weights, posteriors=posteriors, verbose=True if i == 0 else False) if i == 0: print '0-enc-shape', enc.shape if isinstance(sample_weights, np.ndarray): print 'sample-weights shape:', sample_weights.shape # write if args.save_gmm: scribe_gmm_filename = os.path.join(args.outputfolder, gmm_name) if 'bob' in args.lib: scribe_gmm.save( bob.io.HDF5File(scribe_gmm_filename, 'w') ) else: with gzip.open(scribe_gmm_filename, 'wb') as f: cPickle.dump(scribe_gmm, f, -1) pc.verboseprint('wrote', scribe_gmm_filename) progress.update(i+1) if args.pq and args.load_pq: enc = prep.compress(enc, aug=args.aug) # save encoding filepath = os.path.join(args.outputfolder, base + identifier + ('_pq' if\ args.pq else '') + '.pkl.gz') with gzip.open(filepath, 'wb') as f: cPickle.dump(enc, f, -1) progress.update(i+1) if 'nothing' in args.evaluate: return None, None return enc, scribe_gmm progress.start() if args.parallel: all_enc, all_gmms = zip( *pc.parmap( encode, range(num_descr), args.nprocs, size=num_descr) ) else: all_enc, all_gmms = zip( *map( encode, range(num_descr) ) ) progress.finish() if 'nothing' in args.evaluate: print 'nothing to evaluate, exit now' return print 'got {} encodings'.format(len(all_enc)) all_enc = np.concatenate(all_enc, axis=0) #.astype(np.float32) print 'all_enc.shape', all_enc.shape print 'Evaluation:' stats = None ret_matrix = None for eval_method in args.evaluate: ret_matrix, stats = evaluate.runNN( all_enc, labels, distance=True, histogram=False, eval_method=eval_method, parallel=args.parallel, nprocs=args.nprocs) if ret_matrix is None or not isinstance(ret_matrix,np.ndarray): print 'WARNING: ret_matrix is None or not instance of np.ndarray' else: fpath = os.path.join(args.outputfolder, 'dist' + identifier + '_' + eval_method + '.cvs') np.savetxt(fpath, ret_matrix, delimiter=',') print 'saved', fpath return stats
def expectation_tv(T, N, F, S, tv_dim, nmix, ndim, parallel, nprocs): # compute posterior means and covariance matrices of the factors # = latent variables idx_sv = np.arange(nmix).repeat(ndim).reshape(-1) nfiles = N.shape[0] LU = nmix * [np.zeros((tv_dim, tv_dim))] RU = np.zeros((tv_dim, nmix * ndim)) I = np.eye(tv_dim) T_invS = T / S.T # mini-batch #bs = 250 # adjust me bs = 400 # adjust me nbatch = int(nfiles / float(bs) + 0.999) for i in range(nbatch): end = min(nfiles, (i + 1) * bs) N1 = N[i * bs:end] F1 = F[i * bs:end] dim = N1.shape[0] # Ex = np.zeros((tv_dim, dim)) # Exx = np.zeros((tv_dim, tv_dim, dim)) widgets = [ progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA() ] progress = progressbar.ProgressBar(widgets=widgets, maxval=dim) #for ix in range(dim): def posteriors(ix): tmp = T_invS * N1[ix, idx_sv] L = I + tmp.dot(T.T) Cxx = np.linalg.pinv(L) # posterior covariance Cov(x,x) B = T_invS.dot(F1[ix].T).reshape(-1, 1) Ex_ = Cxx.dot(B).reshape(-1, 1) # posterior mean E[x] Exx_ = Cxx + Ex_.dot(Ex_.T) progress.update(ix + 1) return Ex_.reshape((tv_dim, 1)), Exx_.reshape((tv_dim, tv_dim, 1)) progress.start() if parallel: Ex, Exx = zip(*pc.parmap(posteriors, range(dim), nprocs=nprocs)) else: Ex, Exx = zip(*map(posteriors, range(dim))) progress.finish() Ex = np.concatenate(Ex, axis=1) Exx = np.concatenate(Exx, axis=2) RU = RU + Ex.dot(F1) # TODO: parallelize me ? for mix in range(nmix): tmp = Exx * N1[:, mix].T.reshape(1, 1, dim) #tmp_m = octave.get_n(Exx, N1, mix+1, dim) LU[mix] = LU[mix] + np.sum(tmp, axis=2) return LU, RU
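# Stand-alone numerical sketch of the per-utterance E-step used in
# expectation_tv(): with total-variability matrix T (tv_dim x nmix*ndim), UBM
# variances S, zeroth-order stats N and centered first-order stats F, the
# posterior of the latent factor x is
#   L = I + T S^-1 diag(N) T^T,  Cov(x,x) = L^-1,  E[x] = L^-1 T S^-1 F.
# All sizes and values below are toy assumptions, not the repository's data.
import numpy as np

tv_dim, nmix, ndim = 4, 3, 2
rng = np.random.RandomState(3)
T = rng.randn(tv_dim, nmix * ndim) * 0.1
S = np.abs(rng.randn(1, nmix * ndim)) + 0.5      # UBM variances (diagonal)
N = np.abs(rng.rand(nmix))                       # zeroth-order stats per mixture
F = rng.randn(nmix * ndim)                       # centered first-order stats

idx_sv = np.arange(nmix).repeat(ndim)            # expand N to supervector layout
T_invS = T / S
L = np.eye(tv_dim) + (T_invS * N[idx_sv]).dot(T.T)
Cxx = np.linalg.pinv(L)                          # posterior covariance Cov(x,x)
Ex = Cxx.dot(T_invS.dot(F))                      # posterior mean E[x]
Exx = Cxx + np.outer(Ex, Ex)                     # posterior second moment
print(Ex.shape, Exx.shape)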