                        help='points to the cluster file')
    parser.add_argument('--ratio', type=float,
                        help='max ratio 1st to 2nd nearest cluster')
    return parser

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Clustering - Index')
    parser = pc.commonArguments(parser)
    parser = addArguments(parser)
    args = parser.parse_args()

    np.random.seed(42)

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    print args.max_descriptors
    files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                args.labelfile, concat=True)
    print 'n-files:', len(files)
    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(labels)

    desc_files, _ = pc.getFiles(args.df, args.ds, args.labelfile,
                                concat=True)
    kmeans = pc.load(args.cluster)
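
# A minimal sketch (not part of the original file) of how the '--ratio'
# option, "max ratio 1st to 2nd nearest cluster", could be applied: keep a
# descriptor's nearest-cluster assignment only if the distance to the
# closest centroid is sufficiently smaller than the distance to the second
# closest (a Lowe-style ratio test). The function name `ratio_filter` and
# the centroid layout (n_clusters x n_features) are assumptions.
def ratio_filter(descs, centers, ratio):
    # squared euclidean distances: n_descs x n_clusters
    d = ((descs[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    order = np.argsort(d, axis=1)
    first, second = order[:, 0], order[:, 1]
    rows = np.arange(len(descs))
    # compare squared distances, so square the ratio threshold
    keep = d[rows, first] <= (ratio ** 2) * d[rows, second]
    return first[keep], keep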
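
# A minimal sketch of mean-only MAP adaptation with a relevance factor,
# which is what the '--update m' / '--relevance' options handled by run()
# below appear to control. The array layout (means: K x D, posteriors:
# N x K) and the function name are illustrative assumptions, not taken
# from this repo.
def map_adapt_means_sketch(features, posteriors, ubm_means, relevance=16.0):
    # soft counts per component: n_k = sum_i p(k|x_i)
    n_k = posteriors.sum(axis=0)                      # K
    # weighted first-order statistics: F_k = sum_i p(k|x_i) * x_i
    f_k = posteriors.T.dot(features)                  # K x D
    # interpolate between the per-class data mean and the UBM mean
    alpha = (n_k / (n_k + relevance))[:, None]        # K x 1
    safe_n = np.maximum(n_k, 1e-10)[:, None]
    return alpha * (f_k / safe_n) + (1.0 - alpha) * ubm_means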
def run(args, prep=None):
    if prep is None:
        prep = preprocess.Preprocess()

    if not args.labelfile or not args.inputfolder \
       or not args.outputfolder:
        print('WARNING: no labelfile or no inputfolder'
              ' or no outputfolder specified')

    print 'accumulate features:', args.accumulate
    if args.outputfolder and not os.path.exists(args.outputfolder):
        print 'outputfolder doesnt exist -> create'
        pc.mkdir_p(args.outputfolder)

    if args.load_scores:
        print 'try to load computed encodings'

    #####
    # UBM / loading
    print 'load gmm from', args.load_ubm
    ubm_gmm = None
    if args.load_ubm:
        ubm_gmm = loadGMM(args.load_ubm, args.lib)

    #####
    # Enrollment
    # now for each feature-set adapt a gmm
    #####
    if args.labelfile is None:
        print 'WARNING: no label-file'

    if args.concat_later:
        args.concat = True
    if args.concat:
        groups = None
        if args.group_word:
            descriptor_files = pc.getFilesGrouped(args.inputfolder,
                                                  args.suffix)
            labels = None
        else:
            descriptor_files, labels = pc.getFiles(args.inputfolder,
                                                   args.suffix,
                                                   args.labelfile,
                                                   exact=False, concat=True)
            print 'labels:', labels[0]
            if len(descriptor_files) != len(labels):
                raise ValueError('len(descriptor_files) {} !='
                                 ' len(labels) {}'.format(len(descriptor_files),
                                                          len(labels)))
        print 'num descr-files of first:', len(descriptor_files[0])
    else:
        descriptor_files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                               args.labelfile)
    if args.maskfolder:
        maskfiles = pc.getMaskFiles(descriptor_files, args.suffix,
                                    args.maskfolder, args.masksuffix)

    if len(descriptor_files) == 0:
        print 'no descriptor_files'
        sys.exit(1)

    if labels:
        num_scribes = len(list(set(labels)))
    else:
        num_scribes = 'unknown'
    num_descr = len(descriptor_files)
    print 'number of classes:', num_scribes
    print 'number of descriptor_files:', num_descr
    print 'adapt training-features to create individual scribe-gmms (or load saved ones)'

    widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
               progressbar.ETA()]
    progress = progressbar.ProgressBar(widgets=widgets,
                                       maxval=len(descriptor_files))

    if 'supervector' in args.encoding:
        identifier = '_sv'
    elif 'fisher' in args.encoding:
        identifier = '_fv'
    else:
        identifier = '_' + args.encoding
    identifier += '_' + args.update
    if len(args.normalize_enc) > 0:
        identifier += '_' + '_'.join(args.normalize_enc)

    encoder = Encoding(args.encoding, ubm_gmm, parallel=False,
                       normalize=args.normalize_enc, update=args.update,
                       relevance=args.relevance, nbest=args.nbest,
                       ratio=args.ratio, accumulate=args.accumulate,
                       nprocs=args.nprocs)

    if args.posteriors_dir:
        posterior_files, _ = pc.getFiles(args.posteriors_dir,
                                         args.posteriors_suffix,
                                         args.labelfile)
        print len(posterior_files), len(descriptor_files)
        assert(len(posterior_files) == len(descriptor_files))

    cp = os.path.commonprefix(descriptor_files)
    # print cp

    def encode(i):
        if isinstance(descriptor_files[i], basestring):
            fname = descriptor_files[i]
            if os.path.isdir(cp):
                base = os.path.relpath(fname, cp)
            else:
                # fall back to the full path if there is no common directory
                base = fname
            if fname.endswith('.pkl.gz'):
                base = base.replace('.pkl.gz', '')
            else:
                base = os.path.splitext(base)[0]
            if os.path.isdir(cp):
                folder = os.path.join(args.outputfolder,
                                      os.path.dirname(base))
                # print 'should create: {} + {}'.format(args.outputfolder, base)
                pc.mkdir_p(folder, silent=True)
        else:
            base = os.path.basename(os.path.commonprefix(descriptor_files[i]))

        gmm_name = base + ('_gmm.pkl.gz' if 'bob' not in args.lib
                           else '_gmm_bob.hdf5')
        gmm = ubm_gmm
        scribe_gmm = None
        # load gmm if possible
        if args.load_gmm:
            gmm_file = os.path.join(args.load_gmm, gmm_name)
            scribe_gmm = load_gmm(gmm_file, args.lib)

        # load encoding
        if args.load_scores:
            if args.load_scores == 'outputfolder':
                load_f = args.outputfolder
            else:
                load_f = args.load_scores
            filepath = os.path.join(load_f, base + identifier + '.pkl.gz')
            if os.path.exists(filepath):
                with gzip.open(filepath, 'rb') as f:
                    enc = cPickle.load(f)
                return enc, None
            # else:
            #     print ('WARNING: encoding {} doesnt exist, compute'
            #            ' it'.format(filepath))

        if args.concat_later:
            enc = []
            for k in range(len(descriptor_files[i])):
                # load data and preprocess
                features = pc.loadDescriptors(
                    descriptor_files[i][k],
                    min_descs_per_file=args.min_descs,
                    show_progress=(False if args.concat else True))
                if features is None:
                    print 'features==None'
                    continue
                features = prep.transform(features)
                enc_ = encoder.encode(features)
                enc.append(enc_)
            enc = np.concatenate(enc, axis=0)
        else:
            # load data and preprocess
            features = pc.loadDescriptors(
                descriptor_files[i],
                min_descs_per_file=args.min_descs,
                show_progress=(False if args.concat else True))
            posteriors = None
            if args.posteriors_dir:
                posteriors = pc.loadDescriptors(posterior_files[i])
                assert(len(posteriors) == len(features))
            if not isinstance(features, np.ndarray) and not features:
                print 'features==None?'
                progress.update(i+1)
                return 0.0, None
            if i == 0:
                print '0-shape:', features.shape
            features = prep.transform(features)
            if i == 0:
                print '0-shape (possibly after pca):', features.shape

            if args.maskfolder:
                sample_weights = pc.loadDescriptors(maskfiles[i])
            else:
                sample_weights = None

            enc, scribe_gmm = encoder.encode(features, return_gmm=True,
                                             sample_weights=sample_weights,
                                             posteriors=posteriors,
                                             verbose=True if i == 0 else False)
            if i == 0:
                print '0-enc-shape', enc.shape
                if isinstance(sample_weights, np.ndarray):
                    print 'sample-weights shape:', sample_weights.shape

            # write
            if args.save_gmm:
                scribe_gmm_filename = os.path.join(args.outputfolder,
                                                   gmm_name)
                if 'bob' in args.lib:
                    scribe_gmm.save(bob.io.HDF5File(scribe_gmm_filename, 'w'))
                else:
                    with gzip.open(scribe_gmm_filename, 'wb') as f:
                        cPickle.dump(scribe_gmm, f, -1)
                pc.verboseprint('wrote', scribe_gmm_filename)
                progress.update(i+1)

        if args.pq and args.load_pq:
            enc = prep.compress(enc, aug=args.aug)

        # save encoding
        filepath = os.path.join(args.outputfolder,
                                base + identifier
                                + ('_pq' if args.pq else '') + '.pkl.gz')
        with gzip.open(filepath, 'wb') as f:
            cPickle.dump(enc, f, -1)

        progress.update(i+1)
        if 'nothing' in args.evaluate:
            return None, None
        return enc, scribe_gmm

    progress.start()
    if args.parallel:
        all_enc, all_gmms = zip(*pc.parmap(encode, range(num_descr),
                                           args.nprocs, size=num_descr))
    else:
        all_enc, all_gmms = zip(*map(encode, range(num_descr)))
    progress.finish()

    if 'nothing' in args.evaluate:
        print 'nothing to evaluate, exit now'
        return

    print 'got {} encodings'.format(len(all_enc))
    all_enc = np.concatenate(all_enc, axis=0)  # .astype(np.float32)
    print 'all_enc.shape', all_enc.shape

    print 'Evaluation:'
    stats = None
    ret_matrix = None
    for eval_method in args.evaluate:
        ret_matrix, stats = evaluate.runNN(all_enc, labels,
                                           distance=True, histogram=False,
                                           eval_method=eval_method,
                                           parallel=args.parallel,
                                           nprocs=args.nprocs)
        if ret_matrix is None or not isinstance(ret_matrix, np.ndarray):
            print 'WARNING: ret_matrix is None or not instance of np.ndarray'
        else:
            fpath = os.path.join(args.outputfolder,
                                 'dist' + identifier + '_'
                                 + eval_method + '.csv')
            np.savetxt(fpath, ret_matrix, delimiter=',')
            print 'saved', fpath
    return stats
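
# A minimal sketch, assuming (not confirmed by this repo) that
# evaluate.runNN above performs a leave-one-out nearest-neighbor
# evaluation: each encoding queries all others and the label of its
# nearest neighbor counts as a hit. The function name `loo_nn_accuracy`
# and the cosine-distance choice are illustrative assumptions.
def loo_nn_accuracy(encs, labels):
    encs = np.asarray(encs, dtype=np.float64)
    labels = np.asarray(labels)
    # cosine distance via normalized dot products
    norm = encs / np.linalg.norm(encs, axis=1, keepdims=True)
    dist = 1.0 - norm.dot(norm.T)
    np.fill_diagonal(dist, np.inf)   # exclude self-matches
    nn = dist.argmin(axis=1)
    return float((labels[nn] == labels).mean())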
def runHelper(prep, args):
    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                labelfile=args.labelfile,
                                exact=args.exact,
                                inputfolders_suffix=args.inputfolders_suffix,
                                max_files=args.max_files)
    print 'process {} files'.format(len(files))

    widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
               progressbar.ETA()]

    if args.load_all_features:
        cur_data, index_list = pc.loadDescriptors(
            files,
            max_descs=args.max_descriptors[0] if args.max_descriptors else 0,
            return_index_list=True)
        # per descriptor labels:
        if len(index_list) - 1 != len(labels):
            raise ValueError('{} != {} + 1'.format(len(index_list),
                                                   len(labels)))
        le = preprocessing.LabelEncoder()
        labels = le.fit_transform(labels)
        desc_labels = np.zeros(len(cur_data), dtype=np.uint32)
        for r in xrange(len(labels)):
            desc_labels[index_list[r]:index_list[r+1]] = labels[r]

        print 'loaded all', cur_data.shape
        if 'transform' in args.mode and args.mode != 'fit_transform':
            print 'first feature before:', cur_data[0]
            print 'dimension before:', cur_data.shape[1], cur_data.dtype
            cur_data = prep.transform(cur_data)
            print 'first feature after:', cur_data[0]
            print 'dimension after:', cur_data.shape[1], cur_data.dtype
        if 'fit' in args.mode:
            if 'transform' in args.mode and args.strip_aug:
                prep.strip_aug = False
            prep.fit(cur_data, labels=desc_labels)
            if args.mode == 'fit_transform':
                cur_data = prep.transform(cur_data)
    else:
        progress = progressbar.ProgressBar(widgets=widgets,
                                           maxval=len(files))
        if any(isinstance(f, tuple) for f in files):
            files1 = [f for f in zip(*files)[0]]
            cp = os.path.commonprefix(files1)
        else:
            cp = os.path.commonprefix(files)

        def proj(i):
            # n_samples x n_features
            if not isinstance(args.inputfolder, basestring) and \
               len(args.inputfolder) > 1 or args.inputfolders_suffix != '':
                cur_data = pc.loadMultipleDescriptors(files[i])
                if i == 0:
                    print 'loaded descs of', files[i]
                    print 'shape:', cur_data.shape
            else:
                cur_data = pc.loadDescriptors(files[i])

            if args.mode == 'fit':
                prep.partial_fit(cur_data)
                progress.update(i+1)
                return
            else:
                if i == 0:
                    print 'before:'
                    print cur_data[0]
                    print cur_data.shape, cur_data.dtype
                cur_data = prep.transform(cur_data)
                if i == 0:
                    print 'after:'
                    print cur_data[0, 0:min(128, cur_data.shape[1])]
                    print cur_data.shape, cur_data.dtype

            fname = files[i] if isinstance(files[i], basestring) \
                else files[i][0]
            if os.path.isdir(cp):
                fname = os.path.relpath(fname, cp)
            if fname.endswith('.pkl.gz'):
                name = fname.replace('.pkl.gz', '')
            else:
                name = os.path.splitext(fname)[0]
            if os.path.isdir(cp):
                pc.mkdir_p(os.path.join(args.outputfolder,
                                        os.path.dirname(name)),
                           silent=True)
            name = os.path.join(args.outputfolder, name + '_pr.pkl.gz')
            # print fname, '-->', name
            with gzip.open(name, 'wb') as F:
                cPickle.dump(cur_data, F, -1)
            progress.update(i+1)

        progress.start()
        # FIXME: np.dot (e.g. used for (R)PCA) doesnt work in parallel atm
        # if args.parallel:
        #     pc.parmap(proj, range(len(files)), args.nprocs)
        # else:
        map(proj, range(len(files)))
        progress.finish()

    prep.save_trafos(args.outputfolder)
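
# A minimal sketch, assuming (not confirmed by this repo) that the
# Preprocess object wraps something like sklearn's IncrementalPCA: in
# 'fit' mode, runHelper streams descriptor files through partial_fit so
# the whole corpus never has to fit into memory; 'transform' mode then
# projects each file. `load_file` and the IncrementalPCA choice are
# illustrative assumptions.
from sklearn.decomposition import IncrementalPCA

def stream_fit(file_list, load_file, n_components=64):
    ipca = IncrementalPCA(n_components=n_components)
    for path in file_list:
        # each batch must contain at least n_components samples
        descs = load_file(path)       # n_samples x n_features
        ipca.partial_fit(descs)
    return ipca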
                             'part, not the kmeans-initialization part)')
    parser.add_argument('--update', default='wmc',
                        help='what to update w. GMM, w:weights, m:means, c:covars')
    parser.add_argument('--covar_type', default='diag',
                        choices=['full', 'diag'],
                        help='covariance type for gmm')
    return parser

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Clustering - Create vocabulary")
    parser = pc.commonArguments(parser)
    parser = parserArguments(parser)
    args = parser.parse_args()

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, _ = pc.getFiles(args.inputfolder, args.suffix,
                           args.labelfile, exact=True)
    if not files or len(files) == 0:
        print 'getFiles() returned no images'
        sys.exit(1)

    # load features to train a universal background gmm
    print 'load features for training ubm from {} files'.format(len(files))
    descriptors = pc.loadDescriptors(
        files,
        max_descs=args.max_descriptors[0],
        max_descs_per_file=max(int(args.max_descriptors[0] / len(files)), 1),
        rand=True, \
def run(args, prep=None):
    if prep is None:
        prep = preprocess.Preprocess()

    if not os.path.exists(args.outputfolder):
        pc.mkdir_p(args.outputfolder)

    files, labels = pc.getFiles(args.inputfolder, args.suffix,
                                args.labelfile, exact=args.exact,
                                max_files=args.max_files)
    if files is None or len(files) == 0:
        print 'getFiles() returned no images'
        sys.exit(1)

    maskfiles = pc.getMaskFiles(files, args.suffix, args.maskfolder,
                                args.masksuffix)
    if len(args.max_descriptors) == 0:
        descriptors, rand_indices = pc.loadDescriptors(
            files, rand=True, return_random_indices=True)
    else:
        max_descs_per_file = int(args.max_descriptors[0] / float(len(files)))
        max_descs_per_file = max(max_descs_per_file, 1)
        descriptors, rand_indices = pc.loadDescriptors(
            files,
            max_descs=args.max_descriptors[0],
            max_descs_per_file=max_descs_per_file,
            rand=True, maskfiles=maskfiles,
            return_random_indices=True)
    print 'got {} features'.format(len(descriptors))
    print 'features.shape', descriptors.shape

    # load features to train a universal background gmm
    print 'load features for training ubm from {} files'.format(len(files))
    if args.method == 'posteriors':
        posteriors_files, _ = pc.getFiles(args.posteriors_dir,
                                          args.posteriors_suffix,
                                          labelfile=args.labelfile,
                                          exact=args.exact,
                                          max_files=args.max_files)
        assert(len(posteriors_files) == len(files))
        indices = []
        widgets = [progressbar.Percentage(), ' ', progressbar.Bar(), ' ',
                   progressbar.ETA()]
        progress = progressbar.ProgressBar(widgets=widgets,
                                           maxval=len(posteriors_files))
        progress.start()
        for e, f in enumerate(posteriors_files):
            posteriors = pc.loadDescriptors(f)
            posteriors = posteriors[rand_indices[e]]
            cluster_idx = posteriors.argmax(axis=1)
            indices.append(cluster_idx)
            progress.update(e + 1)
        progress.finish()
        indices = np.concatenate(indices)
        assert(len(indices) == len(descriptors))
        means = recomputeMeans(descriptors, indices)
        vocabulary = cluster.KMeans(means.shape[0])  # dummy
        vocabulary.means_ = means
        vocabulary.type_ = 'kmeans'
    else:
        vocabulary = computeVocabulary(descriptors, args.method,
                                       args.num_clusters, args.iterations,
                                       args.gmm_update, args.lib,
                                       args.covar_type, args.nprocs)

    # TODO: rewrite to be more generic
    if 'sparse' in args.method and 'gmm' in args.method:
        gmm = mixture.GMM(args.num_clusters, n_iter=args.iterations,
                          params=args.gmm_update, init_params='wc')
        gmm.means_ = vocabulary.reshape(args.num_clusters, -1)
        gmm.fit(descriptors)
        vocabulary = gmm

    if args.predict:
        pred = vocabulary.predict(descriptors)
        pred_prob = None
        if 'predict_proba' in dir(vocabulary):
            pred_prob = vocabulary.predict_proba(descriptors)
        for i, f in enumerate(files):
            if pred_prob is not None:
                print '{}\t[{}], ([{}])'.format(os.path.basename(f),
                                                pred[i], pred_prob[i])
            else:
                print '{}\t[{}]'.format(os.path.basename(f), pred[i])

    # save vocabulary (gmm / kmeans)
    voc_filepath = os.path.join(
        args.outputfolder,
        (args.vocabulary_filename if args.vocabulary_filename is not None
         else args.method) + '.pkl.gz')
    with gzip.open(voc_filepath, 'wb') as f:
        cPickle.dump(vocabulary, f, -1)
    print 'saved vocabulary at', voc_filepath

    if args.method == 'gmm':
        try:
            aic = vocabulary.aic(descriptors)
            print 'aic:', aic
            with open(os.path.join(args.outputfolder, 'aic.txt'), 'a') as f:
                f.write('{}\n'.format(aic))
        except Exception as e:
            raise
            # print('couldnt compute aic, error: {}'.format(e))

    return os.path.abspath(voc_filepath)
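
# `recomputeMeans` is called in run() above but not defined in this
# section. A minimal sketch of what it plausibly does, given the
# surrounding code (posterior argmax -> hard cluster indices): average
# all descriptors hard-assigned to each cluster. The name
# `recompute_means_sketch` marks this as an illustrative assumption.
def recompute_means_sketch(descriptors, indices):
    n_clusters = indices.max() + 1
    means = np.zeros((n_clusters, descriptors.shape[1]), dtype=np.float64)
    for c in xrange(n_clusters):
        sel = descriptors[indices == c]
        if len(sel) > 0:
            # mean of all descriptors assigned to cluster c
            means[c] = sel.mean(axis=0)
    return means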