def deploy(self, task_key=None, publish=False):
    """
    Train and save a classifier for deployment.

    Args:
        task_key: which classification task to deploy; passed through to
            ``_make_deploy_info`` (defaults to the problem's primary task).
        publish (bool): when True, rsync the deployment file and its
            metadata sidecar to the configured remote location.

    Returns:
        dict: the deployment payload (classifier plus metadata).

    Notes:
        A deployment consists of the following information
            * The classifier itself
            * Information needed to construct the input to the classifier
                - TODO: can this be encoded as an sklearn pipeline?
            * Metadata concerning what data the classifier was trained with
            * PUBLISH TO /media/hdd/PUBLIC/models/pairclf

    Example:
        >>> # xdoctest: +REQUIRES(module:wbia_cnn)
        >>> from wbia.algo.verif.vsone import *  # NOQA
        >>> params = dict(sample_method='random')
        >>> pblm = OneVsOneProblem.from_empty('PZ_MTEST', **params)
        >>> pblm.setup(with_simple=False)
        >>> task_key = pblm.primary_task_key
        >>> self = Deployer(dpath='.', pblm=pblm)
        >>> deploy_info = self.deploy()

    Ignore:
        pblm.evaluate_classifiers(with_simple=False)
        res = pblm.task_combo_res[pblm.primary_task_key]['RF']['learn(sum,glob)']
    """
    info = self._make_deploy_info(task_key=task_key)

    clf_fname = info['metadata']['fname']
    metadata_fname = clf_fname + self.meta_suffix
    clf_fpath = join(self.dpath, clf_fname)
    metadata_fpath = join(self.dpath, metadata_fname)

    # Write the metadata sidecar first, then the pickled deployment itself
    ut.save_json(metadata_fpath, info['metadata'])
    ut.save_data(clf_fpath, info)

    if publish:
        remote_uri = '{user}@{remote}:{path}'.format(
            user=ut.get_user_name(), **self.publish_info
        )
        # Push metadata before the (larger) deployment file, matching the
        # local write order above
        for fpath, fname in [(metadata_fpath, metadata_fname),
                             (clf_fpath, clf_fname)]:
            ut.rsync(fpath, remote_uri + '/' + fname)
    return info
def add_split(dataset, key, idxs):
    """
    Partition a subset of the dataset into a named split and write it to disk.

    Args:
        dataset: dataset object providing data/labels/metadata arrays,
            ``split_dpath``, and filename-format helpers
        key (str): name of the split (e.g. 'train', 'valid')
        idxs: indices of the rows belonging to this split

    Side effects:
        Writes data/labels/metadata files under ``dataset.split_dpath`` and
        registers them in ``dataset.fpath_dict[key]``.
    """
    print('[dataset] adding split %r' % (key,))
    # Build subset filenames
    ut.ensuredir(dataset.split_dpath)
    ext = dataset._ext
    fmtdict = dict(key=key, ext=ext, size=len(idxs))
    fmtstr = dataset.get_split_fmtstr(forward=True)
    splitset = {
        type_: join(dataset.split_dpath, fmtstr.format(type_=type_, **fmtdict))
        for type_ in ['data', 'labels', 'metadata']
    }
    # Partition data into the subset
    part_dict = {
        'data': dataset.data.take(idxs, axis=0),
        'labels': dataset.labels.take(idxs, axis=0),
    }
    if dataset.metadata is not None:
        taker = ut.partial(ut.take, index_list=idxs)
        part_dict['metadata'] = ut.map_dict_vals(taker, dataset.metadata)
    # Write splitset data to files
    # (iterate items directly instead of keys + re-lookup)
    for type_, part in part_dict.items():
        ut.save_data(splitset[type_], part)
    # Register filenames with dataset
    dataset.fpath_dict[key] = splitset
def save(dataset, data, labels, metadata=None, data_per_label=1):
    """
    Persist the full dataset (data, labels, optional metadata) plus inferred
    dataset info, and prime the in-memory subset caches.

    Args:
        dataset: dataset object providing target fpaths and subset caches
        data: full data array
        labels: full label array
        metadata: optional metadata mapping; when None, the 'full' metadata
            path is cleared instead of written
        data_per_label (int): number of data rows per label
    """
    ut.save_data(dataset.data_fpath, data)
    ut.save_data(dataset.labels_fpath, labels)
    if metadata is None:
        dataset.fpath_dict['full']['metadata'] = None
    else:
        ut.save_data(dataset.metadata_fpath, metadata)
    # Cache in memory because this data is likely going to be used to
    # define a splitset
    cache_pairs = [
        (dataset.subset_data, data),
        (dataset.subset_labels, labels),
        (dataset.subset_metadata, metadata),
    ]
    for subset, value in cache_pairs:
        subset.cache['full'] = value
    # Infer the rest of the required data info
    dataset._info['num_labels'] = len(labels)
    try:
        dataset._info['unique_labels'] = np.unique(labels)
    except ValueError:
        # labels that cannot be uniqued are marked unknown
        dataset._info['unique_labels'] = np.nan
    dataset._info['data_per_label'] = data_per_label
    ut.save_data(dataset.info_fpath, dataset._info)
def load_oxford_2013():
    """
    Load the Oxford5k descriptor/visual-word data distributed with the
    SMK ICCV 2013 publication.

    Found this data in README of SMK publication
    https://hal.inria.fr/hal-00864684/document
    http://people.rennes.inria.fr/Herve.Jegou/publications.html
    with download script

    CommandLine:
        # Download oxford13 data
        cd ~/work/Oxford
        mkdir -p smk_data_iccv_2013
        cd smk_data_iccv_2013
        wget -nH --cut-dirs=4 -r -Pdata/ ftp://ftp.irisa.fr/local/texmex/corpus/iccv2013/

    This dataset has 5063 images wheras 07 has 5062
    This dataset seems to contain an extra junk image:
        ashmolean_000214

    # Remember that matlab is 1 indexed!
    # DONT FORGET TO CONVERT TO 0 INDEXING!
    """
    from yael.ynumpy import fvecs_read
    from yael.yutils import load_ext
    import scipy.io
    import vtool as vt
    from os.path import join

    dbdir = ut.truepath('/raid/work/Oxford/')
    datadir = dbdir + '/smk_data_iccv_2013/data/'

    # we are not retraining, so this is unused
    # # Training data descriptors for Paris6k dataset
    # train_sift_fname = join(datadir, 'paris_sift.uint8')  # NOQA
    # # File storing visual words of Paris6k descriptors used in our ICCV paper
    # train_vw_fname = join(datadir, 'clust_preprocessed/oxford_train_vw.int32')

    # Pre-learned quantizer used in ICCV paper (used if docluster=false)
    codebook_fname = join(datadir, 'clust_preprocessed/oxford_codebook.fvecs')

    # Files storing descriptors/geometry for Oxford5k dataset
    test_sift_fname = join(datadir, 'oxford_sift.uint8')
    test_geom_fname = join(datadir, 'oxford_geom_sift.float')
    test_nf_fname = join(datadir, 'oxford_nsift.uint32')

    # File storing visual words of Oxford5k descriptors used in our ICCV paper
    test_vw_fname = join(datadir, 'clust_preprocessed/oxford_vw.int32')

    # Ground-truth for Oxford dataset
    gnd_fname = join(datadir, 'gnd_oxford.mat')

    oxford_vecs = load_ext(test_sift_fname, ndims=128, verbose=True)
    oxford_nfeats = load_ext(test_nf_fname, verbose=True)
    oxford_words = fvecs_read(codebook_fname)
    # matlab files are 1-indexed; shift visual-word ids to 0-indexing here
    oxford_wids = load_ext(test_vw_fname, verbose=True) - 1

    test_geom_invV_fname = test_geom_fname + '.invV.pkl'
    try:
        # Prefer the cached invV-format keypoints if a previous run saved them
        all_kpts = ut.load_data(test_geom_invV_fname)
        logger.info('loaded invV keypoints')
    except IOError:
        # Otherwise convert from the raw Z-format geometry and cache the result
        oxford_kptsZ = load_ext(test_geom_fname, ndims=5, verbose=True)
        logger.info('converting to invV keypoints')
        all_kpts = vt.convert_kptsZ_to_kpts(oxford_kptsZ)
        ut.save_data(test_geom_invV_fname, all_kpts)

    gnd_ox = scipy.io.loadmat(gnd_fname)
    imlist = [x[0][0] for x in gnd_ox['imlist']]
    # qidx is 1-indexed (matlab); convert to a 0-indexed query->data mapping
    qx_to_dx = gnd_ox['qidx'] - 1

    data_uri_order = imlist
    query_uri_order = ut.take(data_uri_order, qx_to_dx)

    # offset_list[i] is the index of the first feature of image i
    offset_list = np.hstack(([0], oxford_nfeats.cumsum())).astype(np.int64)

    # query_gnd = gnd_ox['gnd'][0][0]
    # bboxes = query_gnd[0]
    # qx_to_ok_gtidxs1 = [x[0] for x in query_gnd[1][0]]
    # qx_to_junk_gtidxs2 = [x[0] for x in query_gnd[2][0]]
    # # ut.depth_profile(qx_to_gtidxs1)
    # # ut.depth_profile(qx_to_gtidxs2)

    # Sanity checks: per-image counts, offsets, and word ids must all agree
    assert sum(oxford_nfeats) == len(oxford_vecs)
    assert offset_list[-1] == len(oxford_vecs)
    assert len(oxford_wids) == len(oxford_vecs)
    assert oxford_wids.max() == len(oxford_words) - 1

    data = {
        'offset_list': offset_list,
        'all_kpts': all_kpts,
        'all_vecs': oxford_vecs,
        'words': oxford_words,
        'idx_to_wx': oxford_wids,
        'data_uri_order': data_uri_order,
        'query_uri_order': query_uri_order,
    }
    return data
def load_oxford_2007():
    """
    Load (or build and cache) the Oxford5k data in the 2007 format.

    Loads data from
    http://www.robots.ox.ac.uk:5000/~vgg/publications/2007/Philbin07/philbin07.pdf

    Returns:
        dict: offsets, keypoints, SIFT vectors, word assignments, and the
            data/query uri orderings.

    >>> from wbia.algo.smk.script_smk import *  # NOQA
    """
    from os.path import join, basename, splitext
    import pandas as pd
    import vtool as vt

    dbdir = ut.truepath('/raid/work/Oxford/')
    data_fpath0 = join(dbdir, 'data_2007.pkl')

    # Short-circuit if a previous run already cached the parsed data
    if ut.checkpath(data_fpath0):
        data = ut.load_data(data_fpath0)
        return data
    else:
        word_dpath = join(dbdir, 'word_oxc1_hesaff_sift_16M_1M')
        _word_fpath_list = ut.ls(word_dpath)
        imgid_to_word_fpath = {
            splitext(basename(word_fpath))[0]: word_fpath
            for word_fpath in _word_fpath_list
        }
        readme_fpath = join(dbdir, 'README2.txt')
        # README2.txt lists the canonical image order after a 20-line header
        imgid_order = ut.readfrom(readme_fpath).split('\n')[20:-1]
        data_uri_order = [x.replace('oxc1_', '') for x in imgid_order]

        imgid_to_df = {}
        for imgid in ut.ProgIter(imgid_order, label='reading kpts'):
            word_fpath = imgid_to_word_fpath[imgid]
            # Each word file: 2 header lines then rows of
            # "word_id x y e11 e12 e22"
            row_gen = (
                map(float, line.strip('\n').split(' '))
                for line in ut.read_lines_from(word_fpath)[2:]
            )
            rows = [
                (int(word_id), x, y, e11, e12, e22)
                for (word_id, x, y, e11, e12, e22) in row_gen
            ]
            df = pd.DataFrame(
                rows, columns=['word_id', 'x', 'y', 'e11', 'e12', 'e22'])
            imgid_to_df[imgid] = df

        df_list = ut.take(imgid_to_df, imgid_order)

        nfeat_list = [len(df_) for df_ in df_list]
        offset_list = [0] + ut.cumsum(nfeat_list)
        shape = (offset_list[-1], 128)
        # shape = (16334970, 128)
        sift_fpath = join(dbdir, 'OxfordSIFTDescriptors',
                          'feat_oxc1_hesaff_sift.bin')
        # Context manager guarantees the handle closes even on error
        # (replaces the manual try/finally + close)
        with open(sift_fpath, 'rb') as file_:
            with ut.Timer('Reading SIFT binary file'):
                nbytes = np.prod(shape)
                # np.fromstring is deprecated for binary data;
                # np.frombuffer is the supported equivalent (read-only view)
                all_vecs = np.frombuffer(
                    file_.read(nbytes), dtype=np.uint8)
            all_vecs = all_vecs.reshape(shape)

        kpts_list = [
            df_.loc[:, ('x', 'y', 'e11', 'e12', 'e22')].values
            for df_ in df_list
        ]
        wordid_list = [df_.loc[:, 'word_id'].values for df_ in df_list]
        kpts_Z = np.vstack(kpts_list)
        idx_to_wx = np.hstack(wordid_list)
        # assert len(np.unique(idx_to_wx)) == 1E6

        # Reqd standard query order
        query_files = sorted(
            ut.glob(dbdir + '/oxford_groundtruth', '*_query.txt'))
        query_uri_order = []
        for qpath in query_files:
            text = ut.readfrom(qpath, verbose=0)
            query_uri = text.split(' ')[0].replace('oxc1_', '')
            query_uri_order.append(query_uri)

        logger.info('converting to invV')
        all_kpts = vt.convert_kptsZ_to_kpts(kpts_Z)

        data = {
            'offset_list': offset_list,
            'all_kpts': all_kpts,
            'all_vecs': all_vecs,
            'idx_to_wx': idx_to_wx,
            'data_uri_order': data_uri_order,
            'query_uri_order': query_uri_order,
        }
        ut.save_data(data_fpath0, data)
    return data
def merge_datasets(dataset_list):
    """
    Merges a list of dataset objects into a single combined dataset.

    Args:
        dataset_list: datasets to combine; all must agree on ``data_shape``
            and ``data_per_label`` (enforced by a consensus check).

    Returns:
        DataSet: the merged dataset (loaded from cache when the alias
            already exists).
    """
    def consensus_check_factory():
        """
        Returns a temporary function used to check that all incoming values
        with the same key are consistent
        """
        from collections import defaultdict
        past_values = defaultdict(lambda: None)

        def consensus_check(value, key):
            assert past_values[key] is None or past_values[key] == value, (
                'key=%r with value=%r does not agree with past_value=%r'
                % (key, value, past_values[key]))
            past_values[key] = value
            return value
        return consensus_check

    total_num_labels = 0
    total_num_data = 0

    input_alias_list = [dataset.alias_key for dataset in dataset_list]

    alias_key = 'combo_' + ut.hashstr27(repr(input_alias_list), hashlen=8)
    training_dpath = ut.ensure_app_resource_dir(
        'ibeis_cnn', 'training', alias_key)
    data_fpath = ut.unixjoin(training_dpath, alias_key + '_data.hdf5')
    labels_fpath = ut.unixjoin(training_dpath, alias_key + '_labels.hdf5')

    try:
        # Try and short circuit cached loading
        merged_dataset = DataSet.from_alias_key(alias_key)
        return merged_dataset
    except (Exception, AssertionError) as ex:
        ut.printex(ex, 'alias definitions have changed. alias_key=%r' %
                   (alias_key,), iswarning=True)

    # Build the dataset
    consensus_check = consensus_check_factory()

    for dataset in dataset_list:
        print(ut.get_file_nBytes_str(dataset.data_fpath))
        print(dataset.data_fpath_dict['full'])
        print(dataset.num_labels)
        print(dataset.data_per_label)
        total_num_labels += dataset.num_labels
        total_num_data += dataset.data_per_label * dataset.num_labels
        # check that all data_dims agree
        data_shape = consensus_check(dataset.data_shape, 'data_shape')
        data_per_label = consensus_check(
            dataset.data_per_label, 'data_per_label')

    # hack record this
    import numpy as np
    data_dtype = np.uint8
    label_dtype = np.int32

    # Pre-allocate the merged arrays and fill them slice by slice
    data = np.empty((total_num_data,) + data_shape, dtype=data_dtype)
    labels = np.empty(total_num_labels, dtype=label_dtype)

    data_left = 0
    labels_left = 0
    for dataset in ut.ProgressIter(dataset_list,
                                   lbl='combining datasets', freq=1):
        X_all, y_all = dataset.subset('full')
        labels_right = labels_left + y_all.shape[0]
        data_right = data_left + X_all.shape[0]
        data[data_left:data_right] = X_all
        labels[labels_left:labels_right] = y_all
        data_left = data_right
        labels_left = labels_right

    ut.save_data(data_fpath, data)
    ut.save_data(labels_fpath, labels)

    # Reload labels from disk so num_labels reflects what was written
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    merged_dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=None,
        training_dpath=training_dpath,
        data_shape=data_shape,
        data_per_label=data_per_label,
        output_dims=1,
        num_labels=num_labels,
    )
    return merged_dataset
def grab_liberty_siam_dataset(pairs=250000):
    """
    Download (if needed) and ingest the Liberty patch-pair dataset.

    Args:
        pairs (int): number of patch pairs; must be one of
            500, 50000, 100000, 250000.

    Returns:
        DataSet: training set of 64x64x1 patches, two patches per label.

    References:
        http://www.cs.ubc.ca/~mbrown/patchdata/patchdata.html
        https://github.com/osdf/datasets/blob/master/patchdata/dataset.py

    Notes:
        "info.txt" contains the match information
        Each row of info.txt corresponds corresponds to a separate patch,
        with the patches ordered from left to right and top to bottom in each
        bitmap image.

        3 types of metadata files

        info.txt - contains patch ids that correspond with the order of patches
          in the bmp images
          In the format:
              pointid, unused

        interest.txt -
            interest points corresponding to patches with patchids
            has same number of rows as info.txt
            In the format:
                reference image id, x, y, orientation, scale (in log2 units)

        m50_<d>_<d>_0.txt -
             matches files
             patchID1  3DpointID1  unused1  patchID2  3DpointID2  unused2

    CommandLine:
        python -m ibeis_cnn.ingest_data --test-grab_liberty_siam_dataset --show

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis_cnn.ingest_data import *  # NOQA
        >>> pairs = 500
        >>> dataset = grab_liberty_siam_dataset(pairs)
        >>> ut.quit_if_noshow()
        >>> from ibeis_cnn import draw_results
        >>> #ibsplugin.rrr()
        >>> flat_metadata = {}
        >>> data, labels = dataset.subset('full')
        >>> ut.quit_if_noshow()
        >>> warped_patch1_list = data[::2]
        >>> warped_patch2_list = data[1::2]
        >>> dataset.interact()
        >>> ut.show_if_requested()
    """
    datakw = {
        'detector': 'dog',
        'pairs': pairs,
    }

    assert datakw['detector'] in ['dog', 'harris']
    assert pairs in [500, 50000, 100000, 250000]

    liberty_urls = {
        'dog': 'http://www.cs.ubc.ca/~mbrown/patchdata/liberty.zip',
        'harris': 'http://www.cs.ubc.ca/~mbrown/patchdata/liberty_harris.zip',
    }
    url = liberty_urls[datakw['detector']]
    ds_path = ut.grab_zipped_url(url)

    ds_name = splitext(basename(ds_path))[0]
    alias_key = 'liberty;' + ut.dict_str(datakw, nl=False, explicit=True)
    # Only the values feed the cfgstr; the key is unused in the join
    cfgstr = ','.join([str(val) for _key, val in ut.iteritems_sorted(datakw)])

    # TODO: allow a move of the base data prefix

    training_dpath = ut.ensure_app_resource_dir(
        'ibeis_cnn', 'training', ds_name)
    if ut.get_argflag('--vtd'):
        ut.vd(training_dpath)
    ut.ensuredir(training_dpath)

    data_fpath = join(training_dpath, 'liberty_data_' + cfgstr + '.pkl')
    labels_fpath = join(training_dpath, 'liberty_labels_' + cfgstr + '.pkl')
    if not ut.checkpath(data_fpath, verbose=True):
        data, labels = ingest_helpers.extract_liberty_style_patches(
            ds_path, pairs)
        ut.save_data(data_fpath, data)
        ut.save_data(labels_fpath, labels)

    # hack for caching num_labels
    labels = ut.load_data(labels_fpath)
    num_labels = len(labels)

    dataset = DataSet.new_training_set(
        alias_key=alias_key,
        data_fpath=data_fpath,
        labels_fpath=labels_fpath,
        metadata_fpath=None,
        training_dpath=training_dpath,
        data_shape=(64, 64, 1),
        data_per_label=2,
        output_dims=1,
        num_labels=num_labels,
    )
    return dataset