def stack_graphs(graph_list, vert=False, pad=None):
    graph_list_ = [g.copy() for g in graph_list]
    for g in graph_list_:
        translate_graph_to_origin(g)
    bbox_list = [ut.get_graph_bounding_box(g) for g in graph_list_]
    if vert:
        dim1 = 3
        dim2 = 2
    else:
        dim1 = 2
        dim2 = 3
    dim1_list = np.array([bbox[dim1] for bbox in bbox_list])
    dim2_list = np.array([bbox[dim2] for bbox in bbox_list])
    if pad is None:
        pad = np.mean(dim1_list) / 2
    offset1_list = ut.cumsum([0] + [d + pad for d in dim1_list[:-1]])
    max_dim2 = max(dim2_list)
    offset2_list = [(max_dim2 - d2) / 2 for d2 in dim2_list]
    if vert:
        t_xy_list = [(d2, d1) for d1, d2 in zip(offset1_list, offset2_list)]
    else:
        t_xy_list = [(d1, d2) for d1, d2 in zip(offset1_list, offset2_list)]
    for g, t_xy in zip(graph_list_, t_xy_list):
        ut.translate_graph(g, t_xy)
        nx.set_node_attributes(g, name='pin', values='true')
    new_graph = nx.compose_all(graph_list_)
    # pt.show_nx(new_graph, layout='custom', node_labels=False, as_directed=False)  # NOQA
    return new_graph
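
# Illustrative sketch (not part of the original module; hypothetical helper name)
# of the offset logic in stack_graphs: each graph is shifted along the stacking
# axis by the cumulative extent of the previous graphs plus padding, and centered
# on the other axis.
def _demo_stack_offsets():
    import numpy as np
    widths = np.array([3.0, 5.0, 2.0])    # per-graph extent along the stack axis
    heights = np.array([2.0, 4.0, 1.0])   # per-graph extent along the other axis
    pad = float(np.mean(widths) / 2)
    offset1 = np.cumsum([0] + [w + pad for w in widths[:-1]])  # stack-axis shifts
    offset2 = [(heights.max() - h) / 2 for h in heights]       # centering shifts
    return list(zip(offset1, offset2))  # (x, y) translation per graph (vert=False)
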
def accumulate_input_ids(edge_list):
    """
    python -m dtool.example_depcache2 testdata_depc4 --show
    """
    edge_data = ut.take_column(edge_list, 3)
    # We are accumulating local input ids
    toaccum_list_ = ut.dict_take_column(edge_data, 'local_input_id')
    if BIG_HACK and True:
        v_list = ut.take_column(edge_list, 1)
        # show the local_input_ids at the entire level
        pred_ids = [
            [
                x['local_input_id']
                for x in list(graph.pred[node].values())[0].values()
            ] if len(graph.pred[node]) else []
            for node in v_list
        ]
        toaccum_list = [
            x + ':' + ';'.join(y) for x, y in zip(toaccum_list_, pred_ids)
        ]
    else:
        toaccum_list = toaccum_list_
    # Default dumb accumulation
    accum_ids_ = ut.cumsum(zip(toaccum_list), tuple())
    accum_ids = ut.lmap(condense_accum_ids, accum_ids_)
    if BIG_HACK:
        accum_ids = ut.lmap(condense_accum_ids_stars, accum_ids)
        accum_ids = [('t',) + x for x in accum_ids]
    ut.dict_set_column(edge_data, 'accum_id', accum_ids)
    return accum_ids
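
# Illustrative sketch (not part of the original module; hypothetical helper name)
# of the "dumb accumulation" above: a running tuple of all local input ids seen
# so far, shown here with the standard library as a rough analogue of
# ut.cumsum(zip(ids), tuple()).
def _demo_accumulate_ids():
    import itertools
    ids = ['1', '1', '2']
    accum = list(itertools.accumulate(zip(ids), lambda acc, x: acc + x))
    # accum == [('1',), ('1', '1'), ('1', '1', '2')]
    return accum
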
def update(self, iterable):
    _lists = self._lists
    values = list(iterable)
    _load = self._load
    _lists.extend(values[pos:(pos + _load)]
                  for pos in range(0, len(values), _load))
    self._cumlen = ut.cumsum(map(len, _lists))
    self._len = len(values)
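
# Illustrative sketch (not part of the original class; hypothetical helper name):
# with values chunked into fixed-size sublists, the cumulative lengths let a flat
# index be mapped to (sublist, offset) with a single bisect.
def _demo_chunked_lookup():
    import bisect
    import itertools
    values = list(range(10))
    load = 4
    lists = [values[pos:pos + load] for pos in range(0, len(values), load)]
    cumlen = list(itertools.accumulate(map(len, lists)))  # [4, 8, 10]
    idx = 6
    outer = bisect.bisect_right(cumlen, idx)               # which sublist
    inner = idx - (cumlen[outer - 1] if outer else 0)      # offset within it
    assert lists[outer][inner] == values[idx]
    return lists[outer][inner]
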
def run_asmk_script():
    with ut.embed_on_exception_context:  # NOQA
        """
        >>> from wbia.algo.smk.script_smk import *
        """  # NOQA

        # ==============================================
        # PREPROCESSING CONFIGURATION
        # ==============================================
        config = {
            # 'data_year': 2013,
            'data_year': None,
            'dtype': 'float32',
            # 'root_sift': True,
            'root_sift': False,
            # 'centering': True,
            'centering': False,
            'num_words': 2**16,
            # 'num_words': 1E6
            # 'num_words': 8000,
            'kmeans_impl': 'sklearn.mini',
            'extern_words': False,
            'extern_assign': False,
            'assign_algo': 'kdtree',
            'checks': 1024,
            'int_rvec': True,
            'only_xy': False,
        }
        # Define which params are relevant for which operations
        relevance = {}
        relevance['feats'] = ['dtype', 'root_sift', 'centering', 'data_year']
        relevance['words'] = relevance['feats'] + [
            'num_words',
            'extern_words',
            'kmeans_impl',
        ]
        relevance['assign'] = relevance['words'] + [
            'checks',
            'extern_assign',
            'assign_algo',
        ]
        # relevance['ydata'] = relevance['assign'] + ['int_rvec']
        # relevance['xdata'] = relevance['assign'] + ['only_xy', 'int_rvec']

        nAssign = 1

        class SMKCacher(ut.Cacher):
            def __init__(self, fname, ext='.cPkl'):
                relevant_params = relevance[fname]
                relevant_cfg = ut.dict_subset(config, relevant_params)
                cfgstr = ut.get_cfg_lbl(relevant_cfg)
                dbdir = ut.truepath('/raid/work/Oxford/')
                super(SMKCacher, self).__init__(
                    fname, cfgstr, cache_dir=dbdir, ext=ext)

        # ==============================================
        # LOAD DATASET, EXTRACT AND POSTPROCESS FEATURES
        # ==============================================
        if config['data_year'] == 2007:
            data = load_oxford_2007()
        elif config['data_year'] == 2013:
            data = load_oxford_2013()
        elif config['data_year'] is None:
            data = load_oxford_wbia()

        offset_list = data['offset_list']
        all_kpts = data['all_kpts']
        raw_vecs = data['all_vecs']
        query_uri_order = data['query_uri_order']
        data_uri_order = data['data_uri_order']
        # del data

        # ================
        # PRE-PROCESS
        # ================
        import vtool as vt

        # Alias names to avoid errors in interactive sessions
        proc_vecs = raw_vecs
        del raw_vecs

        feats_cacher = SMKCacher('feats', ext='.npy')
        all_vecs = feats_cacher.tryload()
        if all_vecs is None:
            if config['dtype'] == 'float32':
                logger.info('Converting vecs to float32')
                proc_vecs = proc_vecs.astype(np.float32)
            else:
                proc_vecs = proc_vecs
                raise NotImplementedError('other dtype')

            if config['root_sift']:
                with ut.Timer('Apply root sift'):
                    np.sqrt(proc_vecs, out=proc_vecs)
                    vt.normalize(proc_vecs, ord=2, axis=1, out=proc_vecs)

            if config['centering']:
                with ut.Timer('Apply centering'):
                    mean_vec = np.mean(proc_vecs, axis=0)
                    # Center and then re-normalize
                    np.subtract(proc_vecs, mean_vec[None, :], out=proc_vecs)
                    vt.normalize(proc_vecs, ord=2, axis=1, out=proc_vecs)

            if config['dtype'] == 'int8':
                smk_funcs

            all_vecs = proc_vecs
            feats_cacher.save(all_vecs)
        del proc_vecs

        # =====================================
        # BUILD VISUAL VOCABULARY
        # =====================================
        if config['extern_words']:
            words = data['words']
            assert config['num_words'] is None or len(words) == config['num_words']
        else:
            word_cacher = SMKCacher('words')
            words = word_cacher.tryload()
            if words is None:
                with ut.embed_on_exception_context:
                    if config['kmeans_impl'] == 'sklearn.mini':
                        import sklearn.cluster
                        rng = np.random.RandomState(13421421)
                        # init_size = int(config['num_words'] * 8)
                        init_size = int(config['num_words'] * 4)
                        # converged after 26043 iterations
                        clusterer = sklearn.cluster.MiniBatchKMeans(
                            config['num_words'],
                            init_size=init_size,
                            batch_size=1000,
                            compute_labels=False,
                            max_iter=20,
                            random_state=rng,
                            n_init=1,
                            verbose=1,
                        )
                        clusterer.fit(all_vecs)
                        words = clusterer.cluster_centers_
                    elif config['kmeans_impl'] == 'yael':
                        from yael import ynumpy
                        centroids, qerr, dis, assign, nassign = ynumpy.kmeans(
                            all_vecs,
                            config['num_words'],
                            init='kmeans++',
                            verbose=True,
                            output='all',
                        )
                        words = centroids
                    word_cacher.save(words)

        # =====================================
        # ASSIGN EACH VECTOR TO ITS NEAREST WORD
        # =====================================
        if config['extern_assign']:
            assert config['extern_words'], 'need extern cluster to extern assign'
            idx_to_wxs = vt.atleast_nd(data['idx_to_wx'], 2)
            idx_to_maws = np.ones(idx_to_wxs.shape, dtype=np.float32)
            idx_to_wxs = np.ma.array(idx_to_wxs)
            idx_to_maws = np.ma.array(idx_to_maws)
        else:
            from wbia.algo.smk import vocab_indexer
            vocab = vocab_indexer.VisualVocab(words)
            dassign_cacher = SMKCacher('assign')
            assign_tup = dassign_cacher.tryload()
            if assign_tup is None:
                vocab.flann_params['algorithm'] = config['assign_algo']
                vocab.build()
                # Takes 12 minutes to assign jegous vecs to 2**16 vocab
                with ut.Timer('assign vocab neighbors'):
                    _idx_to_wx, _idx_to_wdist = vocab.nn_index(
                        all_vecs, nAssign, checks=config['checks'])
                    if nAssign > 1:
                        idx_to_wxs, idx_to_maws = smk_funcs.weight_multi_assigns(
                            _idx_to_wx,
                            _idx_to_wdist,
                            massign_alpha=1.2,
                            massign_sigma=80.0,
                            massign_equal_weights=True,
                        )
                    else:
                        idx_to_wxs = np.ma.masked_array(_idx_to_wx, fill_value=-1)
                        idx_to_maws = np.ma.ones(
                            idx_to_wxs.shape, fill_value=-1, dtype=np.float32)
                        idx_to_maws.mask = idx_to_wxs.mask
                assign_tup = (idx_to_wxs, idx_to_maws)
                dassign_cacher.save(assign_tup)
            idx_to_wxs, idx_to_maws = assign_tup

        # Breakup vectors, keypoints, and word assignments by annotation
        wx_lists = [
            idx_to_wxs[left:right] for left, right in ut.itertwo(offset_list)
        ]
        maw_lists = [
            idx_to_maws[left:right] for left, right in ut.itertwo(offset_list)
        ]
        vecs_list = [
            all_vecs[left:right] for left, right in ut.itertwo(offset_list)
        ]
        kpts_list = [
            all_kpts[left:right] for left, right in ut.itertwo(offset_list)
        ]

        # =======================
        # FIND QUERY SUBREGIONS
        # =======================
        ibs, query_annots, data_annots, qx_to_dx = load_ordered_annots(
            data_uri_order, query_uri_order)
        daids = data_annots.aids
        qaids = query_annots.aids

        query_super_kpts = ut.take(kpts_list, qx_to_dx)
        query_super_vecs = ut.take(vecs_list, qx_to_dx)
        query_super_wxs = ut.take(wx_lists, qx_to_dx)
        query_super_maws = ut.take(maw_lists, qx_to_dx)

        # Mark which keypoints are within the bbox of the query
        query_flags_list = []
        only_xy = config['only_xy']
        for kpts_, bbox in zip(query_super_kpts, query_annots.bboxes):
            flags = kpts_inside_bbox(kpts_, bbox, only_xy=only_xy)
            query_flags_list.append(flags)

        logger.info('Queries are crops of existing database images.')
        logger.info('Looking at average percents')
        percent_list = [
            flags_.sum() / flags_.shape[0] for flags_ in query_flags_list
        ]
        percent_stats = ut.get_stats(percent_list)
        logger.info('percent_stats = %s' % (ut.repr4(percent_stats),))

        import vtool as vt
        query_kpts = vt.zipcompress(query_super_kpts, query_flags_list, axis=0)
        query_vecs = vt.zipcompress(query_super_vecs, query_flags_list, axis=0)
        query_wxs = vt.zipcompress(query_super_wxs, query_flags_list, axis=0)
        query_maws = vt.zipcompress(query_super_maws, query_flags_list, axis=0)

        # =======================
        # CONSTRUCT QUERY / DATABASE REPR
        # =======================
        # int_rvec = not config['dtype'].startswith('float')
        int_rvec = config['int_rvec']

        X_list = []
        _prog = ut.ProgPartial(length=len(qaids), label='new X',
                               bs=True, adjust=True)
        for aid, fx_to_wxs, fx_to_maws in _prog(
                zip(qaids, query_wxs, query_maws)):
            X = new_external_annot(aid, fx_to_wxs, fx_to_maws, int_rvec)
            X_list.append(X)

        # ydata_cacher = SMKCacher('ydata')
        # Y_list = ydata_cacher.tryload()
        # if Y_list is None:
        Y_list = []
        _prog = ut.ProgPartial(length=len(daids), label='new Y',
                               bs=True, adjust=True)
        for aid, fx_to_wxs, fx_to_maws in _prog(zip(daids, wx_lists, maw_lists)):
            Y = new_external_annot(aid, fx_to_wxs, fx_to_maws, int_rvec)
            Y_list.append(Y)
        # ydata_cacher.save(Y_list)

        # ======================
        # Add in some groundtruth
        logger.info('Add in some groundtruth')
        for Y, nid in zip(Y_list, ibs.get_annot_nids(daids)):
            Y.nid = nid
        for X, nid in zip(X_list, ibs.get_annot_nids(qaids)):
            X.nid = nid
        for Y, qual in zip(Y_list, ibs.get_annot_quality_texts(daids)):
            Y.qual = qual

        # ======================
        # Add in other properties
        for Y, vecs, kpts in zip(Y_list, vecs_list, kpts_list):
            Y.vecs = vecs
            Y.kpts = kpts

        imgdir = ut.truepath('/raid/work/Oxford/oxbuild_images')
        for Y, imgid in zip(Y_list, data_uri_order):
            gpath = ut.unixjoin(imgdir, imgid + '.jpg')
            Y.gpath = gpath

        for X, vecs, kpts in zip(X_list, query_vecs, query_kpts):
            X.kpts = kpts
            X.vecs = vecs

        # ======================
        logger.info('Building inverted list')
        daids = [Y.aid for Y in Y_list]
        # wx_list = sorted(ut.list_union(*[Y.wx_list for Y in Y_list]))
        wx_list = sorted(set.union(*[Y.wx_set for Y in Y_list]))
        assert daids == data_annots.aids
        assert len(wx_list) <= config['num_words']

        wx_to_aids = smk_funcs.invert_lists(
            daids, [Y.wx_list for Y in Y_list], all_wxs=wx_list)

        # Compute IDF weights
        logger.info('Compute IDF weights')
        ndocs_total = len(daids)
        # Use only the unique number of words
        ndocs_per_word = np.array([len(set(wx_to_aids[wx])) for wx in wx_list])
        logger.info('ndocs_perword stats: '
                    + ut.repr4(ut.get_stats(ndocs_per_word)))
        idf_per_word = smk_funcs.inv_doc_freq(ndocs_total, ndocs_per_word)
        wx_to_weight = dict(zip(wx_list, idf_per_word))
        logger.info('idf stats: '
                    + ut.repr4(ut.get_stats(wx_to_weight.values())))

        # Filter junk
        Y_list_ = [Y for Y in Y_list if Y.qual != 'junk']

        # =======================
        # CHOOSE QUERY KERNEL
        # =======================
        params = {
            'asmk': dict(alpha=3.0, thresh=0.0),
            'bow': dict(),
            'bow2': dict(),
        }
        # method = 'bow'
        method = 'bow2'
        method = 'asmk'
        smk = SMK(wx_to_weight, method=method, **params[method])

        # Specific info for the type of query
        if method == 'asmk':
            # Make residual vectors
            if True:
                # The stacked way is 50x faster
                # TODO: extend for multi-assignment and record fxs
                flat_query_vecs = np.vstack(query_vecs)
                flat_query_wxs = np.vstack(query_wxs)
                flat_query_offsets = np.array(
                    [0] + ut.cumsum(ut.lmap(len, query_wxs)))

                flat_wxs_assign = flat_query_wxs
                flat_offsets = flat_query_offsets
                flat_vecs = flat_query_vecs
                tup = smk_funcs.compute_stacked_agg_rvecs(
                    words, flat_wxs_assign, flat_vecs, flat_offsets)
                all_agg_vecs, all_error_flags, agg_offset_list = tup
                if int_rvec:
                    all_agg_vecs = smk_funcs.cast_residual_integer(all_agg_vecs)
                agg_rvecs_list = [
                    all_agg_vecs[left:right]
                    for left, right in ut.itertwo(agg_offset_list)
                ]
                agg_flags_list = [
                    all_error_flags[left:right]
                    for left, right in ut.itertwo(agg_offset_list)
                ]

                for X, agg_rvecs, agg_flags in zip(
                        X_list, agg_rvecs_list, agg_flags_list):
                    X.agg_rvecs = agg_rvecs
                    X.agg_flags = agg_flags[:, None]

                flat_wxs_assign = idx_to_wxs
                flat_offsets = offset_list
                flat_vecs = all_vecs
                tup = smk_funcs.compute_stacked_agg_rvecs(
                    words, flat_wxs_assign, flat_vecs, flat_offsets)
                all_agg_vecs, all_error_flags, agg_offset_list = tup
                if int_rvec:
                    all_agg_vecs = smk_funcs.cast_residual_integer(all_agg_vecs)

                agg_rvecs_list = [
                    all_agg_vecs[left:right]
                    for left, right in ut.itertwo(agg_offset_list)
                ]
                agg_flags_list = [
                    all_error_flags[left:right]
                    for left, right in ut.itertwo(agg_offset_list)
                ]

                for Y, agg_rvecs, agg_flags in zip(
                        Y_list, agg_rvecs_list, agg_flags_list):
                    Y.agg_rvecs = agg_rvecs
                    Y.agg_flags = agg_flags[:, None]
            else:
                # This non-stacked way is about 500x slower
                _prog = ut.ProgPartial(label='agg Y rvecs', bs=True, adjust=True)
                for Y in _prog(Y_list_):
                    make_agg_vecs(Y, words, Y.vecs)

                _prog = ut.ProgPartial(label='agg X rvecs', bs=True, adjust=True)
                for X in _prog(X_list):
                    make_agg_vecs(X, words, X.vecs)
        elif method == 'bow2':
            # Hack for orig tf-idf bow vector
            nwords = len(words)
            for X in ut.ProgIter(X_list, label='make bow vector'):
                ensure_tf(X)
                bow_vector(X, wx_to_weight, nwords)

            for Y in ut.ProgIter(Y_list_, label='make bow vector'):
                ensure_tf(Y)
                bow_vector(Y, wx_to_weight, nwords)

        if method != 'bow2':
            for X in ut.ProgIter(X_list, 'compute X gamma'):
                X.gamma = smk.gamma(X)
            for Y in ut.ProgIter(Y_list_, 'compute Y gamma'):
                Y.gamma = smk.gamma(Y)

        # Execute matches (could go faster by enumerating candidates)
        scores_list = []
        for X in ut.ProgIter(X_list, label='query %s' % (smk,)):
            scores = [smk.kernel(X, Y) for Y in Y_list_]
            scores = np.array(scores)
            scores = np.nan_to_num(scores)
            scores_list.append(scores)

        import sklearn.metrics
        avep_list = []
        _iter = list(zip(scores_list, X_list))
        _iter = ut.ProgIter(_iter, label='evaluate %s' % (smk,))
        for scores, X in _iter:
            truth = [X.nid == Y.nid for Y in Y_list_]
            avep = sklearn.metrics.average_precision_score(truth, scores)
            avep_list.append(avep)
        avep_list = np.array(avep_list)
        mAP = np.mean(avep_list)
        logger.info('mAP = %r' % (mAP,))
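
# Illustrative sketch (not part of the original script; hypothetical helper name)
# of the evaluation step above: per-query average precision from binary relevance
# labels and scores, then mAP as their mean.  Toy data only.
def _demo_map_evaluation():
    import numpy as np
    import sklearn.metrics

    scores_list = [np.array([0.9, 0.1, 0.4]), np.array([0.2, 0.8, 0.3])]
    truth_list = [[True, False, True], [False, True, False]]
    avep_list = [
        sklearn.metrics.average_precision_score(truth, scores)
        for truth, scores in zip(truth_list, scores_list)
    ]
    return float(np.mean(avep_list))  # mAP over the toy queries
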
def load_oxford_2007():
    """
    Loads data from
    http://www.robots.ox.ac.uk:5000/~vgg/publications/2007/Philbin07/philbin07.pdf

    >>> from wbia.algo.smk.script_smk import *  # NOQA
    """
    from os.path import join, basename, splitext
    import pandas as pd
    import vtool as vt

    dbdir = ut.truepath('/raid/work/Oxford/')
    data_fpath0 = join(dbdir, 'data_2007.pkl')

    if ut.checkpath(data_fpath0):
        data = ut.load_data(data_fpath0)
        return data
    else:
        word_dpath = join(dbdir, 'word_oxc1_hesaff_sift_16M_1M')
        _word_fpath_list = ut.ls(word_dpath)
        imgid_to_word_fpath = {
            splitext(basename(word_fpath))[0]: word_fpath
            for word_fpath in _word_fpath_list
        }
        readme_fpath = join(dbdir, 'README2.txt')
        imgid_order = ut.readfrom(readme_fpath).split('\n')[20:-1]
        imgid_order = imgid_order
        data_uri_order = [x.replace('oxc1_', '') for x in imgid_order]

        imgid_to_df = {}
        for imgid in ut.ProgIter(imgid_order, label='reading kpts'):
            word_fpath = imgid_to_word_fpath[imgid]
            row_gen = (map(float, line.strip('\n').split(' '))
                       for line in ut.read_lines_from(word_fpath)[2:])
            rows = [(int(word_id), x, y, e11, e12, e22)
                    for (word_id, x, y, e11, e12, e22) in row_gen]
            df = pd.DataFrame(
                rows, columns=['word_id', 'x', 'y', 'e11', 'e12', 'e22'])
            imgid_to_df[imgid] = df

        df_list = ut.take(imgid_to_df, imgid_order)

        nfeat_list = [len(df_) for df_ in df_list]
        offset_list = [0] + ut.cumsum(nfeat_list)
        shape = (offset_list[-1], 128)
        # shape = (16334970, 128)
        sift_fpath = join(dbdir, 'OxfordSIFTDescriptors',
                          'feat_oxc1_hesaff_sift.bin')
        try:
            file_ = open(sift_fpath, 'rb')
            with ut.Timer('Reading SIFT binary file'):
                nbytes = np.prod(shape)
                all_vecs = np.fromstring(file_.read(nbytes), dtype=np.uint8)
                all_vecs = all_vecs.reshape(shape)
        finally:
            file_.close()

        kpts_list = [
            df_.loc[:, ('x', 'y', 'e11', 'e12', 'e22')].values for df_ in df_list
        ]
        wordid_list = [df_.loc[:, 'word_id'].values for df_ in df_list]
        kpts_Z = np.vstack(kpts_list)
        idx_to_wx = np.hstack(wordid_list)
        # assert len(np.unique(idx_to_wx)) == 1E6

        # Reqd standard query order
        query_files = sorted(
            ut.glob(dbdir + '/oxford_groundtruth', '*_query.txt'))
        query_uri_order = []
        for qpath in query_files:
            text = ut.readfrom(qpath, verbose=0)
            query_uri = text.split(' ')[0].replace('oxc1_', '')
            query_uri_order.append(query_uri)

        logger.info('converting to invV')
        all_kpts = vt.convert_kptsZ_to_kpts(kpts_Z)

        data = {
            'offset_list': offset_list,
            'all_kpts': all_kpts,
            'all_vecs': all_vecs,
            'idx_to_wx': idx_to_wx,
            'data_uri_order': data_uri_order,
            'query_uri_order': query_uri_order,
        }
        ut.save_data(data_fpath0, data)
        return data
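
# Illustrative sketch (not part of the original loader; hypothetical helper name):
# the returned offset_list delimits each image's rows in the stacked arrays, so
# per-image features are recovered by slicing between consecutive offsets.
def _demo_offset_slicing():
    import numpy as np
    nfeat_list = [3, 5, 2]                                # features per image
    offset_list = [0] + np.cumsum(nfeat_list).tolist()    # [0, 3, 8, 10]
    all_vecs = np.zeros((offset_list[-1], 128), dtype=np.uint8)
    vecs_list = [all_vecs[l:r]
                 for l, r in zip(offset_list[:-1], offset_list[1:])]
    assert [len(v) for v in vecs_list] == nfeat_list
    return vecs_list
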
def compute_stacked_agg_rvecs(words, flat_wxs_assign, flat_vecs, flat_offsets):
    """
    More efficient version of agg on a stacked structure

    Args:
        words (ndarray): entire vocabulary of words
        flat_wxs_assign (ndarray): maps a stacked index to word index
        flat_vecs (ndarray): stacked SIFT descriptors
        flat_offsets (ndarray): offset positions per annotation

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.smk.smk_funcs import *  # NOQA
        >>> data = testdata_rvecs(dim=2, nvecs=1000, nannots=10)
        >>> words = data['words']
        >>> flat_offsets = data['offset_list']
        >>> flat_wxs_assign, flat_vecs = ut.take(data, ['idx_to_wx', 'vecs'])
        >>> tup = compute_stacked_agg_rvecs(words, flat_wxs_assign, flat_vecs, flat_offsets)
        >>> all_agg_vecs, all_error_flags, agg_offset_list = tup
        >>> agg_rvecs_list = [all_agg_vecs[l:r] for l, r in ut.itertwo(agg_offset_list)]
        >>> agg_flags_list = [all_error_flags[l:r] for l, r in ut.itertwo(agg_offset_list)]
        >>> assert len(agg_flags_list) == len(flat_offsets) - 1

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.smk.smk_funcs import *  # NOQA
        >>> data = testdata_rvecs(dim=2, nvecs=100, nannots=5)
        >>> words = data['words']
        >>> flat_offsets = data['offset_list']
        >>> flat_wxs_assign, flat_vecs = ut.take(data, ['idx_to_wx', 'vecs'])
        >>> tup = compute_stacked_agg_rvecs(words, flat_wxs_assign, flat_vecs, flat_offsets)
        >>> all_agg_vecs, all_error_flags, agg_offset_list = tup
        >>> agg_rvecs_list = [all_agg_vecs[l:r] for l, r in ut.itertwo(agg_offset_list)]
        >>> agg_flags_list = [all_error_flags[l:r] for l, r in ut.itertwo(agg_offset_list)]
        >>> assert len(agg_flags_list) == len(flat_offsets) - 1
    """
    grouped_wxs = [
        flat_wxs_assign[left:right] for left, right in ut.itertwo(flat_offsets)
    ]

    # Assume single assignment, aggregate everything
    # across the entire database
    flat_offsets = np.array(flat_offsets)
    idx_to_dx = (
        np.searchsorted(flat_offsets, np.arange(len(flat_wxs_assign)),
                        side='right') - 1
    ).astype(np.int32)

    if isinstance(flat_wxs_assign, np.ma.masked_array):
        wx_list = flat_wxs_assign.T[0].compressed()
    else:
        wx_list = flat_wxs_assign.T[0].ravel()
    unique_wx, groupxs = vt.group_indices(wx_list)

    dim = flat_vecs.shape[1]
    if isinstance(flat_wxs_assign, np.ma.masked_array):
        dx_to_wxs = [np.unique(wxs.compressed()) for wxs in grouped_wxs]
    else:
        dx_to_wxs = [np.unique(wxs.ravel()) for wxs in grouped_wxs]
    dx_to_nagg = [len(wxs) for wxs in dx_to_wxs]
    num_agg_vecs = sum(dx_to_nagg)
    # all_agg_wxs = np.hstack(dx_to_wxs)
    agg_offset_list = np.array([0] + ut.cumsum(dx_to_nagg))
    # Preallocate agg residuals for all dxs
    all_agg_vecs = np.empty((num_agg_vecs, dim), dtype=np.float32)
    all_agg_vecs[:, :] = np.nan

    # precompute agg residual stack
    i_to_dxs = vt.apply_grouping(idx_to_dx, groupxs)
    subgroup = [vt.group_indices(dxs) for dxs in ut.ProgIter(i_to_dxs)]
    i_to_unique_dxs = ut.take_column(subgroup, 0)
    i_to_dx_groupxs = ut.take_column(subgroup, 1)
    num_words = len(unique_wx)

    # Overall this takes 5 minutes and 21 seconds
    # I think the other method takes about 12 minutes
    for i in ut.ProgIter(range(num_words), 'agg'):
        wx = unique_wx[i]
        xs = groupxs[i]
        dxs = i_to_unique_dxs[i]
        dx_groupxs = i_to_dx_groupxs[i]
        word = words[wx:wx + 1]

        offsets1 = agg_offset_list.take(dxs)
        offsets2 = [np.where(dx_to_wxs[dx] == wx)[0][0] for dx in dxs]
        offsets = np.add(offsets1, offsets2, out=offsets1)

        # if __debug__:
        #     assert np.bincount(dxs).max() < 2
        #     offset = agg_offset_list[dxs[0]]
        #     assert np.all(dx_to_wxs[dxs[0]] == all_agg_wxs[offset:offset +
        #                                                    dx_to_nagg[dxs[0]]])

        # Compute residuals
        rvecs = flat_vecs[xs] - word
        vt.normalize(rvecs, axis=1, out=rvecs)
        rvecs[np.all(np.isnan(rvecs), axis=1)] = 0
        # Aggregate across same images
        grouped_rvecs = vt.apply_grouping(rvecs, dx_groupxs, axis=0)
        agg_rvecs_ = [rvec_group.sum(axis=0) for rvec_group in grouped_rvecs]
        # agg_rvecs = np.vstack(agg_rvecs_)
        all_agg_vecs[offsets, :] = agg_rvecs_

    assert not np.any(np.isnan(all_agg_vecs))
    logger.info('Apply normalization')
    vt.normalize(all_agg_vecs, axis=1, out=all_agg_vecs)
    all_error_flags = np.all(np.isnan(all_agg_vecs), axis=1)
    all_agg_vecs[all_error_flags, :] = 0

    # ndocs_per_word1 = np.array(ut.lmap(len, wx_to_unique_dxs))
    # ndocs_total1 = len(flat_offsets) - 1
    # idf1 = smk_funcs.inv_doc_freq(ndocs_total1, ndocs_per_word1)

    tup = all_agg_vecs, all_error_flags, agg_offset_list
    return tup
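
# Illustrative sketch (not part of the original module; hypothetical helper name)
# of the per-word step in the loop above: residuals to one word, L2-normalized,
# summed per image, then re-normalized.  Pure numpy, toy shapes, no masking or
# grouping helpers.
def _demo_residual_aggregation():
    import numpy as np

    rng = np.random.RandomState(0)
    word = rng.rand(1, 8).astype(np.float32)    # one visual word
    vecs = rng.rand(5, 8).astype(np.float32)    # descriptors assigned to it
    image_ids = np.array([0, 0, 1, 1, 1])       # which image each vec came from

    rvecs = vecs - word                          # residuals to the word
    rvecs /= np.linalg.norm(rvecs, axis=1, keepdims=True)
    agg = np.stack([rvecs[image_ids == dx].sum(axis=0)
                    for dx in np.unique(image_ids)])
    agg /= np.linalg.norm(agg, axis=1, keepdims=True)  # aggregated residual per image
    return agg
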
def make_score_tabular(
        row_lbls, col_lbls, values, title=None, out_of=None, bold_best=False,
        flip=False, bigger_is_better=True, multicol_lbls=None, FORCE_INT=False,
        precision=None, SHORTEN_ROW_LBLS=False, col_align='l', col_sep='|',
        multicol_sep='|', centerline=True, astable=False, table_position='',
        AUTOFIX_LATEX=True, **kwargs):
    r"""
    makes a LaTeX tabular for displaying scores or errors

    Args:
        row_lbls (list of str):
        col_lbls (list of str):
        values (ndarray):
        title (str): (default = None)
        out_of (None): (default = None)
        bold_best (bool): (default = True)
        flip (bool): (default = False)
        table_position (str) : eg '[h]'

    Returns:
        str: tabular_str

    CommandLine:
        python -m utool.util_latex --test-make_score_tabular:0 --show
        python -m utool.util_latex --test-make_score_tabular:1 --show
        python -m utool.util_latex --test-make_score_tabular:2 --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from utool.util_latex import *  # NOQA
        >>> import utool as ut
        >>> row_lbls = ['config1', 'config2']
        >>> col_lbls = ['score \leq 1', 'metric2']
        >>> values = np.array([[1.2, 2], [3.2, 4]])
        >>> title = 'title'
        >>> out_of = 10
        >>> bold_best = True
        >>> flip = False
        >>> tabular_str = make_score_tabular(row_lbls, col_lbls, values, title, out_of, bold_best, flip)
        >>> result = tabular_str
        >>> print(result)
        >>> ut.quit_if_noshow()
        >>> render_latex_text(tabular_str)

    Example:
        >>> # DISABLE_DOCTEST
        >>> from utool.util_latex import *  # NOQA
        >>> import utool as ut
        >>> row_lbls = ['config1']
        >>> col_lbls = ['score \leq 1', 'metric2']
        >>> values = np.array([[1.2, 2]])
        >>> title = 'title'
        >>> out_of = 10
        >>> bold_best = True
        >>> flip = False
        >>> tabular_str = make_score_tabular(row_lbls, col_lbls, values, title, out_of, bold_best, flip)
        >>> result = tabular_str
        >>> print(result)
        >>> ut.quit_if_noshow()
        >>> render_latex_text(tabular_str)

    Example:
        >>> # DISABLE_DOCTEST
        >>> from utool.util_latex import *  # NOQA
        >>> import utool as ut
        >>> row_lbls = ['config1', 'config2']
        >>> col_lbls = ['score \leq 1', 'metric2', 'foobar']
        >>> multicol_lbls = [('spam', 1), ('eggs', 2)]
        >>> values = np.array([[1.2, 2, -3], [3.2, 4, -2]])
        >>> title = 'title'
        >>> out_of = 10
        >>> bold_best = True
        >>> flip = False
        >>> tabular_str = make_score_tabular(row_lbls, col_lbls, values, title, out_of, bold_best, flip, multicol_lbls=multicol_lbls)
        >>> result = tabular_str
        >>> print(result)
        >>> ut.quit_if_noshow()
        >>> render_latex_text(tabular_str)
    """
    import utool as ut
    if flip:
        bigger_is_better = not bigger_is_better
        flip_repltups = [
            ('<=', '>'), ('>', '<='),
            ('\\leq', '\\gt'), ('\\geq', '\\lt'),
            ('score', 'error')
        ]
        col_lbls = [replace_all(lbl, flip_repltups) for lbl in col_lbls]
        if title is not None:
            title = replace_all(title, flip_repltups)
        if out_of is not None:
            values = out_of - values

    # Abbreviate based on common substrings
    common_rowlbl = None
    if SHORTEN_ROW_LBLS:
        if isinstance(row_lbls, list):
            row_lbl_list = row_lbls
        else:
            row_lbl_list = row_lbls.flatten().tolist()
        # Split the rob labels into the alg components
        #algcomp_list = [lbl.split(')_') for lbl in row_lbl_list]
        longest = long_substr(row_lbl_list)
        common_strs = []
        while len(longest) > 10:
            common_strs += [longest]
            row_lbl_list = [row.replace(longest, '...') for row in row_lbl_list]
            longest = long_substr(row_lbl_list)
        common_rowlbl = ('...'.join(common_strs)).replace(')_', ')_\n')
        row_lbls = row_lbl_list
        if len(row_lbl_list) == 1:
            common_rowlbl = row_lbl_list[0]
            row_lbls = ['0']

    # Stack values into a tabular body
    # TODO: need ability to specify datatypes
    def ensurelist(row_values):
        try:
            return row_values.tolist()
        except AttributeError:
            return row_values

    if False:
        # Numpy formatting
        def padvec(shape=(1, 1)):
            pad = np.array([[' ' for c in range(shape[1])]
                            for r in range(shape[0])])
            return pad
        col_lbls = ensure_rowvec(col_lbls)
        row_lbls = ensure_colvec(row_lbls)
        _0 = np.vstack([padvec(), row_lbls])
        _1 = np.vstack([col_lbls, values])
        body = np.hstack([_0, _1])
        body = [[str_ for str_ in row] for row in body]
    else:
        assert len(row_lbls) == len(values)
        body = [[' '] + col_lbls]
        body += [[row_lbl] + ensurelist(row_values)
                 for row_lbl, row_values in zip(row_lbls, values)]

    #import utool as ut
    # Fix things in each body cell
    DO_PERCENT = True
    try:
        for r in range(len(body)):
            for c in range(len(body[0])):
                # In data land
                if r > 0 and c > 0:
                    if precision is not None:
                        # Hack
                        if ut.is_float(body[r][c]):
                            fmtstr = '%.' + str(precision) + 'f'
                            body[r][c] = fmtstr % (float(body[r][c]),)
                    # Force integer
                    if FORCE_INT:
                        body[r][c] = str(int(float(body[r][c])))
                body[r][c] = str(body[r][c])
                # Remove bad formatting;
                if AUTOFIX_LATEX:
                    body[r][c] = escape_latex(body[r][c])
    except Exception as ex:
        import utool as ut
        print('len(row_lbls) = %r' % (len(row_lbls),))
        print('len(col_lbls) = %r' % (len(col_lbls),))
        print('len(values) = %r' % (values,))
        print('ut.depth_profile(values) = %r' % (ut.depth_profile(values),))
        util_dbg.printex(ex, keys=['r', 'c'])
        raise

    # Bold the best values
    if bold_best:
        best_col_scores = values.max(0) if bigger_is_better else values.min(0)
        rows_to_bold = [np.where(values[:, colx] == best_col_scores[colx])[0]
                        for colx in range(len(values.T))]
        for colx, rowx_list in enumerate(rows_to_bold):
            for rowx in rowx_list:
                body[rowx + 1][colx + 1] = (
                    '\\txtbf{' + body[rowx + 1][colx + 1] + '}')

    # More fixing after the bold is in place
    for r in range(len(body)):
        for c in range(len(body[0])):
            # In data land
            if r > 0 and c > 0:
                if out_of is not None:
                    body[r][c] = body[r][c] + '/' + str(out_of)
                    if DO_PERCENT:
                        percent = ' = %.1f%%' % float(
                            100 * values[r - 1, c - 1] / out_of)
                        body[r][c] += escape_latex(percent)

    # Align columns for pretty printing
    body = np.array(body)
    ALIGN_BODY = True
    if ALIGN_BODY:
        new_body_cols = []
        for col in body.T:
            colstrs = list(map(str, ensurelist(col)))
            collens = list(map(len, colstrs))
            maxlen = max(collens)
            newcols = [str_ + (' ' * (maxlen - len(str_))) for str_ in colstrs]
            new_body_cols += [newcols]
        body = np.array(new_body_cols).T

    # Build Body (and row layout)
    HLINE_SEP = True
    rowvalsep = ''
    colvalsep = ' & '
    endl = '\\\\\n'
    hline = r'\hline'
    #extra_rowsep_pos_list = [1]  # rows to insert an extra hline after
    extra_rowsep_pos_list = []  # rows to insert an extra hline after
    if HLINE_SEP:
        rowvalsep = hline + '\n'
    # rowstr list holds blocks of rows
    rowstr_list = [colvalsep.join(row) + endl for row in body]
    #rowstr_list = [row[0] + rowlbl_sep + colvalsep.join(row[1:]) + endl for row in body]
    #rowstr_list = [(
    #    ('' if len(row) == 0 else row[0])
    #    if len(row) <= 1 else
    #    row[0] + rowlblcol_sep + colvalsep.join(row[1:]) + endl)
    #    for row in body]
    rowsep_list = [rowvalsep for row in rowstr_list[0:-1]]  # should be len 1 less than rowstr_list

    # Insert multicolumn names
    if multicol_lbls is not None:
        # TODO: label of the row labels multicol_sep
        multicols = [latex_multicolumn(multicol, size, 'c' + multicol_sep)
                     for multicol, size in multicol_lbls]
        multicol_str = (latex_multirow('', 2) + colvalsep
                        + colvalsep.join(multicols) + endl)
        ncols = sum([tup[1] for tup in multicol_lbls])
        mcol_sep = '\\cline{2-%d}\n' % (ncols + 1,)
        rowstr_list = [multicol_str] + rowstr_list
        rowsep_list = [mcol_sep] + rowsep_list
        #extra_rowsep_pos_list += [1]

    # Insert title
    if title is not None and not astable:
        tex_title = latex_multicolumn(title, len(body[0])) + endl
        rowstr_list = [tex_title] + rowstr_list
        rowsep_list = [rowvalsep] + rowsep_list
        #extra_rowsep_pos_list += [2]

    # Apply an extra hline (for label)
    #extra_rowsep_pos_list = []
    for pos in sorted(extra_rowsep_pos_list)[::-1]:
        rowstr_list.insert(pos, '')
        rowsep_list.insert(pos, rowvalsep)
    #tabular_body = rowvalsep.join(rowstr_list)
    from six.moves import zip_longest
    tabular_body = ''.join([row if sep is None else row + sep
                            for row, sep in zip_longest(rowstr_list, rowsep_list)])

    # Build Column Layout
    col_align_list = [col_align] * len(body[0])
    #extra_collayoutsep_pos_list = [1]
    extra_collayoutsep_pos_list = []
    for pos in sorted(extra_collayoutsep_pos_list)[::-1]:
        col_align_list.insert(pos, '')
    #col_layaout_sep_list = rowlblcol_sep  # TODO

    rowlblcol_sep = '|'
    # Build build internal seprations between column alignments
    # Defaults to just the normal col_sep
    col_align_sep_list = [col_sep] * (len(col_align_list) - 1)
    # Adjust for the separations between row labels and the actual row data
    if len(col_align_sep_list) > 0:
        col_align_sep_list[0] = rowlblcol_sep
    # Continue multicolumn sepratation
    if multicol_lbls is not None:
        multicol_offsets = ut.cumsum(ut.get_list_column(multicol_lbls, 1))
        for offset in multicol_offsets:
            if offset < len(col_align_sep_list):
                col_align_sep_list[offset] = multicol_sep

    from six.moves import zip_longest
    _tmp = [ut.filter_Nones(tup)
            for tup in zip_longest(col_align_list, col_align_sep_list)]
    col_layout = ''.join(ut.flatten(_tmp))

    #if len(col_align_list) > 1:
    #    col_layout = col_align_list[0] + rowlblcol_sep + col_sep.join(col_align_list[1:])
    #else:
    #    col_layout = col_sep.join(col_align_list)

    tabular_head = (r'\begin{tabular}{|%s|}' % col_layout) + '\n'
    tabular_tail = r'\end{tabular}'

    if centerline:
        tabular_head = r'\centerline{' + '\n' + tabular_head
        tabular_tail = tabular_tail + '}'

    if astable:
        #tabular_head = r'\begin{centering}' + '\n' + tabular_head
        tabular_head = r'\centering' + '\n' + tabular_head
        tabular_head = r'\begin{table}' + table_position + '\n' + tabular_head

        lblstr = latex_sanitize_command_name(kwargs.get('label', title))
        caption = title
        if AUTOFIX_LATEX:
            caption = escape_latex(caption)
        caption = '\n% ---\n' + caption + '\n% ---\n'
        #tabular_head = r'\end{centering}' + '\n' + tabular_head
        tabular_tail = (tabular_tail
                        + '\n\caption[%s]{%s}\n\label{tbl:%s}\n\end{table}' % (
                            lblstr, caption, lblstr))

    tabular_str = rowvalsep.join([tabular_head, tabular_body, tabular_tail])
    topsep = '\\hline\n' if True else '\\toprule\n'
    botsep = '\\hline\n' if True else '\\bottomrule\n'
    tabular_str = tabular_head + topsep + tabular_body + botsep + tabular_tail

    if common_rowlbl is not None:
        #tabular_str += escape_latex('\n\nThe following parameters were held fixed:\n' + common_rowlbl)
        pass
    return tabular_str