def find_duplicates(index):
    # fpaths = list(index.files.keys())
    files = list(index.files.values())
    print('Grouping {} files'.format(len(files)))
    grouped = ut.group_items(files, [f.nbytes for f in files])
    print('Found {} groups'.format(len(grouped)))
    potential_dups = {k: v for k, v in grouped.items() if len(v) > 1}
    print('Found {} potential dups by nbytes'.format(len(potential_dups)))

    GB = 2 ** 30  # NOQA
    MB = 2 ** 20  # NOQA
    max_bytes = 10 * MB
    min_bytes = 64 * MB
    duplicates = []
    for k, fs in ut.ProgIter(potential_dups.items(), freq=1):
        names = [f.n for f in fs]
        if ut.allsame(names):
            # Don't do big files yet
            if k < max_bytes and k > min_bytes:
                if ut.allsame([f.hashid for f in fs]):
                    duplicates.extend(fs)
                    for f1, f2 in ut.combinations(fs, 2):
                        f1.duplicates.add(f2)
                        f2.duplicates.add(f1)

    def dpath_similarity(index, dpath1, dpath2):
        d1 = index[dpath1]
        d2 = index[dpath2]
        set1 = {f.hashid for f in ut.ProgIter(d1.files)}
        set2 = {f.hashid for f in ut.ProgIter(d2.files)}
        # n_isect = len(set1.intersection(set2))
        size1, size2 = map(len, (set1, set2))
        # minsize = min(size1, size2)
        # sim_measures = (n_isect, n_isect / minsize)
        return ut.set_overlaps(set1, set2)
        # return sim_measures

    similarities = {}
    r_to_dup = ut.group_items(duplicates, [p.r for p in duplicates])
    for dpath, dups in r_to_dup.items():
        # Check to see if the duplicates all point to the same dir
        f = dups[0]  # NOQA
        common_dpath = set.intersection(
            *[{_.r for _ in f.duplicates} for f in dups])
        for other in common_dpath:
            sim_measures = dpath_similarity(index, dpath, other)
            similarities[(dpath, other)] = sim_measures
    print(ut.repr4(similarities, si=True, nl=2))

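# The snippet above depends on utool (`ut.group_items`, `ut.allsame`) and a
# custom `index` object, so it is not runnable on its own.  Below is a minimal
# standard-library sketch of the same idea (group candidate files by size,
# then confirm duplicates by content hash).  The function name and the `root`
# argument are hypothetical and not part of the original code.
import hashlib
import os
from collections import defaultdict


def sketch_find_duplicates(root):
    """Return lists of paths whose contents are byte-for-byte identical."""
    by_size = defaultdict(list)
    for dirpath, _, fnames in os.walk(root):
        for fname in fnames:
            fpath = os.path.join(dirpath, fname)
            by_size[os.path.getsize(fpath)].append(fpath)

    duplicates = []
    for size, fpaths in by_size.items():
        if len(fpaths) < 2:
            continue
        # Only hash files whose size collides with another file
        by_hash = defaultdict(list)
        for fpath in fpaths:
            with open(fpath, 'rb') as file_:
                digest = hashlib.sha1(file_.read()).hexdigest()
            by_hash[digest].append(fpath)
        duplicates.extend(group for group in by_hash.values()
                          if len(group) > 1)
    return duplicates
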
def group_daids_for_indexing_by_name(ibs, daid_list, num_indexers=8,
                                     verbose=True):
    """
    returns groups with only one annotation per name in each group
    """
    tup = ibs.group_annots_by_known_names(daid_list)
    aidgroup_list, invalid_aids = tup
    largest_groupsize = max(map(len, aidgroup_list))
    num_bins = min(largest_groupsize, num_indexers)
    if verbose or ut.VERYVERBOSE:
        print('[mindex] num_indexers = %d ' % (num_indexers,))
        print('[mindex] largest_groupsize = %d ' % (largest_groupsize,))
        print('[mindex] num_bins = %d ' % (num_bins,))
    # Group annotations for indexing according to the split criteria
    aids_list, overflow_aids = ut.sample_zip(
        aidgroup_list, num_bins, allow_overflow=True, per_bin=1)
    if __debug__:
        # All groups have the same name
        nidgroup_list = ibs.unflat_map(ibs.get_annot_name_rowids,
                                       aidgroup_list)
        for nidgroup in nidgroup_list:
            assert ut.allsame(nidgroup), 'bad name grouping'
    if __debug__:
        # All subsequent indexers are subsets (in name/identity space)
        # of the previous
        nids_list = ibs.unflat_map(ibs.get_annot_name_rowids, aids_list)
        prev_ = None
        for nids in nids_list:
            if prev_ is None:
                prev_ = set(nids)
            else:
                assert prev_.issuperset(nids), 'bad indexer grouping'
    return aids_list, overflow_aids, num_bins

def __init__(self, clf_list, voting='soft', weights=None):
    self.clf_list = clf_list
    self.voting = voting
    # NOTE: as written, the weights argument is accepted but ignored
    self.weights = None
    classes_list = [clf.classes_ for clf in clf_list]
    if ut.allsame(classes_list):
        self.classes_ = classes_list[0]
        self.class_idx_mappers = None
    else:
        # Need to make a mapper from individual clf classes to ensemble
        self.class_idx_mappers = []
        classes_ = sorted(set.union(*map(set, classes_list)))
        for clf in clf_list:
            # For each index of the clf classes, find that index in the
            # ensemble classes. Eg. class y=4 might be at cx=1 and ex=0
            mapper = np.empty(len(clf.classes_), dtype=np.int)
            for cx, y in enumerate(clf.classes_):
                ex = classes_.index(y)
                mapper[cx] = ex
            self.class_idx_mappers.append(mapper)
        self.classes_ = np.array(classes_)
    # Accessing classes_ here raises if any member classifier is unfitted
    for clf in clf_list:
        clf.classes_
    pass

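# A sketch (not part of the original class) of how the `class_idx_mappers`
# built above would typically be consumed in soft voting: each member
# classifier's predict_proba columns are scattered into the ensemble's class
# order before averaging.  The method name `sketch_predict_proba` and the
# attribute layout are assumed to match the constructor above.
import numpy as np


def sketch_predict_proba(self, X):
    n_classes = len(self.classes_)
    probs = np.zeros((len(X), n_classes))
    for count, clf in enumerate(self.clf_list):
        clf_probs = clf.predict_proba(X)
        if self.class_idx_mappers is None:
            probs += clf_probs
        else:
            # Scatter this classifier's columns into ensemble column order
            mapper = self.class_idx_mappers[count]
            probs[:, mapper] += clf_probs
    return probs / len(self.clf_list)
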
def get_varied_acfg_labels(acfg_list, mainkey='_cfgname', checkname=False):
    """
    >>> from ibeis.expt.annotation_configs import *  # NOQA
    """
    # print(ut.list_str(varied_acfg_list, nl=2))
    for acfg in acfg_list:
        assert acfg['qcfg'][mainkey] == acfg['dcfg'][mainkey], (
            'should be the same for now')
    cfgname_list = [acfg['qcfg'][mainkey] for acfg in acfg_list]
    if checkname and ut.allsame(cfgname_list):
        cfgname_list = [None] * len(cfgname_list)

    # Hack to make common params between q and d appear the same
    _acfg_list = [compress_aidcfg(acfg) for acfg in acfg_list]
    flat_acfg_list = flatten_acfg_list(_acfg_list)
    nonvaried_dict, varied_acfg_list = ut.partition_varied_cfg_list(
        flat_acfg_list)

    SUPER_HACK = True
    if SUPER_HACK:
        # SUPER HACK, recompress remake the varied list after knowing what
        # is varied
        _varied_keys = list(set(ut.flatten([
            list(ut.flatten([
                list(x.keys())
                for x in unflatten_acfgdict(cfg).values()
            ]))
            for cfg in varied_acfg_list
        ])))
        _acfg_list = [
            compress_aidcfg(acfg, force_noncommon=_varied_keys)
            for acfg in acfg_list]
        flat_acfg_list = flatten_acfg_list(_acfg_list)
        nonvaried_dict, varied_acfg_list = ut.partition_varied_cfg_list(
            flat_acfg_list)

    shortened_cfg_list = [
        # {shorten_to_alias_labels(key): val for key, val in _dict.items()}
        ut.map_dict_keys(shorten_to_alias_labels, _dict)
        for _dict in varied_acfg_list]
    nonlbl_keys = ut.INTERNAL_CFGKEYS
    nonlbl_keys = [prefix + key
                   for key in nonlbl_keys
                   for prefix in ['', 'q', 'd']]

    # hack for sorting by q/d stuff first
    def get_key_order(cfg):
        keys = [k for k in cfg.keys() if k not in nonlbl_keys]
        sortorder = [2 * k.startswith('q') + 1 * k.startswith('d')
                     for k in keys]
        return ut.sortedby(keys, sortorder)[::-1]

    cfglbl_list = [
        ut.get_cfg_lbl(cfg, name, nonlbl_keys, key_order=get_key_order(cfg))
        for cfg, name in zip(shortened_cfg_list, cfgname_list)]
    if checkname:
        cfglbl_list = [x.lstrip(':') for x in cfglbl_list]
    return cfglbl_list

def convert_cv2_images_to_theano_images(img_list):
    r"""
    Converts b01c to bc01

    Converts a list of cv2-style images into a single numpy array of nonflat
    theano-style images.

    h=height, w=width, b=batchid, c=channel

    Args:
        img_list (list of ndarrays): a list of numpy arrays with shape
            [h, w, c]

    Returns:
        data: in the shape [b, (c x h x w)]

    CommandLine:
        python -m ibeis_cnn.utils --test-convert_cv2_images_to_theano_images

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis_cnn.utils import *  # NOQA
        >>> import vtool as vt
        >>> # build test data
        >>> # execute function
        >>> img_list, width, height, channels = testdata_imglist()
        >>> data = convert_cv2_images_to_theano_images(img_list)
        >>> data[0].reshape(3, 32, 32)[:, 0:2, 0:2]
        >>> subset = (data[0].reshape(3, 32, 32)[:, 0:2, 0:2])
        >>> #result = str(np.transpose(subset, (1, 2, 0)))
        >>> result = str(subset).replace('\n', '')
        >>> print(result)
        [[[  0   3]  [ 96  99]] [[  1   4]  [ 97 100]] [[  2   5]  [ 98 101]]]
    """
    # [img.shape for img in img_list]
    # format to [b, c, h, w]
    if len(img_list.shape) == 3:
        # ensure 4 dimensions
        img_list = img_list.reshape(img_list.shape + (1,))
    shape_list = [img.shape for img in img_list]
    assert ut.allsame(shape_list)
    theano_style_imgs = [np.transpose(img, (2, 0, 1))[None, :]
                         for img in img_list]
    data = np.vstack(theano_style_imgs)
    # data = np.vstack([img[None, :] for img in img_list])
    return data

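# A quick, self-contained check of the b01c -> bc01 layout change performed
# above, using plain numpy (no ibeis_cnn test helpers assumed; the toy shape
# is made up for illustration):
import numpy as np

imgs_b01c = np.arange(2 * 4 * 4 * 3).reshape(2, 4, 4, 3)   # [b, h, w, c]
imgs_bc01 = np.transpose(imgs_b01c, (0, 3, 1, 2))          # [b, c, h, w]
assert imgs_bc01.shape == (2, 3, 4, 4)
assert imgs_bc01[0, 1, 2, 3] == imgs_b01c[0, 2, 3, 1]
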
def measure_metrics(infr):
    real_pos_edges = []

    n_true_merges = infr.test_state['n_true_merges']
    confusion = infr.test_state['confusion']

    n_tp = confusion[POSTV][POSTV]
    confusion[POSTV]
    columns = set(confusion.keys())
    reviewd_cols = columns - {UNREV}
    non_postv = reviewd_cols - {POSTV}
    non_negtv = reviewd_cols - {NEGTV}

    n_fn = sum(ut.take(confusion[POSTV], non_postv))
    n_fp = sum(ut.take(confusion[NEGTV], non_negtv))

    n_error_edges = sum(confusion[r][c] + confusion[c][r]
                        for r, c in ut.combinations(reviewd_cols, 2))
    # assert n_fn + n_fp == n_error_edges

    pred_n_pcc_mst_edges = n_true_merges

    if 0:
        import ubelt as ub
        for timer in ub.Timerit(10):
            with timer:
                # Find undetectable errors
                num_undetectable_fn = 0
                for nid1, nid2 in infr.neg_redun_metagraph.edges():
                    cc1 = infr.pos_graph.component(nid1)
                    cc2 = infr.pos_graph.component(nid2)
                    neg_edges = nxu.edges_cross(infr.neg_graph, cc1, cc2)
                    for u, v in neg_edges:
                        real_nid1 = infr.node_truth[u]
                        real_nid2 = infr.node_truth[v]
                        if real_nid1 == real_nid2:
                            num_undetectable_fn += 1
                            break

                # Find undetectable errors
                num_undetectable_fp = 0
                for nid in infr.pos_redun_nids:
                    cc = infr.pos_graph.component(nid)
                    if not ut.allsame(ut.take(infr.node_truth, cc)):
                        num_undetectable_fp += 1

        print('num_undetectable_fn = %r' % (num_undetectable_fn,))
        print('num_undetectable_fp = %r' % (num_undetectable_fp,))

    if 0:
        n_error_edges2 = 0
        n_fn2 = 0
        n_fp2 = 0
        for edge, data in infr.edges(data=True):
            decision = data.get('evidence_decision', UNREV)
            true_state = infr.edge_truth[edge]
            if true_state == decision and true_state == POSTV:
                real_pos_edges.append(edge)
            elif decision != UNREV:
                if true_state != decision:
                    n_error_edges2 += 1
                    if true_state == POSTV:
                        n_fn2 += 1
                    elif true_state == NEGTV:
                        n_fp2 += 1
        assert n_error_edges2 == n_error_edges
        assert n_tp == len(real_pos_edges)
        assert n_fn == n_fn2
        assert n_fp == n_fp2

    # pred_n_pcc_mst_edges2 = sum(
    #     len(cc) - 1 for cc in infr.test_gt_pos_graph.connected_components()
    # )
    if False:
        import networkx as nx
        # set(infr.test_gt_pos_graph.edges()) == set(real_pos_edges)
        pred_n_pcc_mst_edges = 0
        for cc in nx.connected_components(nx.Graph(real_pos_edges)):
            pred_n_pcc_mst_edges += len(cc) - 1
        assert n_true_merges == pred_n_pcc_mst_edges

    # Find all annotations involved in a mistake
    assert n_error_edges == len(infr.mistake_edges)
    direct_mistake_aids = {a for edge in infr.mistake_edges for a in edge}
    mistake_nids = set(infr.node_labels(*direct_mistake_aids))
    mistake_aids = set(ut.flatten([infr.pos_graph.component(nid)
                                   for nid in mistake_nids]))

    pos_acc = pred_n_pcc_mst_edges / infr.real_n_pcc_mst_edges
    metrics = {
        'n_decision': infr.test_state['n_decision'],
        'n_manual': infr.test_state['n_manual'],
        'n_algo': infr.test_state['n_algo'],
        'phase': infr.loop_phase,
        'pos_acc': pos_acc,
        'n_merge_total': infr.real_n_pcc_mst_edges,
        'n_merge_remain': infr.real_n_pcc_mst_edges - n_true_merges,
        'n_true_merges': n_true_merges,
        'recovering': infr.is_recovering(),
        # 'recovering2': infr.test_state['recovering'],
        'merge_remain': 1 - pos_acc,
        'n_mistake_aids': len(mistake_aids),
        'frac_mistake_aids': len(mistake_aids) / len(infr.aids),
        'n_mistake_nids': len(mistake_nids),
        'n_errors': n_error_edges,
        'n_fn': n_fn,
        'n_fp': n_fp,
        'refresh_support': len(infr.refresh.manual_decisions),
        'pprob_any': infr.refresh.prob_any_remain(),
        'mu': infr.refresh._ewma,
        'test_action': infr.test_state['test_action'],
        'action': infr.test_state.get('action', None),
        'user_id': infr.test_state['user_id'],
        'pred_decision': infr.test_state['pred_decision'],
        'true_decision': infr.test_state['true_decision'],
        'n_neg_redun': infr.neg_redun_metagraph.number_of_edges(),
        'n_neg_redun1': (infr.neg_metagraph.number_of_edges() -
                         infr.neg_metagraph.number_of_selfloops()),
    }
    return metrics

def exec_interactive_incremental_queries(ibs, qaid_list, back=None):
    assert ut.allsame(ibs.get_annot_species_rowids(qaid_list)), (
        'must be all on same species')
    self = IncQueryHarness()
    self = self.begin_incremental_query(ibs, qaid_list, back=back)

def expand(sample, denc_per_name=[1], extra_dbsize_fracs=[0]):
    # Vary the number of database encounters in each sample
    target_daids_list = []
    target_info_list_ = []
    for num in denc_per_name:
        dname_encs_ = ut.take_column(sample.dname_encs, slice(0, num))
        dnames_ = ut.lmap(ut.flatten, dname_encs_)
        daids_ = ut.total_flatten(dname_encs_)
        target_daids_list.append(daids_)
        name_lens = ut.lmap(len, dnames_)
        dpername = (name_lens[0] if ut.allsame(name_lens)
                    else np.mean(name_lens))
        target_info_list_.append(ut.odict([
            ('qsize', len(sample.qaids)),
            ('t_n_names', len(dname_encs_)),
            ('t_dpername', dpername),
            ('t_denc_pername', num),
            ('t_dsize', len(daids_)),
        ]))

    # Append confusors to maintain a constant dbsize in each base sample
    dbsize_list = ut.lmap(len, target_daids_list)
    max_dsize = max(dbsize_list)
    n_need = max_dsize - min(dbsize_list)
    n_extra_avail = len(sample.confusor_pool) - n_need
    assert len(sample.confusor_pool) > n_need, 'not enough confusors'
    padded_daids_list = []
    padded_info_list_ = []
    for daids_, info_ in zip(target_daids_list, target_info_list_):
        num_take = max_dsize - len(daids_)
        pad_aids = sample.confusor_pool[:num_take]
        new_aids = daids_ + pad_aids
        info_ = info_.copy()
        info_['n_pad'] = len(pad_aids)
        info_['pad_dsize'] = len(new_aids)
        padded_info_list_.append(info_)
        padded_daids_list.append(new_aids)

    # Vary the dbsize by appending extra confusors
    if extra_dbsize_fracs is None:
        extra_dbsize_fracs = [1.0]
    extra_fracs = np.array(extra_dbsize_fracs)
    n_extra_list = np.unique(extra_fracs * n_extra_avail).astype(np.int)
    daids_list = []
    info_list = []
    for n in n_extra_list:
        for daids_, info_ in zip(padded_daids_list, padded_info_list_):
            extra_aids = sample.confusor_pool[len(sample.confusor_pool) - n:]
            daids = sorted(daids_ + extra_aids)
            daids_list.append(daids)
            info = info_.copy()
            info['n_extra'] = len(extra_aids)
            info['dsize'] = len(daids)
            info_list.append(info)

    import pandas as pd
    verbose = 0
    if verbose:
        logger.info(pd.DataFrame.from_records(info_list))
        logger.info('#qaids = %r' % (len(sample.qaids),))
        logger.info('num_need = %r' % (n_need,))
        logger.info('max_dsize = %r' % (max_dsize,))
    return sample.qaids, daids_list, info_list

def compute_residual_assignments(depc, fid_list, vocab_id_list, config):
    r"""
    CommandLine:
        python -m ibeis.control.IBEISControl show_depc_annot_table_input \
                --show --tablename=residuals

    Ignore:
        ibs.depc['vocab'].print_table()

    Ignore:
        data = ibs.depc.get('inverted_agg_assign', ([1, 2473], qreq_.daids),
                            config=qreq_.config)
        wxs1 = data[0][0]
        wxs2 = data[1][0]

        # Lev Example
        import ibeis
        ibs = ibeis.opendb('Oxford')
        depc = ibs.depc
        table = depc['inverted_agg_assign']
        table.print_table()
        table.print_internal_info()

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.smk.inverted_index import *  # NOQA
        >>> # Test depcache access
        >>> import ibeis
        >>> ibs, aid_list = ibeis.testdata_aids('testdb1')
        >>> depc = ibs.depc_annot
        >>> config = {'num_words': 1000, 'nAssign': 1}
        >>> #input_tuple = (aid_list, [aid_list] * len(aid_list))
        >>> daids = aid_list
        >>> input_tuple = (daids, [daids])
        >>> rowid_kw = {}
        >>> tablename = 'inverted_agg_assign'
        >>> target_tablename = tablename
        >>> input_ids = depc.get_parent_rowids(tablename, input_tuple, config)
        >>> fid_list = ut.take_column(input_ids, 0)
        >>> vocab_id_list = ut.take_column(input_ids, 1)
        >>> data = depc.get(tablename, input_tuple, config)
        >>> tup = data[1]

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.smk.inverted_index import *  # NOQA
        >>> import ibeis
        >>> qreq_ = ibeis.testdata_qreq_(
        >>>     defaultdb='Oxford', a='oxford',
        >>>     p='default:proot=smk,nAssign=1,num_words=64000')
        >>> config = {'num_words': 64000, 'nAssign': 1, 'int_rvec': True}
        >>> depc = qreq_.ibs.depc
        >>> daids = qreq_.daids
        >>> input_tuple = (daids, [daids])
        >>> rowid_kw = {}
        >>> tablename = 'inverted_agg_assign'
        >>> target_tablename = tablename
        >>> input_ids = depc.get_parent_rowids(tablename, input_tuple, config)
        >>> fid_list = ut.take_column(input_ids, 0)
        >>> vocab_id_list = ut.take_column(input_ids, 1)
    """
    # print('[IBEIS] ASSIGN RESIDUALS:')
    assert ut.allsame(vocab_id_list)
    vocabid = vocab_id_list[0]

    # NEED HACK TO NOT LOAD INDEXER EVERY TIME
    this_table = depc['inverted_agg_assign']
    vocab_table = depc['vocab']
    if (this_table._hack_chunk_cache is not None and
            vocabid in this_table._hack_chunk_cache):
        vocab = this_table._hack_chunk_cache[vocabid]
    else:
        vocab = vocab_table.get_row_data([vocabid], 'words')[0]
        if this_table._hack_chunk_cache is not None:
            this_table._hack_chunk_cache[vocabid] = vocab

    print('Grab Vecs')
    vecs_list = depc.get_native('feat', fid_list, 'vecs')
    nAssign = config['nAssign']
    int_rvec = config['int_rvec']

    from concurrent import futures
    print('Building residual args')
    worker = residual_worker
    args_gen = gen_residual_args(vocab, vecs_list, nAssign, int_rvec)
    args_gen = [args for args in ut.ProgIter(args_gen, length=len(vecs_list),
                                             lbl='building args')]
    # nprocs = ut.num_unused_cpus(thresh=10) - 1
    nprocs = ut.num_cpus()
    print('Creating %d processes' % (nprocs,))
    executor = futures.ProcessPoolExecutor(nprocs)
    try:
        print('Submitting workers')
        fs_chunk = [executor.submit(worker, args)
                    for args in ut.ProgIter(args_gen, lbl='submit proc')]
        for fs in ut.ProgIter(fs_chunk, lbl='getting phi result'):
            tup = fs.result()
            yield tup
    except Exception:
        raise
    finally:
        executor.shutdown(wait=True)

def are_nodes_connected(self, u, v):
    return ut.allsame(self.node_labels(u, v))

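# `are_nodes_connected` reduces to "do u and v carry the same name label?".
# Below is a dependency-free sketch of the ut.allsame check used throughout
# these snippets; its behavior for empty input is an assumption here, not a
# guarantee about the utool implementation.
def allsame_sketch(items):
    items = list(items)
    return all(item == items[0] for item in items[1:]) if items else True

assert allsame_sketch([3, 3, 3])
assert not allsame_sketch(['a', 'b'])
assert allsame_sketch([])
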
def understanding_pseudomax_props(mode=2):
    """
    Function showing some properties of distances between normalized
    pseudomax vectors

    CommandLine:
        python -m vtool.distance --test-understanding_pseudomax_props

    Example:
        >>> # ENABLE_DOCTEST
        >>> from vtool.distance import *  # NOQA
        >>> for mode in [0, 1, 2, 3]:
        ...     print('+---')
        ...     print('mode = %r' % (mode,))
        ...     result = understanding_pseudomax_props(mode)
        ...     print('L___')
        >>> print(result)
    """
    import vtool as vt
    pseudo_max = 512
    rng = np.random.RandomState(0)
    num = 10
    if mode == 0:
        dim = 2
        p1_01 = (vt.normalize_rows(rng.rand(num, dim)))
        p2_01 = (vt.normalize_rows(rng.rand(num, dim)))
    elif mode == 1:
        p1_01 = vt.dummy.testdata_dummy_sift(num, rng) / pseudo_max
        p2_01 = vt.dummy.testdata_dummy_sift(num, rng) / pseudo_max
    elif mode == 2:
        # Build theoretically maximally distant normalized vectors (type 1)
        dim = 128
        p1_01 = np.zeros((1, dim))
        p2_01 = np.zeros((1, dim))
        p2_01[:, 0::2] = 1
        p1_01[:, 1::2] = 1
        p1_01 = vt.normalize_rows(p1_01)
        p2_01 = vt.normalize_rows(p2_01)
    elif mode == 3:
        # Build theoretically maximally distant vectors (type 2)
        # This mode will clip if cast to uint8, thus failing the test
        dim = 128
        p1_01 = np.zeros((1, dim))
        p2_01 = np.zeros((1, dim))
        p2_01[:, 0] = 1
        p1_01[:, 1:] = 1
        p1_01 = vt.normalize_rows(p1_01)
        p2_01 = vt.normalize_rows(p2_01)
        pass
    print('ndims = %r' % (p1_01.shape[1]))

    p1_01 = p1_01.astype(TEMP_VEC_DTYPE)
    p2_01 = p2_01.astype(TEMP_VEC_DTYPE)

    p1_256 = p1_01 * pseudo_max
    p2_256 = p2_01 * pseudo_max

    dist_sqrd_01 = vt.L2_sqrd(p1_01, p2_01)
    dist_sqrd_256 = vt.L2_sqrd(p1_256, p2_256)

    dist_01 = np.sqrt(dist_sqrd_01)
    dist_256 = np.sqrt(dist_sqrd_256)

    print('dist_sqrd_01  = %s' % (ut.numpy_str(dist_sqrd_01, precision=2),))
    print('dist_sqrd_256 = %s' % (ut.numpy_str(dist_sqrd_256, precision=2),))
    print('dist_01       = %s' % (ut.numpy_str(dist_01, precision=2),))
    print('dist_256      = %s' % (ut.numpy_str(dist_256, precision=2),))
    print('--')
    print('sqrt(2) = %f' % (np.sqrt(2)))
    print('--')

    assert np.all(dist_01 == vt.L2(p1_01, p2_01))
    assert np.all(dist_256 == vt.L2(p1_256, p2_256))

    const_sqrd = dist_sqrd_256 / dist_sqrd_01
    const = dist_256 / dist_01

    print('const = %r' % (const[0],))
    print('const_sqrd = %r' % (const_sqrd[0],))
    print('1 / const = %r' % (1 / const[0],))
    print('1 / const_sqrd = %r' % (1 / const_sqrd[0],))

    assert ut.allsame(const)
    assert ut.allsame(const_sqrd)
    assert np.all(const == np.sqrt(const_sqrd))

    # Assert that distance conversions work
    assert np.all(dist_256 / const == dist_01)
    assert np.all(dist_sqrd_256 / const_sqrd == dist_sqrd_01)
    print('Conversions work')

    print('Maximal L2 distance between any two NON-NEGATIVE L2-NORMALIZED'
          ' vectors should always be sqrt(2)')

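# Numeric sanity check of the claim printed above: for non-negative unit
# vectors u, v we have ||u - v||^2 = 2 - 2 * u.v <= 2, with equality when the
# supports are disjoint (u.v == 0).  Plain numpy, independent of vtool:
import numpy as np

u = np.zeros(128)
v = np.zeros(128)
u[0::2] = 1
v[1::2] = 1
u /= np.linalg.norm(u)
v /= np.linalg.norm(v)
assert np.isclose(np.linalg.norm(u - v), np.sqrt(2))
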
def new_cpd(self, parents=None, pmf_func=None):
    """
    Makes a new random variable that is an instance of this template

    parents : only used to define the name of this node.
    """
    if pmf_func is None:
        pmf_func = self.pmf_func

    # --- MAKE VARIABLE ID
    def _getid(obj):
        if isinstance(obj, int):
            return str(obj)
        elif isinstance(obj, six.string_types):
            return obj
        else:
            return obj._template_id

    if not ut.isiterable(parents):
        parents = [parents]

    template_ids = [_getid(cpd) for cpd in parents]
    HACK_SAME_IDS = True
    # TODO: keep track of parent index inheritance
    # then rectify uniqueness based on that
    if HACK_SAME_IDS and ut.allsame(template_ids):
        _id = template_ids[0]
    else:
        _id = ''.join(template_ids)
    variable = ''.join([self.varpref, _id])
    # variable = '_'.join([self.varpref, '{' + _id + '}'])
    # variable = '$%s$' % (variable,)

    evidence_cpds = [cpd for cpd in parents if hasattr(cpd, 'ttype')]
    if len(evidence_cpds) == 0:
        evidence_cpds = None

    variable_card = len(self.basis)
    statename_dict = {
        variable: self.basis,
    }
    if self.evidence_ttypes is not None:
        if any(cpd.ttype != tcpd.ttype
               for cpd, tcpd in zip(evidence_cpds, evidence_cpds)):
            raise ValueError('Evidence is not of appropriate type')
        evidence_bases = [cpd.variable_statenames for cpd in evidence_cpds]
        evidence_card = list(map(len, evidence_bases))
        evidence_states = list(ut.iprod(*evidence_bases))

        for cpd in evidence_cpds:
            _dict = ut.dict_subset(cpd.statename_dict, [cpd.variable])
            statename_dict.update(_dict)

        evidence = [cpd.variable for cpd in evidence_cpds]
    else:
        if evidence_cpds is not None:
            raise ValueError('Gave evidence for evidence-less template')
        evidence = None
        evidence_card = None

    # --- MAKE TABLE VALUES
    if pmf_func is not None:
        if isinstance(pmf_func, list):
            values = np.array(pmf_func)
        else:
            values = np.array([
                [pmf_func(vstate, *estates) for estates in evidence_states]
                for vstate in self.basis
            ])
        ensure_normalized = True
        if ensure_normalized:
            values = values / values.sum(axis=0)
    else:
        # assume uniform
        fill_value = 1.0 / variable_card
        if evidence_card is None:
            values = np.full((1, variable_card), fill_value)
        else:
            values = np.full([variable_card] + list(evidence_card),
                             fill_value)

    try:
        cpd = pgmpy.factors.TabularCPD(
            variable=variable,
            variable_card=variable_card,
            values=values,
            evidence=evidence,
            evidence_card=evidence_card,
            # statename_dict=statename_dict,
            state_names=statename_dict,
        )
    except Exception as ex:
        ut.printex(ex, 'Failed to create TabularCPD', keys=[
            'variable',
            'variable_card',
            'statename_dict',
            'evidence_card',
            'evidence',
            'values.shape',
        ])
        ut.embed()
        raise

    cpd.ttype = self.ttype
    cpd._template_ = self
    cpd._template_id = _id
    return cpd

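# The pmf branch above builds a table whose columns (one per combination of
# evidence states) are normalized to sum to 1.  A tiny numpy illustration of
# that normalization step, independent of pgmpy; the 2x3 numbers are made up:
import numpy as np

values = np.array([[1.0, 2.0, 3.0],
                   [3.0, 2.0, 1.0]])          # rows: variable states
values = values / values.sum(axis=0)          # normalize each column
assert np.allclose(values.sum(axis=0), 1.0)
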
def demo2():
    """
    CommandLine:
        python -m wbia.algo.graph.demo demo2 --viz
        python -m wbia.algo.graph.demo demo2

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.graph.demo import *  # NOQA
        >>> result = demo2()
        >>> print(result)
    """
    import wbia.plottool as pt

    from wbia.scripts.thesis import TMP_RC
    import matplotlib as mpl

    mpl.rcParams.update(TMP_RC)

    # ---- Synthetic data params
    params = {
        'redun.pos': 2,
        'redun.neg': 2,
    }
    # oracle_accuracy = .98
    # oracle_accuracy = .90
    # oracle_accuracy = (.8, 1.0)
    oracle_accuracy = (0.85, 1.0)
    # oracle_accuracy = 1.0

    # --- draw params
    VISUALIZE = ut.get_argflag('--viz')
    # QUIT_OR_EMEBED = 'embed'
    QUIT_OR_EMEBED = 'quit'
    TARGET_REVIEW = ut.get_argval('--target', type_=int, default=None)
    START = ut.get_argval('--start', type_=int, default=None)
    END = ut.get_argval('--end', type_=int, default=None)

    # ------------------

    # rng = np.random.RandomState(42)
    # infr = demodata_infr(num_pccs=4, size=3, size_std=1, p_incon=0)
    # infr = demodata_infr(num_pccs=6, size=7, size_std=1, p_incon=0)
    # infr = demodata_infr(num_pccs=3, size=5, size_std=.2, p_incon=0)
    infr = demodata_infr(pcc_sizes=[5, 2, 4])
    infr.verbose = 100
    # apply_dummy_viewpoints(infr)
    # infr.ensure_cliques()
    infr.ensure_cliques()
    infr.ensure_full()
    # infr.apply_edge_truth()
    # Dummy scoring
    infr.init_simulation(oracle_accuracy=oracle_accuracy, name='demo2')
    # infr_gt = infr.copy()

    dpath = ut.ensuredir(ut.truepath('~/Desktop/demo'))
    ut.remove_files_in_dir(dpath)

    fig_counter = it.count(0)

    def show_graph(infr, title, final=False, selected_edges=None):
        if not VISUALIZE:
            return
        # TODO: rich colored text?
        latest = '\n'.join(infr.latest_logs())
        showkw = dict(
            # fontsize=infr.graph.graph['fontsize'],
            # fontname=infr.graph.graph['fontname'],
            show_unreviewed_edges=True,
            show_inferred_same=False,
            show_inferred_diff=False,
            outof=(len(infr.aids)),
            # show_inferred_same=True,
            # show_inferred_diff=True,
            selected_edges=selected_edges,
            show_labels=True,
            simple_labels=True,
            # show_recent_review=not final,
            show_recent_review=False,
            # splines=infr.graph.graph['splines'],
            reposition=False,
            # with_colorbar=True
        )
        verbose = infr.verbose
        infr.verbose = 0
        infr_ = infr.copy()
        infr_ = infr
        infr_.verbose = verbose
        infr_.show(pickable=True, verbose=0, **showkw)
        infr.verbose = verbose
        # logger.info('status ' + ut.repr4(infr_.status()))
        # infr.show(**showkw)
        ax = pt.gca()
        pt.set_title(title, fontsize=20)
        fig = pt.gcf()
        fontsize = 22
        if True:
            # postprocess xlabel
            lines = []
            for line in latest.split('\n'):
                if False and line.startswith('ORACLE ERROR'):
                    lines += ['ORACLE ERROR']
                else:
                    lines += [line]
            latest = '\n'.join(lines)
            if len(lines) > 10:
                fontsize = 16
            if len(lines) > 12:
                fontsize = 14
            if len(lines) > 14:
                fontsize = 12
            if len(lines) > 18:
                fontsize = 10
            if len(lines) > 23:
                fontsize = 8

        if True:
            pt.adjust_subplots(top=0.95, left=0, right=1, bottom=0.45,
                               fig=fig)
            ax.set_xlabel('\n' + latest)
            xlabel = ax.get_xaxis().get_label()
            xlabel.set_horizontalalignment('left')
            # xlabel.set_x(.025)
            xlabel.set_x(-0.6)
            # xlabel.set_fontname('CMU Typewriter Text')
            xlabel.set_fontname('Inconsolata')
            xlabel.set_fontsize(fontsize)
        ax.set_aspect('equal')

        # ax.xaxis.label.set_color('red')

        from os.path import join

        fpath = join(dpath, 'demo_{:04d}.png'.format(next(fig_counter)))
        fig.savefig(
            fpath,
            dpi=300,
            # transparent=True,
            edgecolor='none',
        )

        # pt.save_figure(dpath=dpath, dpi=300)
        infr.latest_logs()

    if VISUALIZE:
        infr.update_visual_attrs(groupby='name_label')
        infr.set_node_attrs('pin', 'true')
        node_dict = ut.nx_node_dict(infr.graph)
        logger.info(ut.repr4(node_dict[1]))

    if VISUALIZE:
        infr.latest_logs()
        # Pin Nodes into the target groundtruth position
        show_graph(infr, 'target-gt')

    logger.info(ut.repr4(infr.status()))
    infr.clear_feedback()
    infr.clear_name_labels()
    infr.clear_edges()
    logger.info(ut.repr4(infr.status()))
    infr.latest_logs()

    if VISUALIZE:
        infr.update_visual_attrs()

    infr.prioritize('prob_match')

    if VISUALIZE or TARGET_REVIEW is None or TARGET_REVIEW == 0:
        show_graph(infr, 'initial state')

    def on_new_candidate_edges(infr, edges):
        # hack: updating visual attrs as a callback
        infr.update_visual_attrs()

    infr.on_new_candidate_edges = on_new_candidate_edges

    infr.params.update(**params)
    infr.refresh_candidate_edges()

    VIZ_ALL = VISUALIZE and TARGET_REVIEW is None and START is None
    logger.info('VIZ_ALL = %r' % (VIZ_ALL,))

    if VIZ_ALL or TARGET_REVIEW == 0:
        show_graph(infr, 'find-candidates')

    # _iter2 = enumerate(infr.generate_reviews(**params))
    # _iter2 = list(_iter2)
    # assert len(_iter2) > 0

    # prog = ut.ProgIter(_iter2, label='demo2', bs=False, adjust=False,
    #                    enabled=False)
    count = 1
    first = 1
    for edge, priority in infr._generate_reviews(data=True):
        msg = 'review #%d, priority=%.3f' % (count, priority)
        logger.info('\n----------')
        infr.print('pop edge {} with priority={:.3f}'.format(edge, priority))
        # logger.info('remaining_reviews = %r' % (infr.remaining_reviews()),)
        # Make the next review

        if START is not None:
            VIZ_ALL = count >= START

        if END is not None and count >= END:
            break

        infr.print(msg)
        if ut.allsame(infr.pos_graph.node_labels(*edge)) and first:
            # Have oracle make a mistake early
            feedback = infr.request_oracle_review(edge, accuracy=0)
            first -= 1
        else:
            feedback = infr.request_oracle_review(edge)

        AT_TARGET = TARGET_REVIEW is not None and count >= TARGET_REVIEW - 1

        SHOW_CANDIATE_POP = True
        if SHOW_CANDIATE_POP and (VIZ_ALL or AT_TARGET):
            # import utool
            # utool.embed()
            infr.print(
                ut.repr2(infr.task_probs['match_state'][edge],
                         precision=4, si=True))
            infr.print('len(queue) = %r' % (len(infr.queue)))
            # Show edge selection
            infr.print('Oracle will predict: ' + feedback['evidence_decision'])
            show_graph(infr, 'pre' + msg, selected_edges=[edge])

        if count == TARGET_REVIEW:
            infr.EMBEDME = QUIT_OR_EMEBED == 'embed'
        infr.add_feedback(edge, **feedback)
        infr.print('len(queue) = %r' % (len(infr.queue)))
        # infr.apply_nondynamic_update()
        # Show the result
        if VIZ_ALL or AT_TARGET:
            show_graph(infr, msg)
            # import sys
            # sys.exit(1)
        if count == TARGET_REVIEW:
            break
        count += 1

    infr.print('status = ' + ut.repr4(infr.status(extended=False)))
    show_graph(infr, 'post-review (#reviews={})'.format(count), final=True)

    # ROUND 2 FIGHT
    # if TARGET_REVIEW is None and round2_params is not None:
    #     # HACK TO GET NEW THINGS IN QUEUE
    #     infr.params = round2_params

    #     _iter2 = enumerate(infr.generate_reviews(**params))
    #     prog = ut.ProgIter(_iter2, label='round2', bs=False, adjust=False,
    #                        enabled=False)
    #     for count, (aid1, aid2) in prog:
    #         msg = 'reviewII #%d' % (count)
    #         logger.info('\n----------')
    #         logger.info(msg)
    #         logger.info('remaining_reviews = %r' % (infr.remaining_reviews()),)
    #         # Make the next review evidence_decision
    #         feedback = infr.request_oracle_review(edge)
    #         if count == TARGET_REVIEW:
    #             infr.EMBEDME = QUIT_OR_EMEBED == 'embed'
    #         infr.add_feedback(edge, **feedback)
    #         # Show the result
    #         if PRESHOW or TARGET_REVIEW is None or count >= TARGET_REVIEW - 1:
    #             show_graph(infr, msg)
    #         if count == TARGET_REVIEW:
    #             break

    #     show_graph(infr, 'post-re-review', final=True)

    if not getattr(infr, 'EMBEDME', False):
        if ut.get_computer_name().lower() in ['hyrule', 'ooo']:
            pt.all_figures_tile(monitor_num=0, percent_w=0.5)
        else:
            pt.all_figures_tile()
        ut.show_if_requested()

def convert_hsdb_to_ibeis(hsdir, dbdir=None, **kwargs):
    r"""
    Args:
        hsdir (str): Directory to folder *containing* _hsdb
        dbdir (str): Output directory (defaults to same as hsdb)

    CommandLine:
        python -m ibeis convert_hsdb_to_ibeis --dbdir ~/work/Frogs
        python -m ibeis convert_hsdb_to_ibeis --hsdir "/raid/raw/RotanTurtles/Roatan HotSpotter Nov_21_2016"

    Ignore:
        from ibeis.dbio.ingest_hsdb import *  # NOQA
        hsdir = "/raid/raw/RotanTurtles/Roatan HotSpotter Nov_21_2016"
        dbdir = "~/work/RotanTurtles"

    Example:
        >>> # SCRIPT
        >>> from ibeis.dbio.ingest_hsdb import *  # NOQA
        >>> dbdir = ut.get_argval('--dbdir', type_=str, default=None)
        >>> hsdir = ut.get_argval('--hsdir', type_=str, default=dbdir)
        >>> result = convert_hsdb_to_ibeis(hsdir)
        >>> print(result)
    """
    from ibeis.control import IBEISControl
    import utool as ut

    if dbdir is None:
        dbdir = hsdir
    print('[ingest] Ingesting hsdb: %r -> %r' % (hsdir, dbdir))

    assert is_hsdb(hsdir), (
        'not a hotspotter database. cannot even force convert: hsdir=%r' % (
            hsdir,))
    assert not is_succesful_convert(dbdir), (
        'hsdir=%r is already converted' % (hsdir,))
    # print('FORCE DELETE: %r' % (hsdir,))
    # ibsfuncs.delete_ibeis_database(hsdir)
    imgdir = join(hsdir, 'images')

    internal_dir = get_hsinternal(hsdir)
    nametbl_fpath = join(internal_dir, 'name_table.csv')
    imgtbl_fpath = join(internal_dir, 'image_table.csv')
    chiptbl_fpath = join(internal_dir, 'chip_table.csv')

    # READ NAME TABLE
    name_text_list = ['____']
    name_hs_nid_list = [0]
    with open(nametbl_fpath, 'r') as nametbl_file:
        name_reader = csv.reader(nametbl_file)
        for ix, row in enumerate(name_reader):
            # if ix >= 3:
            if len(row) == 0 or row[0].strip().startswith('#'):
                continue
            else:
                hs_nid = int(row[0])
                name = row[1].strip()
                name_text_list.append(name)
                name_hs_nid_list.append(hs_nid)

    # READ IMAGE TABLE
    image_hs_gid_list = []
    image_gname_list = []
    image_reviewed_list = []
    with open(imgtbl_fpath, 'r') as imgtb_file:
        image_reader = csv.reader(imgtb_file)
        for ix, row in enumerate(image_reader):
            if len(row) == 0 or row[0].strip().startswith('#'):
                continue
            else:
                hs_gid = int(row[0])
                gname_ = row[1].strip()
                # aif in hotspotter is equivalent to reviewed in IBEIS
                reviewed = bool(row[2])
                image_hs_gid_list.append(hs_gid)
                image_gname_list.append(gname_)
                image_reviewed_list.append(reviewed)

    image_gpath_list = [join(imgdir, gname) for gname in image_gname_list]

    ut.debug_duplicate_items(image_gpath_list)
    # print(image_gpath_list)
    image_exist_flags = list(map(exists, image_gpath_list))
    missing_images = []
    for image_gpath, flag in zip(image_gpath_list, image_exist_flags):
        if not flag:
            missing_images.append(image_gpath)
            print('Image does not exist: %s' % image_gpath)

    if not all(image_exist_flags):
        print('Only %d / %d images exist' % (sum(image_exist_flags),
                                             len(image_exist_flags)))

    SEARCH_FOR_IMAGES = False
    if SEARCH_FOR_IMAGES:
        # Hack to try and find the missing images
        from os.path import basename
        subfiles = ut.glob(hsdir, '*', recursive=True, fullpath=True,
                           with_files=True)
        basename_to_existing = ut.group_items(subfiles,
                                              ut.lmap(basename, subfiles))

        can_copy_list = []
        for gpath in missing_images:
            gname = basename(gpath)
            if gname not in basename_to_existing:
                print('gname = %r' % (gname,))
                pass
            else:
                existing = basename_to_existing[gname]
                can_choose = True
                if len(existing) > 1:
                    if not ut.allsame(ut.lmap(ut.get_file_uuid, existing)):
                        can_choose = False
                if can_choose:
                    found = existing[0]
                    can_copy_list.append((found, gpath))
                else:
                    print(existing)

        src, dst = ut.listT(can_copy_list)
        ut.copy_list(src, dst)

    # READ CHIP TABLE
    chip_bbox_list = []
    chip_theta_list = []
    chip_hs_nid_list = []
    chip_hs_gid_list = []
    chip_note_list = []
    with open(chiptbl_fpath, 'r') as chiptbl_file:
        chip_reader = csv.reader(chiptbl_file)
        for ix, row in enumerate(chip_reader):
            if len(row) == 0 or row[0].strip().startswith('#'):
                continue
            else:
                hs_gid = int(row[1])
                hs_nid = int(row[2])
                bbox_text = row[3]
                theta = float(row[4])
                notes = '<COMMA>'.join([item.strip() for item in row[5:]])

                bbox_text = bbox_text.replace('[', '').replace(']', '').strip()
                bbox_text = re.sub('  *', ' ', bbox_text)
                bbox_strlist = bbox_text.split(' ')
                bbox = tuple(map(int, bbox_strlist))
                # bbox = [int(item) for item in bbox_strlist]
                chip_hs_nid_list.append(hs_nid)
                chip_hs_gid_list.append(hs_gid)
                chip_bbox_list.append(bbox)
                chip_theta_list.append(theta)
                chip_note_list.append(notes)

    names = ut.ColumnLists({
        'hs_nid': name_hs_nid_list,
        'text': name_text_list,
    })

    images = ut.ColumnLists({
        'hs_gid': image_hs_gid_list,
        'gpath': image_gpath_list,
        'reviewed': image_reviewed_list,
        'exists': image_exist_flags,
    })

    chips = ut.ColumnLists({
        'hs_gid': chip_hs_gid_list,
        'hs_nid': chip_hs_nid_list,
        'bbox': chip_bbox_list,
        'theta': chip_theta_list,
        'note': chip_note_list,
    })

    IGNORE_MISSING_IMAGES = True
    if IGNORE_MISSING_IMAGES:
        # Ignore missing information
        print('pre')
        print('chips = %r' % (chips,))
        print('images = %r' % (images,))
        print('names = %r' % (names,))
        missing_gxs = ut.where(ut.not_list(images['exists']))
        missing_gids = ut.take(images['hs_gid'], missing_gxs)
        gid_to_cxs = ut.dzip(*chips.group_indicies('hs_gid'))
        missing_cxs = ut.flatten(ut.take(gid_to_cxs, missing_gids))
        # Remove missing images and dependent chips
        images = images.remove(missing_gxs)
        chips = chips.remove(missing_cxs)

        valid_nids = set(chips['hs_nid'] + [0])
        isvalid = [nid in valid_nids for nid in names['hs_nid']]
        names = names.compress(isvalid)
        print('post')
        print('chips = %r' % (chips,))
        print('images = %r' % (images,))
        print('names = %r' % (names,))

    assert all(images['exists']), 'some images dont exist'

    # if gid is None:
    #     print('Not adding the ix=%r-th Chip. Its image is corrupted image.'
    #           % (ix,))
    #     # continue

    # # Build mappings to new indexes
    # names_nid_to_nid = {names_nid: nid
    #                     for (names_nid, nid) in zip(hs_nid_list, nid_list)}
    # names_nid_to_nid[1] = names_nid_to_nid[0]  # hsdb unknown is 0 or 1
    # images_gid_to_gid = {images_gid: gid
    #                      for (images_gid, gid) in zip(hs_gid_list, gid_list)}

    ibs = IBEISControl.request_IBEISController(dbdir=dbdir, check_hsdb=False,
                                               **kwargs)
    assert len(ibs.get_valid_gids()) == 0, 'target database is not empty'

    # Add names, images, and annotations
    names['ibs_nid'] = ibs.add_names(names['text'])
    images['ibs_gid'] = ibs.add_images(
        images['gpath'])  # any failed gids will be None

    if True:
        # Remove corrupted images
        print('pre')
        print('chips = %r' % (chips,))
        print('images = %r' % (images,))
        print('names = %r' % (names,))
        missing_gxs = ut.where(ut.flag_None_items(images['ibs_gid']))
        missing_gids = ut.take(images['hs_gid'], missing_gxs)
        gid_to_cxs = ut.dzip(*chips.group_indicies('hs_gid'))
        missing_cxs = ut.flatten(ut.take(gid_to_cxs, missing_gids))
        # Remove missing images and dependent chips
        chips = chips.remove(missing_cxs)
        images = images.remove(missing_gxs)
        print('post')
        print('chips = %r' % (chips,))
        print('images = %r' % (images,))
        print('names = %r' % (names,))

    # Index chips using new ibs rowids
    ibs_gid_lookup = ut.dzip(images['hs_gid'], images['ibs_gid'])
    ibs_nid_lookup = ut.dzip(names['hs_nid'], names['ibs_nid'])
    try:
        chips['ibs_gid'] = ut.take(ibs_gid_lookup, chips['hs_gid'])
    except KeyError:
        chips['ibs_gid'] = [ibs_gid_lookup.get(index, None)
                            for index in chips['hs_gid']]
    try:
        chips['ibs_nid'] = ut.take(ibs_nid_lookup, chips['hs_nid'])
    except KeyError:
        chips['ibs_nid'] = [ibs_nid_lookup.get(index, None)
                            for index in chips['hs_nid']]

    ibs.add_annots(chips['ibs_gid'], bbox_list=chips['bbox'],
                   theta_list=chips['theta'], nid_list=chips['ibs_nid'],
                   notes_list=chips['note'])

    # aid_list = ibs.get_valid_aids()
    # flag_list = [True] * len(aid_list)
    # ibs.set_annot_exemplar_flags(aid_list, flag_list)
    # assert(all(ibs.get_annot_exemplar_flags(aid_list))), (
    #     'exemplars not set correctly')

    # Write file flagging successful conversion
    with open(join(ibs.get_ibsdir(), SUCCESS_FLAG_FNAME), 'w') as file_:
        file_.write('Successfully converted hsdir=%r' % (hsdir,))
    print('finished ingest')
    return ibs
