def inplace_filter_results(self, filter_pat):
    import utool as ut
    self.filter_pats.append(filter_pat)
    # Get zipflags
    flags_list = self.pattern_filterflags(filter_pat)
    # Check to see if there are any survivors
    flags = ut.lmap(any, flags_list)
    # Within each file, keep only the lines that matched
    found_lines_list = ut.zipcompress(self.found_lines_list, flags_list)
    found_lxs_list = ut.zipcompress(self.found_lxs_list, flags_list)
    # Drop files with no surviving lines
    found_fpath_list = ut.compress(self.found_fpath_list, flags)
    found_lines_list = ut.compress(found_lines_list, flags)
    found_lxs_list = ut.compress(found_lxs_list, flags)
    # In place modification
    self.found_fpath_list = found_fpath_list
    self.found_lines_list = found_lines_list
    self.found_lxs_list = found_lxs_list
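
# Usage sketch (not from this file): `inplace_filter_results` is a method of a
# grep/search results container holding the parallel lists `found_fpath_list`,
# `found_lines_list`, and `found_lxs_list`; the surrounding class and the
# `results` variable below are assumptions.
#
#   >>> # Keep only files whose matched lines also match 'TODO', and within
#   >>> # each surviving file drop the lines that do not match.
#   >>> results.inplace_filter_results('TODO')
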
def get_photobomber_map(ibs, aids, aid_to_nid=None):
    """
    Builds a map of which names photobomb other names.

    CommandLine:
        python -m wbia.gui.id_review_api --test-test_review_widget --show --db PZ_MTEST -a default:qindex=0

    Example:
        >>> import wbia
        >>> dbdir = ut.truepath('~/lev/media/danger/GGR/GGR-IBEIS')
        >>> ibs = wbia.opendb(dbdir=dbdir)
        >>> filter_kw = {
        >>>     'multiple': False,
        >>>     'minqual': 'good',
        >>>     'is_known': True,
        >>>     'min_pername': 2,
        >>>     'view': ['right'],
        >>> }
        >>> aids = ibs.filter_annots_general(ibs.get_valid_aids(), filter_kw=filter_kw)
    """
    ams_list = ibs.get_annotmatch_rowids_from_aid(aids)
    flags_list = ibs.unflat_map(
        ut.partial(ibs.get_annotmatch_prop, 'Photobomb'), ams_list)
    pb_ams = ut.zipcompress(ams_list, flags_list)
    has_pb_ams = [len(ams) > 0 for ams in pb_ams]
    pb_ams_ = ut.compress(pb_ams, has_pb_ams)
    # aids_ = ut.compress(aids, has_pb_ams)
    pb_ams_flat = ut.flatten(pb_ams_)
    pb_aids1_ = ibs.get_annotmatch_aid1(pb_ams_flat)
    pb_aids2_ = ibs.get_annotmatch_aid2(pb_ams_flat)
    pb_aid_pairs_ = list(zip(pb_aids1_, pb_aids2_))
    if aid_to_nid is None:
        pb_nid_pairs_ = ibs.unflat_map(ibs.get_annot_nids, pb_aid_pairs_)
    else:
        pb_nid_pairs_ = ibs.unflat_map(ut.partial(ut.take, aid_to_nid), pb_aid_pairs_)
    # invalid_aid_map = ut.ddict(set)
    # for aid1, aid2 in pb_aid_pairs_:
    #     if aid1 != aid2:
    #         invalid_aid_map[aid1].add(aid2)
    #         invalid_aid_map[aid2].add(aid1)
    invalid_nid_map = ut.ddict(set)
    for nid1, nid2 in pb_nid_pairs_:
        if nid1 != nid2:
            invalid_nid_map[nid1].add(nid2)
            invalid_nid_map[nid2].add(nid1)
    return invalid_nid_map
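
# Usage sketch (the name ids below are hypothetical): the returned
# `invalid_nid_map` is a defaultdict(set) mapping each name id (nid) to the set
# of nids it was photobombed with, so it can be used to veto candidate matches.
#
#   >>> invalid_nid_map = get_photobomber_map(ibs, aids)
#   >>> nid1, nid2 = 7, 12  # hypothetical nids
#   >>> is_suspect_pair = nid2 in invalid_nid_map.get(nid1, set())
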
def purge_ensure_one_annot_per_images(ibs):
    """
    pip install Pipe
    """
    # Purge all but one annotation
    images = ibs.images()
    # images.aids
    groups = images._annot_groups
    import numpy as np

    # Mark the largest annotation in each image ...
    large_masks = [
        ut.index_to_boolmask([np.argmax(x)], len(x)) for x in groups.bbox_area
    ]
    # ... and invert to select every other annotation
    small_masks = ut.lmap(ut.not_list, large_masks)
    # Remove all but the largest annotation
    small_aids = ut.zipcompress(groups.aid, small_masks)
    small_aids = ut.flatten(small_aids)

    # Fix any empty images
    images = ibs.images()
    empty_images = ut.where(np.array(images.num_annotations) == 0)
    logger.info('empty_images = %r' % (empty_images, ))

    # list(map(basename, map(dirname, images.uris_original)))

    def VecPipe(func):
        import pipe
        @pipe.Pipe
        def wrapped(sequence):
            return map(func, sequence)
            # return (None if item is None else func(item) for item in sequence)
        return wrapped

    name_list = list(images.uris_original | VecPipe(dirname) | VecPipe(basename))
    aids_list = images.aids
    ut.assert_all_eq(list(aids_list | VecPipe(len)))
    annots = ibs.annots(ut.flatten(aids_list))
    annots.names = name_list
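
# Sketch of the largest-annotation mask logic above, on toy data (the bbox
# areas are made up; only numpy and utool are assumed):
#
#   >>> import numpy as np
#   >>> import utool as ut
#   >>> bbox_area = [3.0, 9.0, 1.5]  # one image with three annotations
#   >>> large_mask = ut.index_to_boolmask([np.argmax(bbox_area)], len(bbox_area))
#   >>> small_mask = ut.not_list(large_mask)  # [True, False, True]
#   >>> # aids selected by small_mask are everything except the largest annot
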
def fix_duplicates(drive):
    r"""
    For every duplicate file passing a filter (e.g. avi), remove the copy that
    is in the smallest directory; on a tie, use the smallest dpath. This
    filters all duplicate files in a folder into a single folder, but we still
    need to look at the non-duplicates in that folder and decide whether they
    should be moved as well. So this should only trigger on folders that are at
    least 50% duplicates. We might not want to move curated folders.

    Example:
        cd ~/local/scripts
        >>> from register_files import *  # NOQA
        >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])  # 'D:/', 'E:/', 'F:/'
        >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
        >>> E = drive = drives[0]
        >>> #D, E, F = drives
    """
    print('Fixing Duplicates in %r' % (drive, ))
    list_ = drive.fpath_hashX_list
    multiindex_dict_ = build_multindex(list_)
    duplicate_hashes = [
        key for key, val in six.iteritems(multiindex_dict_)
        if len(val) > 1
    ]
    duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    # Check if any dups have been removed
    still_exists = ut.unflat_map(exists, unflat_fpaths)
    unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
    duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
    # Look at duplicate files
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list, duplicate_idxs)
    # Find highly coupled directories
    if True:
        coupled_dirs = []
        for fpaths in unflat_fpaths:
            #basedir = ut.longest_existing_path(commonprefix(fpaths))
            dirs = sorted(list(map(dirname, fpaths)))
            _list = list(range(len(dirs)))
            idxs = ut.upper_diag_self_prodx(_list)
            coupled_dirs.extend(
                list(map(tuple, ut.list_unflat_take(dirs, idxs))))
        hist_ = ut.dict_hist(coupled_dirs)
        coupled_idxs = ut.list_argsort(hist_.values())[::-1]
        most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
        print('Coupled fpaths: ' + ut.repr2(most_coupled, nl=True))
    print('%d unique files are duplicated' % (len(unflat_sizes), ))
    #print('Duplicate sizes: ' + ut.repr2(unflat_sizes[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0::5], nl=True))
    print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths, nl=True))

    # Find duplicate directories
    dpath_list = list(drive.dpath_to_fidx.keys())
    fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
    #exists_list = list(map(exists, drive.fpath_list))
    #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
    fname_registry = [basename(fpath) for fpath in drive.fpath_list]
    unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)
    def unsorted_list_hash(list_):
        return ut.hashstr27(str(sorted(list_)))
    unflat_fname_sets = list(
        map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
    fname_based_duplicate_dpaths = []
    multiindex_dict2_ = build_multindex(unflat_fname_sets)
    fname_based_duplicate_hashes = [
        key for key, val in multiindex_dict2_.items() if len(val) > 1
    ]
    print('#fname_based_duplicate_dpaths = %r' % (
        len(fname_based_duplicate_hashes), ))
    fname_based_duplicate_didxs = ut.dict_take(
        multiindex_dict2_, fname_based_duplicate_hashes)
    fname_based_duplicate_dpaths = ut.list_unflat_take(
        dpath_list, fname_based_duplicate_didxs)
    print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
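
# `build_multindex` is defined elsewhere in register_files. Based on how it is
# consumed above (ut.dict_take of keys whose values have len > 1, followed by
# ut.list_unflat_take on fpath_list), it maps each value in its input list to
# the list of indices where that value occurs, e.g. (toy input):
#
#   >>> build_multindex(['a', 'b', 'a'])
#   {'a': [0, 2], 'b': [1]}
#
# Keys whose index lists have more than one entry are the duplicated hashes.
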