Example #1
 def inplace_filter_results(self, filter_pat):
     import utool as ut
     self.filter_pats.append(filter_pat)
     # Get zipflags
     flags_list = self.pattern_filterflags(filter_pat)
     # Check to see if there are any survivors
     flags = ut.lmap(any, flags_list)
     # Keep only the matching lines within each file
     found_lines_list = ut.zipcompress(self.found_lines_list, flags_list)
     found_lxs_list = ut.zipcompress(self.found_lxs_list, flags_list)
     # Drop files with no surviving lines
     found_fpath_list = ut.compress(self.found_fpath_list, flags)
     found_lines_list = ut.compress(found_lines_list, flags)
     found_lxs_list = ut.compress(found_lxs_list, flags)
     # In place modification
     self.found_fpath_list = found_fpath_list
     self.found_lines_list = found_lines_list
     self.found_lxs_list = found_lxs_list
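Every example on this page leans on utool's compress helpers. As a minimal sketch of the pattern (not utool's actual implementation): ut.compress is assumed to behave like itertools.compress returning a list, ut.zipcompress to apply that filtering pairwise across nested lists, and ut.lmap to be list(map(...)):

    import itertools

    def compress(items, flags):
        # keep the items whose corresponding flag is truthy
        return list(itertools.compress(items, flags))

    def zipcompress(items_list, flags_list):
        # filter each inner list with its own flag list
        return [compress(items, flags)
                for items, flags in zip(items_list, flags_list)]

    found_lines_list = [['foo', 'bar'], ['baz']]
    flags_list = [[True, False], [False]]
    print(zipcompress(found_lines_list, flags_list))                # [['foo'], []]
    print(compress(found_lines_list, list(map(any, flags_list))))   # [['foo', 'bar']]

With that reading, the method above first prunes the non-matching lines inside each file and then drops the files that no longer contain any matching lines.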
Example #2
def get_photobomber_map(ibs, aids, aid_to_nid=None):
    """
    Builds a map of which names photobomb other names.

    python -m wbia.gui.id_review_api --test-test_review_widget --show --db PZ_MTEST -a default:qindex=0

    >>> import utool as ut
    >>> import wbia
    >>> dbdir = ut.truepath('~/lev/media/danger/GGR/GGR-IBEIS')
    >>> ibs = wbia.opendb(dbdir=dbdir)
    >>> filter_kw = {
    >>>     'multiple': False,
    >>>     'minqual': 'good',
    >>>     'is_known': True,
    >>>     'min_pername': 2,
    >>>     'view': ['right'],
    >>> }
    >>> aids = ibs.filter_annots_general(ibs.get_valid_aids(), filter_kw=filter_kw)
    """
    ams_list = ibs.get_annotmatch_rowids_from_aid(aids)
    flags_list = ibs.unflat_map(
        ut.partial(ibs.get_annotmatch_prop, 'Photobomb'), ams_list)
    pb_ams = ut.zipcompress(ams_list, flags_list)
    has_pb_ams = [len(ams) > 0 for ams in pb_ams]
    pb_ams_ = ut.compress(pb_ams, has_pb_ams)
    # aids_ = ut.compress(aids, has_pb_ams)
    pb_ams_flat = ut.flatten(pb_ams_)

    pb_aids1_ = ibs.get_annotmatch_aid1(pb_ams_flat)
    pb_aids2_ = ibs.get_annotmatch_aid2(pb_ams_flat)

    pb_aid_pairs_ = list(zip(pb_aids1_, pb_aids2_))
    if aid_to_nid is None:
        pb_nid_pairs_ = ibs.unflat_map(ibs.get_annot_nids, pb_aid_pairs_)
    else:
        pb_nid_pairs_ = ibs.unflat_map(ut.partial(ut.take, aid_to_nid),
                                       pb_aid_pairs_)

    # invalid_aid_map = ut.ddict(set)
    # for aid1, aid2 in pb_aid_pairs_:
    #    if aid1 != aid2:
    #        invalid_aid_map[aid1].add(aid2)
    #        invalid_aid_map[aid2].add(aid1)

    invalid_nid_map = ut.ddict(set)
    for nid1, nid2 in pb_nid_pairs_:
        if nid1 != nid2:
            invalid_nid_map[nid1].add(nid2)
            invalid_nid_map[nid2].add(nid1)

    return invalid_nid_map
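The final loop simply builds a symmetric adjacency map over name ids, using ut.ddict (assumed here to be collections.defaultdict). A standalone sketch with made-up nid pairs:

    from collections import defaultdict

    # hypothetical (nid1, nid2) pairs flagged as photobombs
    pb_nid_pairs_ = [(1, 2), (2, 1), (3, 3), (1, 4)]

    invalid_nid_map = defaultdict(set)
    for nid1, nid2 in pb_nid_pairs_:
        if nid1 != nid2:  # ignore self-pairs
            invalid_nid_map[nid1].add(nid2)
            invalid_nid_map[nid2].add(nid1)

    print(dict(invalid_nid_map))  # {1: {2, 4}, 2: {1}, 4: {1}}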
Example #3
def purge_ensure_one_annot_per_images(ibs):
    """
    Requires the Pipe package (pip install Pipe).
    """
    # Purge all but one annotation
    images = ibs.images()
    # images.aids
    groups = images._annot_groups
    import numpy as np

    # Mask everything except the largest annotation in each image
    large_masks = [
        ut.index_to_boolmask([np.argmax(x)], len(x)) for x in groups.bbox_area
    ]
    small_masks = ut.lmap(ut.not_list, large_masks)
    # Collect the aids of all but the largest annotation (candidates for removal)
    small_aids = ut.zipcompress(groups.aid, small_masks)
    small_aids = ut.flatten(small_aids)

    # Fix any empty images
    images = ibs.images()
    empty_images = ut.where(np.array(images.num_annotations) == 0)
    logger.info('empty_images = %r' % (empty_images, ))

    # list(map(basename, map(dirname, images.uris_original)))

    def VecPipe(func):
        import pipe

        @pipe.Pipe
        def wrapped(sequence):
            return map(func, sequence)
            # return (None if item is None else func(item) for item in sequence)

        return wrapped

    name_list = list(images.uris_original | VecPipe(dirname)
                     | VecPipe(basename))
    aids_list = images.aids
    ut.assert_all_eq(list(aids_list | VecPipe(len)))
    annots = ibs.annots(ut.flatten(aids_list))
    annots.names = name_list
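The masking step above assumes ut.index_to_boolmask builds a boolean array that is True at the given indices and ut.not_list negates it element-wise. A hedged numpy-only sketch of the "keep everything but the largest annotation per image" logic, with invented areas and aids:

    import numpy as np

    def keep_largest_mask(areas):
        # True only at the index of the largest area (assumes a non-empty group)
        mask = np.zeros(len(areas), dtype=bool)
        mask[int(np.argmax(areas))] = True
        return mask

    groups_bbox_area = [[10.0, 250.0, 40.0], [99.0]]   # two images
    groups_aid = [[11, 12, 13], [21]]                  # their annotation ids

    large_masks = [keep_largest_mask(a) for a in groups_bbox_area]
    small_masks = [~m for m in large_masks]
    small_aids = [np.array(aids)[m].tolist()
                  for aids, m in zip(groups_aid, small_masks)]
    print(small_aids)  # [[11, 13], []] -> everything except the largest annot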
Example #4
    def fix_duplicates(drive):
        r"""
        For every duplicate file passing a filter (e.g. avi), remove the copy
        that lives in the smallest directory; on a tie use the smallest dpath.
        This collapses all duplicate files into a single folder.

        Caveat: the non-duplicates in that folder also need to be inspected to
        decide whether they should move as well, so this should only trigger
        on folders that are at least 50% duplicates. Curated folders probably
        should not be moved.

        Example:
            cd ~/local/scripts
            >>> from register_files import *  # NOQA
            >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
            >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
            >>> E = drive = drives[0]
            >>> #D, E, F = drives
        """
        print('Fixing Duplicates in %r' % (drive,))
        list_ = drive.fpath_hashX_list
        multiindex_dict_ = build_multindex(list_)
        duplicate_hashes = [
            key for key, val in six.iteritems(multiindex_dict_)
            if len(val) > 1
        ]
        duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        # Check if any dups have been removed
        still_exists = ut.unflat_map(exists, unflat_fpaths)
        unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
        duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
        # Look at duplicate files
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list, duplicate_idxs)
        # Find highly coupled directories
        if True:
            coupled_dirs = []
            for fpaths in unflat_fpaths:
                #basedir = ut.longest_existing_path(commonprefix(fpaths))
                dirs = sorted(list(map(dirname, fpaths)))
                _list = list(range(len(dirs)))
                idxs = ut.upper_diag_self_prodx(_list)
                coupled_dirs.extend(list(map(tuple, ut.list_unflat_take(dirs, idxs))))
            hist_ = ut.dict_hist(coupled_dirs)
            coupled_idxs = ut.list_argsort(hist_.values())[::-1]
            most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
            print('Coupled fpaths: ' + ut.list_str(most_coupled, nl=True))
        print('%d unique files are duplicated' % (len(unflat_sizes),))
        #print('Duplicate sizes: ' + ut.list_str(unflat_sizes[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0::5], nl=True))
        print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths, nl=True))
        # Find duplicate directories
        dpath_list = list(drive.dpath_to_fidx.keys())
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
        #exists_list = list(map(exists, drive.fpath_list))
        #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
        fname_registry = [basename(fpath) for fpath in drive.fpath_list]
        unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)

        def unsorted_list_hash(list_):
            return ut.hashstr27(str(sorted(list_)))

        unflat_fname_sets = list(map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
        fname_based_duplicate_dpaths = []
        multiindex_dict2_ = build_multindex(unflat_fname_sets)
        fname_based_duplicate_hashes = [key for key, val in multiindex_dict2_.items() if len(val) > 1]
        print('#fname_based_duplicate_dpaths = %r' % (len(fname_based_duplicate_hashes),))
        fname_based_duplicate_didxs = ut.dict_take(multiindex_dict2_, fname_based_duplicate_hashes)
        fname_based_duplicate_dpaths = ut.list_unflat_take(dpath_list, fname_based_duplicate_didxs)
        print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
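build_multindex is not defined in this excerpt; judging from how it is used (each value maps to every index at which it occurs), a hypothetical equivalent looks like this:

    from collections import defaultdict

    def build_multindex(list_):
        # hypothetical reconstruction: value -> list of indices where it occurs
        multiindex = defaultdict(list)
        for idx, item in enumerate(list_):
            multiindex[item].append(idx)
        return dict(multiindex)

    hashes = ['aaa', 'bbb', 'aaa', 'ccc', 'bbb']
    multiindex_dict_ = build_multindex(hashes)
    duplicate_hashes = [k for k, v in multiindex_dict_.items() if len(v) > 1]
    print(duplicate_hashes)         # ['aaa', 'bbb']
    print(multiindex_dict_['aaa'])  # [0, 2]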
Example #5
    def fix_duplicates(drive):
        r"""
        For every duplicate file passing a filter (e.g. avi), remove the copy
        that lives in the smallest directory; on a tie use the smallest dpath.
        This collapses all duplicate files into a single folder.

        Caveat: the non-duplicates in that folder also need to be inspected to
        decide whether they should move as well, so this should only trigger
        on folders that are at least 50% duplicates. Curated folders probably
        should not be moved.

        Example:
            cd ~/local/scripts
            >>> from register_files import *  # NOQA
            >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
            >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
            >>> E = drive = drives[0]
            >>> #D, E, F = drives
        """
        print('Fixing Duplicates in %r' % (drive, ))
        list_ = drive.fpath_hashX_list
        multiindex_dict_ = build_multindex(list_)
        duplicate_hashes = [
            key for key, val in six.iteritems(multiindex_dict_) if len(val) > 1
        ]
        duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        # Check if any dups have been removed
        still_exists = ut.unflat_map(exists, unflat_fpaths)
        unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
        duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
        # Look at duplicate files
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list,
                                           duplicate_idxs)
        # Find highly coupled directories
        if True:
            coupled_dirs = []
            for fpaths in unflat_fpaths:
                #basedir = ut.longest_existing_path(commonprefix(fpaths))
                dirs = sorted(list(map(dirname, fpaths)))
                _list = list(range(len(dirs)))
                idxs = ut.upper_diag_self_prodx(_list)
                coupled_dirs.extend(
                    list(map(tuple, ut.list_unflat_take(dirs, idxs))))
            hist_ = ut.dict_hist(coupled_dirs)
            coupled_idxs = ut.list_argsort(hist_.values())[::-1]
            most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
            print('Coupled fpaths: ' + ut.repr2(most_coupled, nl=True))
        print('%d unique files are duplicated' % (len(unflat_sizes), ))
        #print('Duplicate sizes: ' + ut.repr2(unflat_sizes[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0::5], nl=True))
        print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths, nl=True))
        # Find duplicate directories
        dpath_list = list(drive.dpath_to_fidx.keys())
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
        #exists_list = list(map(exists, drive.fpath_list))
        #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
        fname_registry = [basename(fpath) for fpath in drive.fpath_list]
        unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)

        def unsorted_list_hash(list_):
            return ut.hashstr27(str(sorted(list_)))

        unflat_fname_sets = list(
            map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
        fname_based_duplicate_dpaths = []
        multiindex_dict2_ = build_multindex(unflat_fname_sets)
        fname_based_duplicate_hashes = [
            key for key, val in multiindex_dict2_.items() if len(val) > 1
        ]
        print('#fname_based_duplicate_dpaths = %r' %
              (len(fname_based_duplicate_hashes), ))
        fname_based_duplicate_didxs = ut.dict_take(
            multiindex_dict2_, fname_based_duplicate_hashes)
        fname_based_duplicate_dpaths = ut.list_unflat_take(
            dpath_list, fname_based_duplicate_didxs)
        print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
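The duplicate-directory pass fingerprints each directory by the sorted list of file names it contains; directories with equal fingerprints are candidates for merging. A standard-library sketch with hashlib standing in for ut.hashstr27 and an invented dpath-to-fnames mapping:

    import hashlib

    def unsorted_list_hash(list_):
        # order-insensitive fingerprint of a list of strings
        return hashlib.sha1(repr(sorted(list_)).encode('utf-8')).hexdigest()

    dpath_to_fnames = {
        '/data/a': ['img1.jpg', 'img2.jpg'],
        '/data/b': ['img2.jpg', 'img1.jpg'],  # same contents, different order
        '/data/c': ['other.avi'],
    }
    fingerprints = {d: unsorted_list_hash(f) for d, f in dpath_to_fnames.items()}
    print(fingerprints['/data/a'] == fingerprints['/data/b'])  # True
    print(fingerprints['/data/a'] == fingerprints['/data/c'])  # False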