Example #1
    def xval_splits(task):
        train = inputs.Inputs.from_paths(task.train_im_paths(), task.train_gt_paths(),
                                         tag='train')
        test = inputs.Inputs.from_paths(task.test_im_paths(), task.test_gt_paths(),
                                        tag='test')

        assert not bool(ub.find_duplicates(test.im_paths))
        assert not bool(ub.find_duplicates(train.im_paths))

        xval_split = (train, test)
        yield xval_split
Example #2
    def xval_splits(task, test_keys=None):
        import parse
        import logging
        parse.log.setLevel(logging.INFO)

        train_scene = 'JAX'
        test_scene = 'TAM'

        def primary_key_info(paths):
            infos = [parse.parse('{site_id}_Tile_{N}{junk}', p).named
                     for p in map(basename, paths)]
            df = pd.DataFrame(infos)
            return df

        train_inputs = []
        test_inputs = []

        # THESE PATHS MUST BE GENERATED IN THE SAME ORDER EACH TIME
        for k, v in sorted(task.input_modes.items()):
            df = primary_key_info(v.im_paths)
            parts = dict(list(df.groupby(['site_id'])))

            train_idx = parts[train_scene].index
            train_inputs.append(v.take(train_idx))

            if k in ['part-scale1']:
                test_idx = parts[test_scene].index
                test_inputs.append(v.take(test_idx))

        for k, v in sorted(task.augment_modes.items()):
            df = primary_key_info(v.im_paths)
            parts = dict(list(df.groupby(['site_id'])))
            train_idx = parts[train_scene].index
            train_inputs.append(v.take(train_idx))

        for v in train_inputs:
            assert not ub.find_duplicates(v.im_paths)
        for v in test_inputs:
            assert not ub.find_duplicates(v.im_paths)

        train = inputs.Inputs.union_all(*train_inputs)
        test = inputs.Inputs.union_all(*test_inputs)
        train.tag = 'train'
        test.tag = 'test'
        xval_split = (train, test)

        assert not bool(ub.find_duplicates(test.im_paths))
        assert not bool(ub.find_duplicates(train.im_paths))

        yield xval_split
Example #3
def access_cache():
    """
    0.24
    """
    def _walk(parent):
        for node in parent.nodes:
            item = (node.item, id(parent))
            yield item
        for child in parent.children:
            for item in _walk(child):
                yield item

    print('Access regions in {}'.format(sys.executable))
    self = ndsampler.CocoSampler.demo(verbose=0).regions

    # Set workdir to a special location
    self.workdir = ub.ensure_app_cache_dir('ndsampler', 'tests', '23_regions')
    print('self.workdir = {!r}'.format(self.workdir))
    print('self.hashid = {!r}'.format(self.hashid))

    self.verbose = 100
    isect_index = self.isect_index

    for gid, qtree in isect_index.qtrees.items():

        # items = sorted([item for item in _walk(qtree)])
        # print('items = {!r}'.format(items))
        # if ub.find_duplicates(items):
        #     raise Exception('DUPLICATE ITEM AIDS')

        box = [0, 0, qtree.width, qtree.height]
        isect_aids = qtree.intersect(box)
        print('isect_aids = {!r}'.format(isect_aids))
        if ub.find_duplicates(isect_aids):
            raise Exception('DUPLICATE AIDS')

        for aid, box in qtree.aid_to_tlbr.items():
            isect_aids = qtree.intersect(box)
            if ub.find_duplicates(isect_aids):
                raise Exception('DUPLICATE AIDS')
            # print('isect_aids = {!r}'.format(isect_aids))

        print('----')
        print('gid = {!r}'.format(gid))
        print('qtree = {!r}'.format(qtree))
        for node in qtree.nodes:
            print('node.item, node.rect = {!r}, {!r}'.format(
                node.item, node.rect))
Example #4
    def xval_splits(task, xval_method='predef', test_keys=None):
        """
        Generate the list of inputs in each test/train split.

        Currently does the simple thing: train on all training data and
        test on all testing data. Logic exists for leave-one-out, but it
        is disabled.

        Yields:
            tuple(inputs.Inputs, inputs.Inputs): train / test inputs

        >>> train, test = next(task.xval_splits())
        """
        # Parse the prepared data and prepare to split it into test / train
        task.create_groundtruth(force=False)
        scene_im_paths, scene_gt_paths = task._load_all_scene_paths()

        # Per scene xval generator
        def leave_k_out_xval(k=2):
            for test_scenes in ub.chunks(task.scene_ids, chunksize=k):
                # Simple leave one out
                train_scenes = list(task.scene_ids)
                for test_scene in test_scenes:
                    train_scenes.remove(test_scene)
                print('test_scenes = {!r}'.format(test_scenes))
                print('train_scenes = {!r}'.format(train_scenes))
                yield train_scenes, test_scenes

        def predef_single_xval():
            train_scenes, test_scenes = task.load_predefined_train_test()
            yield train_scenes, test_scenes

        if xval_method == 'predef':
            xval_iter = predef_single_xval()
        elif xval_method == 'k=2':
            xval_iter = leave_k_out_xval(k=2)
        else:
            raise KeyError(xval_method)

        train_keys = task._preprocessing_keys()
        if test_keys is None:
            test_keys = ['lowres']

        # Given a per scene split, map it to a split on an per-image basis
        flatten = it.chain.from_iterable
        for train_scenes, test_scenes in xval_iter:

            train_im_paths = list(
                flatten([
                    scene_im_paths[s][k]
                    for (s, k) in it.product(train_scenes, train_keys)
                ])) + task.extern_train_im_paths

            train_gt_paths = list(
                flatten([
                    scene_gt_paths[s][k]
                    for (s, k) in it.product(train_scenes, train_keys)
                ])) + task.extern_train_gt_paths

            test_im_paths = list(
                flatten([
                    scene_im_paths[s][k]
                    for (s, k) in it.product(test_scenes, test_keys)
                ]))
            test_gt_paths = list(
                flatten([
                    scene_gt_paths[s][k]
                    for (s, k) in it.product(test_scenes, test_keys)
                ]))

            train = inputs.Inputs.from_paths(train_im_paths,
                                             train_gt_paths,
                                             tag='train')
            test = inputs.Inputs.from_paths(test_im_paths,
                                            test_gt_paths,
                                            tag='test')

            assert not bool(ub.find_duplicates(test.im_paths))
            assert not bool(ub.find_duplicates(train.im_paths))

            xval_split = (train, test)
            yield xval_split
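
A minimal usage sketch of the generator above (assumes a `task` instance providing this method, and only the `Inputs` attributes already used in these examples):

    # Iterate over the yielded (train, test) pairs
    for train, test in task.xval_splits():
        print(train.tag, len(train.im_paths))
        print(test.tag, len(test.im_paths))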
Example #5
    def argparse(self, parser=None, special_options=False):
        """
        construct or update an argparse.ArgumentParser CLI parser

        Args:
            parser (None | argparse.ArgumentParser): if specified this
                parser is updated with options from this config.

            special_options (bool, default=False):
                adds special scriptconfig options, namely: --config, --dumps,
                and --dump.

        Returns:
            argparse.ArgumentParser : a new or updated argument parser

        CommandLine:
            xdoctest -m scriptconfig.config Config.argparse:0
            xdoctest -m scriptconfig.config Config.argparse:1

        TODO:
            A good CLI spec for lists might be

            # In the case where ``key`` ends with an ``=``, assume the list is
            # given as a comma separated string with optional square brackets
            # at each end.

            --key=[f]

            # In the case where ``key`` does not end with an equals sign and we
            # know the value is supposed to be a list, then we consume
            # arguments until we hit the next one that starts with '--' (which
            # means that list items cannot start with '--', but they can
            # contain commas)
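
            # A minimal sketch of the comma/bracket splitting rule described
            # above (hypothetical helper, not part of scriptconfig):
            #
            #     def _split_listish(text):
            #         text = text.strip()
            #         if text.startswith('[') and text.endswith(']'):
            #             text = text[1:-1]
            #         return [p.strip() for p in text.split(',') if p.strip()]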

        FIXME:

            * In the case where we have an nargs='+' action, and we specify
              the option with an `=`, and then we give positional args after
              it, there is no way to modify the behavior of the action to just
              look at the data in the string without modifying the
              ArgumentParser itself. The action object has no control over it.
              For example `--foo=bar baz biz` will parse as `[baz, biz]`, which
              is really not what we want. We may be able to overload
              ArgumentParser to fix this.

        Example:
            >>> # You can now make instances of this class
            >>> import scriptconfig
            >>> self = scriptconfig.Config.demo()
            >>> parser = self.argparse()
            >>> parser.print_help()
            >>> # xdoctest: +REQUIRES(PY3)
            >>> # Python2 argparse does a hard sys.exit instead of raise
            >>> ns, extra = parser.parse_known_args()

        Example:
            >>> # You can now make instances of this class
            >>> import scriptconfig as scfg
            >>> class MyConfig(scfg.Config):
            >>>     description = 'my CLI description'
            >>>     default = {
            >>>         'path1':  scfg.Value(None, position=1, alias='src'),
            >>>         'path2':  scfg.Value(None, position=2, alias='dst'),
            >>>         'dry':  scfg.Value(False, isflag=True),
            >>>         'approx':  scfg.Value(False, isflag=False, alias=['a1', 'a2']),
            >>>     }
            >>> self = MyConfig()
            >>> special_options = True
            >>> parser = None
            >>> parser = self.argparse(special_options=special_options)
            >>> parser.print_help()
            >>> self._read_argv(argv=['objection', '42', '--path1=overruled!'])
            >>> print('self = {!r}'.format(self))

        Ignore:
            >>> self._read_argv(argv=['hi','--path1=foobar'])
            >>> self._read_argv(argv=['hi', 'hello', '--path1=foobar'])
            >>> self._read_argv(argv=['hi', 'hello', '--path1=foobar', '--help'])
            >>> self._read_argv(argv=['--path1=foobar', '--path1=baz'])
            >>> print('self = {!r}'.format(self))
        """
        import argparse

        if parser is None:
            parserkw = self._parserkw()
            parser = argparse.ArgumentParser(**parserkw)

        # Use a custom action to mark which values were explicitly set on
        # the command line
        parser._explicitly_given = set()

        parent = self

        class ParseAction(argparse.Action):
            def __init__(self, *args, **kwargs):
                super(ParseAction, self).__init__(*args, **kwargs)
                # With scriptconfig nothing should be required by default;
                # all positional arguments should have keyword-arg variants.
                # Setting required=False here will prevent positional args
                # from erroring if they are not specified. I don't think there
                # are other side effects, but we should make sure that is
                # actually the case.
                self.required = False

                if self.type is None:
                    # Is this the right place to put this?
                    def _mytype(value):
                        key = self.dest
                        template = parent.default[key]
                        if not isinstance(template, Value):
                            # smartcast non-valued params from commandline
                            value = smartcast.smartcast(value)
                        else:
                            value = template.cast(value)
                        return value

                    self.type = _mytype

                # print('self.type = {!r}'.format(self.type))

            def __call__(action, parser, namespace, values, option_string=None):
                # print('CALL action = {!r}'.format(action))
                # print('option_string = {!r}'.format(option_string))
                # print('values = {!r}'.format(values))

                if isinstance(values, list) and len(values):
                    # We got a list of lists, which we hack into a flat list
                    if isinstance(values[0], list):
                        import itertools as it
                        values = list(it.chain(*values))

                setattr(namespace, action.dest, values)
                parser._explicitly_given.add(action.dest)

        # IRC: this ensures each key has a real Value class
        _metadata = {
            key: self._data[key]
            for key, value in self._default.items()
            if isinstance(self._data[key], Value)
        }  # :type: Dict[str, Value]
        _positions = {k: v.position for k, v in _metadata.items()
                      if v.position is not None}
        if _positions:
            if ub.find_duplicates(_positions.values()):
                raise Exception('two values have the same position')
            _keyorder = ub.oset(ub.argsort(_positions))
            _keyorder |= (ub.oset(self._default) - _keyorder)
        else:
            _keyorder = list(self._default.keys())

        def _add_arg(parser, name, key, argkw, positional, isflag, isalias):
            _argkw = argkw.copy()

            if isalias:
                _argkw['help'] = 'alias of {}'.format(key)
                _argkw.pop('default', None)
                # flags cannot have flag aliases
                isflag = False

            elif positional:
                parser.add_argument(name, **_argkw)

            if isflag:
                # Can we support both flag and setitem methods of cli
                # parsing?
                if not isinstance(_argkw.get('default', None), bool):
                    raise ValueError('can only use isflag with bools')
                _argkw.pop('type', None)
                _argkw.pop('choices', None)
                _argkw.pop('action', None)
                _argkw.pop('nargs', None)
                _argkw['dest'] = key

                _argkw_true = _argkw.copy()
                _argkw_true['action'] = 'store_true'

                _argkw_false = _argkw.copy()
                _argkw_false['action'] = 'store_false'
                _argkw_false.pop('help', None)

                parser.add_argument('--' + name, **_argkw_true)
                parser.add_argument('--no-' + name, **_argkw_false)
            else:
                parser.add_argument('--' + name, **_argkw)

        mode = 1

        alias_registry = []
        for key, value in self._data.items():
            # key: str
            # value: Any | Value
            argkw = {}
            argkw['help'] = ''
            positional = None
            isflag = False
            if key in _metadata:
                # Use the metadata in the Value class to enhance argparse
                _value = _metadata[key]
                argkw.update(_value.parsekw)
                value = _value.value
                isflag = _value.isflag
                positional = _value.position
            else:
                _value = value if isinstance(value, Value) else None

            if not argkw['help']:
                argkw['help'] = '<undocumented>'

            argkw['default'] = value
            argkw['action'] = ParseAction

            name = key
            _add_arg(parser, name, key, argkw, positional, isflag, isalias=False)

            if _value is not None:
                if _value.alias:
                    alts = _value.alias
                    alts = alts if ub.iterable(alts) else [alts]
                    for alias in alts:
                        tup = (alias, key, argkw)
                        alias_registry.append(tup)
                        if mode == 0:
                            name = alias
                            _add_arg(parser, name, key, argkw, positional, isflag, isalias=True)

        if mode == 1:
            for tup in alias_registry:
                (alias, key, argkw) = tup
                name = alias
                dest = key
                _add_arg(parser, name, dest, argkw, positional, isflag, isalias=True)

        if special_options:
            parser.add_argument('--config', default=None, help=ub.codeblock(
                '''
                special scriptconfig option that accepts the path to an on-disk
                configuration file, and loads that into this {!r} object.
                ''').format(self.__class__.__name__))

            parser.add_argument('--dump', default=None, help=ub.codeblock(
                '''
                If specified, dump this config to disk.
                ''').format(self.__class__.__name__))

            parser.add_argument('--dumps', action='store_true', help=ub.codeblock(
                '''
                If specified, dump this config to stdout
                ''').format(self.__class__.__name__))

        return parser
Example #6
def main(bib_fpath=None):
    r"""
    Entry point for the fixbib script

    CommandLine:
        fixbib
        python -m fixtex bib
        python -m fixtex bib --dryrun
        python -m fixtex bib --dryrun --debug
    """

    if bib_fpath is None:
        bib_fpath = 'My Library.bib'

    # DEBUG = ub.argflag('--debug')
    # Read in text and ensure ascii format
    dirty_text = ut.readfrom(bib_fpath)

    from fixtex.fix_tex import find_used_citations, testdata_fpaths

    if exists('custom_extra.bib'):
        extra_parser = bparser.BibTexParser(ignore_nonstandard_types=False)
        parser = bparser.BibTexParser()
        ut.delete_keys(parser.alt_dict, ['url', 'urls'])
        print('Parsing extra bibtex file')
        extra_text = ut.readfrom('custom_extra.bib')
        extra_database = extra_parser.parse(extra_text, partial=False)
        print('Finished parsing extra')
        extra_dict = extra_database.get_entry_dict()
    else:
        extra_dict = None

    #udata = dirty_text.decode("utf-8")
    #dirty_text = udata.encode("ascii", "ignore")
    #dirty_text = udata

    # parser = bparser.BibTexParser()
    # bib_database = parser.parse(dirty_text)
    # d = bib_database.get_entry_dict()

    print('BIBTEXPARSER LOAD')
    parser = bparser.BibTexParser(ignore_nonstandard_types=False,
                                  common_strings=True)
    ut.delete_keys(parser.alt_dict, ['url', 'urls'])
    print('Parsing bibtex file')
    bib_database = parser.parse(dirty_text, partial=False)
    print('Finished parsing')

    bibtex_dict = bib_database.get_entry_dict()
    old_keys = list(bibtex_dict.keys())
    new_keys = []
    for key in ub.ProgIter(old_keys, label='fixing keys'):
        new_key = key
        new_key = new_key.replace(':', '')
        new_key = new_key.replace('-', '_')
        new_key = re.sub('__*', '_', new_key)
        new_keys.append(new_key)

    # assert len(ut.find_duplicate_items(new_keys)) == 0, 'new keys created conflict'
    assert len(ub.find_duplicates(new_keys)) == 0, 'new keys created conflict'

    for key, new_key in zip(old_keys, new_keys):
        if key != new_key:
            entry = bibtex_dict[key]
            entry['ID'] = new_key
            bibtex_dict[new_key] = entry
            del bibtex_dict[key]

    # The bibtex is now clean. Print it to stdout
    #print(clean_text)
    verbose = None
    if verbose is None:
        verbose = 1

    # Find citations from the tex documents
    key_list = None
    if key_list is None:
        cacher = ub.Cacher('texcite1', enabled=0)
        data = cacher.tryload()
        if data is None:
            fpaths = testdata_fpaths()
            key_list, inverse = find_used_citations(fpaths,
                                                    return_inverse=True)
            # ignore = ['JP', '?', 'hendrick']
            # for item in ignore:
            #     try:
            #         key_list.remove(item)
            #     except ValueError:
            #         pass
            if verbose:
                print('Found %d citations used in the document' %
                      (len(key_list), ))
            data = key_list, inverse
            cacher.save(data)
        key_list, inverse = data

    # else:
    #     key_list = None

    unknown_pubkeys = []
    debug_author = ub.argval('--debug-author', default=None)
    # ./fix_bib.py --debug-author=Kappes

    if verbose:
        print('Fixing %d/%d bibtex entries' %
              (len(key_list), len(bibtex_dict)))

    # debug = True
    debug = False
    if debug_author is not None:
        debug = False

    known_keys = list(bibtex_dict.keys())
    missing_keys = set(key_list) - set(known_keys)
    if extra_dict is not None:
        missing_keys.difference_update(set(extra_dict.keys()))

    if missing_keys:
        print('The library is missing keys found in tex files %s' %
              (ub.repr2(missing_keys), ))

    # Search for possible typos:
    candidate_typos = {}
    sedlines = []
    for key in missing_keys:
        candidates = ut.closet_words(key, known_keys, num=3, subset=True)
        if len(candidates) > 1:
            top = candidates[0]
            if ut.edit_distance(key, top) == 1:
                # "sed -i -e 's/{}/{}/g' *.tex".format(key, top)
                import os
                replpaths = ' '.join(
                    [relpath(p, os.getcwd()) for p in inverse[key]])
                sedlines.append("sed -i -e 's/{}/{}/g' {}".format(
                    key, top, replpaths))
        candidate_typos[key] = candidates
        print('Cannot find key = %r' % (key, ))
        print('Did you mean? %r' % (candidates, ))

    print('Quick fixes')
    print('\n'.join(sedlines))

    # group by file
    just = max([0] + list(map(len, missing_keys)))
    missing_fpaths = [inverse[key] for key in missing_keys]
    for fpath in sorted(set(ub.flatten(missing_fpaths))):
        # ut.fix_embed_globals()
        subkeys = [k for k in missing_keys if fpath in inverse[k]]
        print('')
        ut.cprint('--- Missing Keys ---', 'blue')
        ut.cprint('fpath = %r' % (fpath, ), 'blue')
        ut.cprint('{} | {}'.format('Missing'.ljust(just), 'Did you mean?'),
                  'blue')
        for key in subkeys:
            print('{} | {}'.format(ut.highlight_text(key.ljust(just), 'red'),
                                   ' '.join(candidate_typos[key])))

    # for key in list(bibtex_dict.keys()):

    if extra_dict is not None:
        # Extra database takes precedence over regular
        key_list = list(ut.unique(key_list + list(extra_dict.keys())))
        for k, v in extra_dict.items():
            bibtex_dict[k] = v

    full = ub.argflag('--full')

    for key in key_list:
        try:
            entry = bibtex_dict[key]
        except KeyError:
            continue
        self = BibTexCleaner(key, entry, full=full)

        if debug_author is not None:
            debug = debug_author in entry.get('author', '')

        if debug:
            ut.cprint(' --- ENTRY ---', 'yellow')
            print(ub.repr2(entry, nl=1))

        entry = self.fix()
        # self.clip_abstract()
        # self.shorten_keys()
        # self.fix_authors()
        # self.fix_year()
        # old_pubval = self.fix_pubkey()
        # if old_pubval:
        #     unknown_pubkeys.append(old_pubval)
        # self.fix_arxiv()
        # self.fix_general()
        # self.fix_paper_types()

        if debug:
            print(ub.repr2(entry, nl=1))
            ut.cprint(' --- END ENTRY ---', 'yellow')
        bibtex_dict[key] = entry

    unwanted_keys = set(bibtex_dict.keys()) - set(key_list)
    if verbose:
        print('Removing unwanted %d entries' % (len(unwanted_keys)))
    ut.delete_dict_keys(bibtex_dict, unwanted_keys)

    if 0:
        d1 = bibtex_dict.copy()
        full = True
        for key, entry in d1.items():
            self = BibTexCleaner(key, entry, full=full)
            pub = self.publication()
            if pub is None:
                print(self.entry['ENTRYTYPE'])

            old = self.fix_pubkey()
            x1 = self._pubval()
            x2 = self.standard_pubval(full=full)
            # if x2 is not None and len(x2) > 5:
            #     print(ub.repr2(self.entry))

            if x1 != x2:
                print('x2 = %r' % (x2, ))
                print('x1 = %r' % (x1, ))
                print(ub.repr2(self.entry))

            # if 'CVPR' in self.entry.get('booktitle', ''):
            #     if 'CVPR' != self.entry.get('booktitle', ''):
            #         break
            if old:
                print('old = %r' % (old, ))
            d1[key] = self.entry

    if full:
        d1 = bibtex_dict.copy()

        import numpy as np
        import pandas as pd
        df = pd.DataFrame.from_dict(d1, orient='index')

        paged_items = df[~pd.isnull(df['pub_accro'])]
        has_pages = ~pd.isnull(paged_items['pages'])
        print('have pages {} / {}'.format(has_pages.sum(), len(has_pages)))
        print(ub.repr2(paged_items[~has_pages]['title'].values.tolist()))

        entrytypes = dict(list(df.groupby('pub_type')))
        if False:
            # entrytypes['misc']
            g = entrytypes['online']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]

            entrytypes['book']
            entrytypes['thesis']
            g = entrytypes['article']
            g = entrytypes['incollection']
            g = entrytypes['conference']

        def lookup_pub(e):
            if e == 'article':
                return 'journal', 'journal'
            elif e == 'incollection':
                return 'booksection', 'booktitle'
            elif e == 'conference':
                return 'conference', 'booktitle'
            return None, None

        for e, g in entrytypes.items():
            print('e = %r' % (e, ))
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            if 'pub_full' in g.columns:
                place_title = g['pub_full'].tolist()
                print(ub.repr2(ub.dict_hist(place_title)))
            else:
                print('Unknown publications')

        if 'report' in entrytypes:
            g = entrytypes['report']
            missing = g[pd.isnull(g['title'])]
            if len(missing):
                print('Missing Title')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'journal' in entrytypes:
            g = entrytypes['journal']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]

            missing = g[pd.isnull(g['journal'])]
            if len(missing):
                print('Missing Journal')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'conference' in entrytypes:
            g = entrytypes['conference']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]

            missing = g[pd.isnull(g['booktitle'])]
            if len(missing):
                print('Missing Booktitle')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'incollection' in entrytypes:
            g = entrytypes['incollection']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]

            missing = g[pd.isnull(g['booktitle'])]
            if len(missing):
                print('Missing Booktitle')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'thesis' in entrytypes:
            g = entrytypes['thesis']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            missing = g[pd.isnull(g['institution'])]
            if len(missing):
                print('Missing Institution')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        # import utool
        # utool.embed()

    # Overwrite BibDatabase structure
    bib_database._entries_dict = bibtex_dict
    bib_database.entries = list(bibtex_dict.values())

    #conftitle_to_types_set_hist = {key: set(val) for key, val in conftitle_to_types_hist.items()}
    #print(ub.repr2(conftitle_to_types_set_hist))

    print('Unknown conference keys:')
    print(ub.repr2(sorted(unknown_pubkeys)))
    print('len(unknown_pubkeys) = %r' % (len(unknown_pubkeys), ))

    writer = BibTexWriter()
    writer.contents = ['comments', 'entries']
    writer.indent = '  '
    writer.order_entries_by = ('type', 'author', 'year')

    new_bibtex_str = bibtexparser.dumps(bib_database, writer)

    # Need to check
    #jegou_aggregating_2012

    # Fix the Journal Abreviations
    # References:
    # https://www.ieee.org/documents/trans_journal_names.pdf

    # Write out clean bibfile in ascii format
    clean_bib_fpath = ub.augpath(bib_fpath.replace(' ', '_'), suffix='_clean')

    if not ub.argflag('--dryrun'):
        ut.writeto(clean_bib_fpath, new_bibtex_str)
Example #7
def 数组_查找重复项(items, 至少出现=2):
    # "Find duplicate items in an array": a thin wrapper around
    # ub.find_duplicates, where 至少出现 is the minimum occurrence count (k).
    data = ub.find_duplicates(items, k=至少出现)
    return data
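
For reference, a minimal usage sketch of `ub.find_duplicates` itself (the returned dict maps each item occurring at least `k` times to the indices where it occurs):

    import ubelt as ub

    items = [3, 5, 3, 1, 5, 5]
    print(ub.find_duplicates(items))       # {3: [0, 2], 5: [1, 4, 5]}
    print(ub.find_duplicates(items, k=3))  # {5: [1, 4, 5]}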
Example #8
def _devcheck_manage_snapshots(workdir, recent=5, factor=10, dry=True):
    """
    Sometimes netharn produces too many snapshots. The Monitor class attempts
    to prevent this, but it's not perfect. So, sometimes you need to manually
    clean up. This code snippet serves as a template for doing so.

    I recommend using IPython to do this, following this code as a guide.
    Unfortunately, I don't have a safe automated way of doing this yet.

    The basic code simply lists all snapshots that you have. It's then your
    job to find a heuristic to remove the ones you don't need.

    Note:
        # Idea for a more automatic method

        In the future, we should use monitor to inspect the critical points of
        all metric curves and include any epoch that is at those critical
        points. A critical point is defined as one where there is a significant
        change in trajectory. Basically, we try to fit a low-degree polynomial
        or piecewise linear function to the metric curves, and we take the
        places where there is a significant change from a global perspective.
        (A rough sketch of this idea follows this example.)

    # Specify your workdir
    workdir = ub.expandpath('~/work/voc_yolo2')
    """

    USE_RANGE_HUERISTIC = True

    run_dpath = join(workdir, 'fit', 'runs')
    snapshot_dpaths = list(
        glob.glob(join(run_dpath, '**/torch_snapshots'), recursive=True))
    print('checking {} snapshot paths'.format(len(snapshot_dpaths)))

    all_keep = []
    all_remove = []

    for snapshot_dpath in snapshot_dpaths:
        snapshots = sorted(glob.glob(join(snapshot_dpath, '_epoch_*.pt')))
        epoch_to_snap = {
            int(parse.parse('{}_epoch_{num:d}.pt', path).named['num']): path
            for path in snapshots
        }
        existing_epochs = sorted(epoch_to_snap.keys())
        # print('existing_epochs = {}'.format(ub.repr2(existing_epochs)))
        toremove = []
        tokeep = []

        if USE_RANGE_HUERISTIC:
            # My criterion is that I'm only going to keep the `recent` latest
            # epochs, and I'll also keep one epoch from each bucket of `factor`
            # consecutive epochs (e.g. [0,50), [50,100), ... when factor=50).
            existing_epochs = sorted(existing_epochs)
            dups = ub.find_duplicates(np.array(sorted(existing_epochs)) //
                                      factor,
                                      k=0)
            keep_idxs = [max(idxs) for _, idxs in dups.items()]
            keep = set(ub.take(existing_epochs, keep_idxs))

            keep.update(existing_epochs[-recent:])

            if existing_epochs and existing_epochs[0] != 0:
                keep.update(existing_epochs[0:1])

            kill = []
            for epoch, path in epoch_to_snap.items():
                if epoch in keep:
                    tokeep.append(path)
                else:
                    kill.append(epoch)
                    toremove.append(path)
            # print('toremove = {!r}'.format(toremove))
            print('keep = {!r}'.format(sorted(keep)))
            print('kill = {!r}'.format(sorted(kill)))

        print('Keep {}/{} from {}'.format(len(keep), len(existing_epochs),
                                          snapshot_dpath))
        all_keep += [tokeep]
        all_remove += [toremove]

    # print('all_keep = {}'.format(ub.repr2(all_keep, nl=2)))
    # print('all_remove = {}'.format(ub.repr2(all_remove, nl=2)))
    """
    pip install send2trash
    import send2trash
    send2trash.send2trash(path)
    """
    total = 0
    for path in ub.flatten(all_remove):
        total += os.path.getsize(path)

    total_mb = total / 2**20
    if dry:
        print('Cleanup would delete {} snapshots and free {!r} MB'.format(
            sum(map(len, all_remove)), total_mb))
        print('Use -f to confirm and force cleanup')
    else:
        print('About to free {!r} MB'.format(total_mb))
        for path in ub.flatten(all_remove):
            ub.delete(path, verbose=True)
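
A rough sketch of the automatic idea from the Note above: flag epochs where a metric curve bends sharply, using second differences as a crude curvature measure (the `epochs`/`metric` inputs, `_critical_epochs` name, and threshold are hypothetical, not part of netharn):

    import numpy as np

    def _critical_epochs(epochs, metric, rel_thresh=2.0):
        """
        Return the epochs at which the metric trajectory bends sharply.
        Endpoints are always kept.
        """
        epochs = np.asarray(epochs)
        metric = np.asarray(metric, dtype=float)
        if len(metric) < 3:
            return list(epochs)
        # Second differences approximate the curvature of the metric curve
        curvature = np.abs(np.diff(metric, n=2))
        thresh = rel_thresh * (np.median(curvature) + 1e-9)
        flags = np.r_[True, curvature > thresh, True]
        return list(epochs[flags])

    # e.g. keep.update(_critical_epochs(existing_epochs, loss_per_epoch))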
Example #9
def compute_fmech_score(cm, qreq_=None, hack_single_ori=None):
    r"""
    nsum. This is the fmech scoring mechanism.


    Args:
        cm (ibeis.ChipMatch):

    Returns:
        tuple: (unique_nids, nsum_score_list)

    CommandLine:
        python -m ibeis.algo.hots.name_scoring --test-compute_fmech_score
        python -m ibeis.algo.hots.name_scoring --test-compute_fmech_score:0
        python -m ibeis.algo.hots.name_scoring --test-compute_fmech_score:2
        utprof.py -m ibeis.algo.hots.name_scoring --test-compute_fmech_score:2
        utprof.py -m ibeis.algo.hots.pipeline --test-request_ibeis_query_L0:0 --db PZ_Master1 -a timectrl:qindex=0:256

    Example0:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.algo.hots.name_scoring import *  # NOQA
        >>> cm = testdata_chipmatch()
        >>> nsum_score_list = compute_fmech_score(cm)
        >>> assert np.all(nsum_score_list == [ 4.,  7.,  5.])

    Example1:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.algo.hots.name_scoring import *  # NOQA
        >>> ibs, qreq_, cm_list = plh.testdata_post_sver('PZ_MTEST', qaid_list=[18])
        >>> cm = cm_list[0]
        >>> cm.evaluate_dnids(qreq_)
        >>> cm._cast_scores()
        >>> #cm.qnid = 1   # Hack for testdb1 names
        >>> nsum_score_list = compute_fmech_score(cm, qreq_)
        >>> #assert np.all(nsum_nid_list == cm.unique_nids), 'nids out of alignment'
        >>> flags = (cm.unique_nids == cm.qnid)
        >>> max_true = nsum_score_list[flags].max()
        >>> max_false = nsum_score_list[~flags].max()
        >>> assert max_true > max_false, 'is this truly a hard case?'
        >>> assert max_true > 1.2, 'score=%r should be higher for aid=18' % (max_true,)

    Example2:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.algo.hots.name_scoring import *  # NOQA
        >>> ibs, qreq_, cm_list = plh.testdata_post_sver('PZ_MTEST', qaid_list=[18], cfgdict=dict(query_rotation_heuristic=True))
        >>> cm = cm_list[0]
        >>> cm.score_name_nsum(qreq_)
        >>> ut.quit_if_noshow()
        >>> cm.show_ranked_matches(qreq_, ori=True)

    Example3:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.name_scoring import *  # NOQA
        >>> #ibs, qreq_, cm_list = plh.testdata_pre_sver('testdb1', qaid_list=[1])
        >>> ibs, qreq_, cm_list = plh.testdata_post_sver('testdb1', qaid_list=[1], cfgdict=dict(query_rotation_heuristic=True))
        >>> cm = cm_list[0]
        >>> cm.score_name_nsum(qreq_)
        >>> ut.quit_if_noshow()
        >>> cm.show_ranked_matches(qreq_, ori=True)
    """
    #assert qreq_ is not None
    if hack_single_ori is None:
        try:
            hack_single_ori = qreq_ is not None and (
                qreq_.qparams.query_rotation_heuristic
                or qreq_.qparams.rotation_invariance)
        except AttributeError:
            hack_single_ori = True
    # The score for each feature match
    #
    # The query feature index for each feature match
    fm_list = cm.fm_list
    fs_list = cm.get_fsv_prod_list()
    fx1_list = [fm.T[0] for fm in fm_list]
    if hack_single_ori:
        # Group keypoints with the same xy-coordinate.
        # Combine these features so each only receives one vote
        kpts1 = qreq_.ibs.get_annot_kpts(cm.qaid,
                                         config2_=qreq_.extern_query_config2)
        xys1_ = vt.get_xys(kpts1).T
        fx1_to_comboid = vt.compute_unique_arr_dataids(xys1_)
        fcombo_ids = [fx1_to_comboid.take(fx1) for fx1 in fx1_list]
    else:
        # use the feature index itself as a combo id
        # so each feature only receives one vote
        fcombo_ids = fx1_list

    if False:
        import ubelt as ub
        for ids in fcombo_ids:
            ub.find_duplicates(ids)

    # Group annotation matches by name
    # nsum_nid_list, name_groupxs = vt.group_indices(cm.dnid_list)
    # nsum_nid_list = cm.unique_nids
    name_groupxs = cm.name_groupxs

    nsum_score_list = []
    # For all indices matched to a particular name
    for name_idxs in name_groupxs:
        # Get feat indices and scores corresponding to the name's annots
        name_combo_ids = ut.take(fcombo_ids, name_idxs)
        name_fss = ut.take(fs_list, name_idxs)
        # Flatten over annots in the name
        fs = np.hstack(name_fss)
        if len(fs) == 0:
            nsum_score_list.append(0)
            continue
        combo_ids = np.hstack(name_combo_ids)
        # Features (with the same id) can't vote for this name twice
        group_idxs = vt.group_indices(combo_ids)[1]
        flagged_idxs = [idxs[fs.take(idxs).argmax()] for idxs in group_idxs]
        # Detail: sorting the idxs preserves summation order;
        # this fixes the numerical issue where nsum and csum were off
        flagged_idxs = np.sort(flagged_idxs)
        name_score = fs.take(flagged_idxs).sum()

        nsum_score_list.append(name_score)
    nsum_score_list = np.array(nsum_score_list)

    return nsum_score_list