Example #1
    def find_duplicates(index):
        # fpaths = list(index.files.keys())
        files = list(index.files.values())
        print('Grouping {} files'.format(len(files)))
        grouped = ut.group_items(files, [f.nbytes for f in files])
        print('Found {} groups'.format(len(grouped)))
        potential_dups = {k: v for k, v in grouped.items() if len(v) > 1}
        print('Found {} potential dups by nbytes'.format(len(potential_dups)))

        GB = 2**30  # NOQA
        MB = 2**20  # NOQA
        min_bytes = 10 * MB
        max_bytes = 64 * MB  # note: the original min/max were swapped, making the range empty

        duplicates = []
        for k, fs in ut.ProgIter(potential_dups.items(), freq=1):
            names = [f.n for f in fs]
            if ut.allsame(names):
                # Don't do big files yet
                if min_bytes < k < max_bytes:
                    if ut.allsame([f.hashid for f in fs]):
                        duplicates.extend(fs)
                        for f1, f2 in ut.combinations(fs, 2):
                            f1.duplicates.add(f2)
                            f2.duplicates.add(f1)

        def dpath_similarity(index, dpath1, dpath2):
            d1 = index[dpath1]
            d2 = index[dpath2]
            set1 = {f.hashid for f in ut.ProgIter(d1.files)}
            set2 = {f.hashid for f in ut.ProgIter(d2.files)}
            # n_isect = len(set1.intersection(set2))
            size1, size2 = map(len, (set1, set2))
            # minsize = min(size1, size2)
            # sim_measures = (n_isect, n_isect / minsize)
            return ut.set_overlaps(set1, set2)
            # return sim_measures

        similarities = {}
        r_to_dup = ut.group_items(duplicates, [p.r for p in duplicates])
        for dpath, dups in r_to_dup.items():
            # Check to see if the duplicates all point to the same dir
            f = dups[0]  # NOQA
            common_dpath = set.intersection(*[{_.r
                                               for _ in f.duplicates}
                                              for f in dups])

            for other in common_dpath:
                sim_measures = dpath_similarity(index, dpath, other)
                similarities[(dpath, other)] = sim_measures

        print(ut.repr4(similarities, si=True, nl=2))
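
The pattern above is the classic two-stage duplicate scan: group by size first (cheap), then hash only the groups that collide. A minimal standard-library sketch of the same idea, independent of utool (find_duplicate_files and root are hypothetical names, not part of the example):

import hashlib
from collections import defaultdict
from pathlib import Path

def find_duplicate_files(root):
    # Stage 1: bucket files by size; a unique size cannot be a duplicate
    by_size = defaultdict(list)
    for path in Path(root).rglob('*'):
        if path.is_file():
            by_size[path.stat().st_size].append(path)
    # Stage 2: hash only the colliding buckets
    by_digest = defaultdict(list)
    for size, paths in by_size.items():
        if len(paths) < 2:
            continue
        for path in paths:
            digest = hashlib.sha256(path.read_bytes()).hexdigest()
            by_digest[digest].append(path)
    return {k: v for k, v in by_digest.items() if len(v) > 1}
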
Example #3
def group_daids_for_indexing_by_name(ibs, daid_list, num_indexers=8,
                                     verbose=True):
    """
    returns groups with only one annotation per name in each group
    """
    tup = ibs.group_annots_by_known_names(daid_list)
    aidgroup_list, invalid_aids = tup
    largest_groupsize = max(map(len, aidgroup_list))
    num_bins = min(largest_groupsize, num_indexers)
    if verbose or ut.VERYVERBOSE:
        print('[mindex] num_indexers = %d ' % (num_indexers,))
        print('[mindex] largest_groupsize = %d ' % (largest_groupsize,))
        print('[mindex] num_bins = %d ' % (num_bins,))
    # Group annotations for indexing according to the split criteria
    aids_list, overflow_aids = ut.sample_zip(
        aidgroup_list, num_bins, allow_overflow=True, per_bin=1)
    if __debug__:
        # All groups have the same name
        nidgroup_list = ibs.unflat_map(ibs.get_annot_name_rowids, aidgroup_list)
        for nidgroup in nidgroup_list:
            assert ut.allsame(nidgroup), 'bad name grouping'
    if __debug__:
        # All subsequent indexers are subsets (in name/identity space)
        # of the previous
        nids_list = ibs.unflat_map(ibs.get_annot_name_rowids, aids_list)
        prev_ = None
        for nids in nids_list:
            if prev_ is None:
                prev_ = set(nids)
            else:
                assert prev_.issuperset(nids), 'bad indexer grouping'
    return aids_list, overflow_aids, num_bins
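
The ut.sample_zip call with per_bin=1 appears to deal one annotation per name into each bin, spilling the rest into an overflow list; that is what makes each later bin a subset (in name space) of the previous one, as the debug asserts verify. A rough standalone sketch of that behavior (an assumption about utool's semantics, not its actual source):

def sample_zip(groups, num_bins):
    # bin i receives the i-th member of every group that has one
    bins = [[] for _ in range(num_bins)]
    overflow = []
    for group in groups:
        for i, item in enumerate(group):
            if i < num_bins:
                bins[i].append(item)
            else:
                overflow.append(item)
    return bins, overflow

bins, overflow = sample_zip([[1, 2, 3], [4], [5, 6]], num_bins=2)
# bins == [[1, 4, 5], [2, 6]]; overflow == [3]
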
Example #5
    def __init__(self, clf_list, voting='soft', weights=None):
        self.clf_list = clf_list
        self.voting = voting
        self.weights = weights

        classes_list = [clf.classes_ for clf in clf_list]
        if ut.allsame(classes_list):
            self.classes_ = classes_list[0]
            self.class_idx_mappers = None
        else:
            # Need to make a mapper from individual clf classes to ensemble
            self.class_idx_mappers = []
            classes_ = sorted(set.union(*map(set, classes_list)))
            for clf in clf_list:
                # For each index of the clf classes, find that index in the
                # ensemble classes. E.g. class y=4 might be at cx=1 and ex=0
                mapper = np.empty(len(clf.classes_), dtype=int)
                for cx, y in enumerate(clf.classes_):
                    ex = classes_.index(y)
                    mapper[cx] = ex
                self.class_idx_mappers.append(mapper)
            self.classes_ = np.array(classes_)

        for clf in clf_list:
            # Touch classes_ so an unfitted classifier fails fast here
            clf.classes_  # NOQA
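
The class_idx_mappers built above only pay off at prediction time, when each classifier's probability columns must be scattered into the ensemble's column order before averaging. A hypothetical sketch of that soft-voting step (ensemble_proba is not part of the class above):

import numpy as np

def ensemble_proba(clf_list, mappers, n_classes, X):
    total = np.zeros((len(X), n_classes))
    for clf, mapper in zip(clf_list, mappers):
        proba = clf.predict_proba(X)  # columns ordered by clf.classes_
        total[:, mapper] += proba     # scatter into ensemble column order
    return total / len(clf_list)
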
Example #6
def get_varied_acfg_labels(acfg_list, mainkey='_cfgname', checkname=False):
    """
        >>> from ibeis.expt.annotation_configs import *  # NOQA

    """
    #print(ut.list_str(varied_acfg_list, nl=2))
    for acfg in acfg_list:
        assert acfg['qcfg'][mainkey] == acfg['dcfg'][mainkey], (
            'should be the same for now')
    cfgname_list = [acfg['qcfg'][mainkey] for acfg in acfg_list]
    if checkname and ut.allsame(cfgname_list):
        cfgname_list = [None] * len(cfgname_list)

    # Hack to make common params between q and d appear the same
    _acfg_list = [compress_aidcfg(acfg) for acfg in acfg_list]

    flat_acfg_list = flatten_acfg_list(_acfg_list)
    nonvaried_dict, varied_acfg_list = ut.partition_varied_cfg_list(
        flat_acfg_list)

    SUPER_HACK = True
    if SUPER_HACK:
        # SUPER HACK: recompress and remake the varied list after knowing what is varied
        _varied_keys = list(set(ut.flatten(
            [list(ut.flatten(
                [list(x.keys())
                 for x in unflatten_acfgdict(cfg).values()]
            )) for cfg in varied_acfg_list]
        )))
        _acfg_list = [
            compress_aidcfg(acfg, force_noncommon=_varied_keys)
            for acfg in acfg_list]
        flat_acfg_list = flatten_acfg_list(_acfg_list)
        nonvaried_dict, varied_acfg_list = ut.partition_varied_cfg_list(
            flat_acfg_list)

    shortened_cfg_list = [
        #{shorten_to_alias_labels(key): val for key, val in _dict.items()}
        ut.map_dict_keys(shorten_to_alias_labels, _dict)
        for _dict in varied_acfg_list]
    nonlbl_keys = ut.INTERNAL_CFGKEYS
    nonlbl_keys = [prefix + key for key in nonlbl_keys
                   for prefix in ['', 'q', 'd']]
    # hack for sorting by q/d stuff first

    def get_key_order(cfg):
        keys = [k for k in cfg.keys() if k not in nonlbl_keys]
        sortorder = [2 * k.startswith('q') + 1 * k.startswith('d')
                     for k in keys]
        return ut.sortedby(keys, sortorder)[::-1]

    cfglbl_list = [
        ut.get_cfg_lbl(cfg, name, nonlbl_keys, key_order=get_key_order(cfg))
        for cfg, name in zip(shortened_cfg_list, cfgname_list)]

    if checkname:
        cfglbl_list = [x.lstrip(':') for x in cfglbl_list]
    return cfglbl_list
Example #7
def convert_cv2_images_to_theano_images(img_list):
    r"""
    Converts b01c to bc01

    Converts a list of cv2-style images into a single numpy array of nonflat
    theano-style images.

    h=height, w=width, b=batchid, c=channel

    Args:
        img_list (ndarray): an array of images with shape [b, h, w, c]

    Returns:
        data: an ndarray with shape [b, c, h, w]

    CommandLine:
        python -m ibeis_cnn.utils --test-convert_cv2_images_to_theano_images

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis_cnn.utils import *  # NOQA
        >>> import vtool as vt
        >>> # build test data
        >>> img_list, width, height, channels = testdata_imglist()
        >>> # execute function
        >>> data = convert_cv2_images_to_theano_images(img_list)
        >>> subset = (data[0].reshape(3, 32, 32)[:, 0:2, 0:2])
        >>> #result = str(np.transpose(subset, (1, 2, 0)))
        >>> result = str(subset).replace('\n', '')
        >>> print(result)
        [[[  0   3]  [ 96  99]] [[  1   4]  [ 97 100]] [[  2   5]  [ 98 101]]]
    """
    #[img.shape for img in img_list]
    # format to [b, c, h, w]
    if len(img_list.shape) == 3:
        # ensure 4 dimensions
        img_list = img_list.reshape(img_list.shape + (1, ))
    shape_list = [img.shape for img in img_list]
    assert ut.allsame(shape_list)
    theano_style_imgs = [
        np.transpose(img, (2, 0, 1))[None, :] for img in img_list
    ]
    data = np.vstack(theano_style_imgs)
    #data = np.vstack([img[None, :] for img in img_list])
    return data
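
The axis shuffle is the whole trick here: np.transpose(img, (2, 0, 1)) moves the channel axis to the front, turning an [h, w, c] image into [c, h, w]. A quick self-contained check:

import numpy as np
img = np.arange(2 * 3 * 4).reshape(2, 3, 4)  # h=2, w=3, c=4
assert np.transpose(img, (2, 0, 1)).shape == (4, 2, 3)
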
Example #8
    def measure_metrics(infr):
        real_pos_edges = []

        n_true_merges = infr.test_state['n_true_merges']
        confusion = infr.test_state['confusion']

        n_tp = confusion[POSTV][POSTV]
        columns = set(confusion.keys())
        reviewed_cols = columns - {UNREV}
        non_postv = reviewed_cols - {POSTV}
        non_negtv = reviewed_cols - {NEGTV}

        n_fn = sum(ut.take(confusion[POSTV], non_postv))
        n_fp = sum(ut.take(confusion[NEGTV], non_negtv))

        n_error_edges = sum(confusion[r][c] + confusion[c][r] for r, c in
                            ut.combinations(reviewed_cols, 2))
        # assert n_fn + n_fp == n_error_edges

        pred_n_pcc_mst_edges = n_true_merges

        if 0:
            import ubelt as ub
            for timer in ub.Timerit(10):
                with timer:
                    # Find undetectable errors
                    num_undetectable_fn = 0
                    for nid1, nid2 in infr.neg_redun_metagraph.edges():
                        cc1 = infr.pos_graph.component(nid1)
                        cc2 = infr.pos_graph.component(nid2)
                        neg_edges = nxu.edges_cross(infr.neg_graph, cc1, cc2)
                        for u, v in neg_edges:
                            real_nid1 = infr.node_truth[u]
                            real_nid2 = infr.node_truth[v]
                            if real_nid1 == real_nid2:
                                num_undetectable_fn += 1
                                break

                    # Find undetectable errors
                    num_undetectable_fp = 0
                    for nid in infr.pos_redun_nids:
                        cc = infr.pos_graph.component(nid)
                        if not ut.allsame(ut.take(infr.node_truth, cc)):
                            num_undetectable_fp += 1

            print('num_undetectable_fn = %r' % (num_undetectable_fn,))
            print('num_undetectable_fp = %r' % (num_undetectable_fp,))

        if 0:
            n_error_edges2 = 0
            n_fn2 = 0
            n_fp2 = 0
            for edge, data in infr.edges(data=True):
                decision = data.get('evidence_decision', UNREV)
                true_state = infr.edge_truth[edge]
                if true_state == decision and true_state == POSTV:
                    real_pos_edges.append(edge)
                elif decision != UNREV:
                    if true_state != decision:
                        n_error_edges2 += 1
                        if true_state == POSTV:
                            n_fn2 += 1
                        elif true_state == NEGTV:
                            n_fp2 += 1
            assert n_error_edges2 == n_error_edges
            assert n_tp == len(real_pos_edges)
            assert n_fn == n_fn2
            assert n_fp == n_fp2
            # pred_n_pcc_mst_edges2 = sum(
            #     len(cc) - 1 for cc in infr.test_gt_pos_graph.connected_components()
            # )
        if False:
            import networkx as nx
            # set(infr.test_gt_pos_graph.edges()) == set(real_pos_edges)
            pred_n_pcc_mst_edges = 0
            for cc in nx.connected_components(nx.Graph(real_pos_edges)):
                pred_n_pcc_mst_edges += len(cc) - 1
            assert n_true_merges == pred_n_pcc_mst_edges

        # Find all annotations involved in a mistake
        assert n_error_edges == len(infr.mistake_edges)
        direct_mistake_aids = {a for edge in infr.mistake_edges for a in edge}
        mistake_nids = set(infr.node_labels(*direct_mistake_aids))
        mistake_aids = set(ut.flatten([infr.pos_graph.component(nid)
                                       for nid in mistake_nids]))

        pos_acc = pred_n_pcc_mst_edges / infr.real_n_pcc_mst_edges
        metrics = {
            'n_decision': infr.test_state['n_decision'],
            'n_manual': infr.test_state['n_manual'],
            'n_algo': infr.test_state['n_algo'],
            'phase': infr.loop_phase,
            'pos_acc': pos_acc,
            'n_merge_total': infr.real_n_pcc_mst_edges,
            'n_merge_remain': infr.real_n_pcc_mst_edges - n_true_merges,
            'n_true_merges': n_true_merges,
            'recovering': infr.is_recovering(),
            # 'recovering2': infr.test_state['recovering'],
            'merge_remain': 1 - pos_acc,
            'n_mistake_aids': len(mistake_aids),
            'frac_mistake_aids': len(mistake_aids) / len(infr.aids),
            'n_mistake_nids': len(mistake_nids),
            'n_errors': n_error_edges,
            'n_fn': n_fn,
            'n_fp': n_fp,
            'refresh_support': len(infr.refresh.manual_decisions),
            'pprob_any': infr.refresh.prob_any_remain(),
            'mu': infr.refresh._ewma,
            'test_action': infr.test_state['test_action'],
            'action': infr.test_state.get('action', None),
            'user_id': infr.test_state['user_id'],
            'pred_decision': infr.test_state['pred_decision'],
            'true_decision': infr.test_state['true_decision'],
            'n_neg_redun': infr.neg_redun_metagraph.number_of_edges(),
            'n_neg_redun1': (infr.neg_metagraph.number_of_edges() -
                             infr.neg_metagraph.number_of_selfloops()),
        }

        return metrics
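
The n_tp/n_fp/n_fn counts pulled out of the confusion dictionary above are the usual ingredients for precision and recall; a small standalone sketch (not a method of infr):

def precision_recall(n_tp, n_fp, n_fn):
    precision = n_tp / (n_tp + n_fp) if (n_tp + n_fp) else 0.0
    recall = n_tp / (n_tp + n_fn) if (n_tp + n_fn) else 0.0
    return precision, recall

assert precision_recall(8, 2, 4) == (0.8, 8 / 12)
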
Example #10
def exec_interactive_incremental_queries(ibs, qaid_list, back=None):
    assert ut.allsame(ibs.get_annot_species_rowids(qaid_list)), \
        'must all be on the same species'
    self = IncQueryHarness()
    self = self.begin_incremental_query(ibs, qaid_list, back=back)
    def expand(sample, denc_per_name=[1], extra_dbsize_fracs=[0]):
        # Vary the number of database encounters in each sample
        target_daids_list = []
        target_info_list_ = []
        for num in denc_per_name:
            dname_encs_ = ut.take_column(sample.dname_encs, slice(0, num))
            dnames_ = ut.lmap(ut.flatten, dname_encs_)
            daids_ = ut.total_flatten(dname_encs_)
            target_daids_list.append(daids_)
            name_lens = ut.lmap(len, dnames_)
            dpername = name_lens[0] if ut.allsame(name_lens) else np.mean(
                name_lens)
            target_info_list_.append(
                ut.odict([
                    ('qsize', len(sample.qaids)),
                    ('t_n_names', len(dname_encs_)),
                    ('t_dpername', dpername),
                    ('t_denc_pername', num),
                    ('t_dsize', len(daids_)),
                ]))

        # Append confusors to maintain a constant dbsize in each base sample
        dbsize_list = ut.lmap(len, target_daids_list)
        max_dsize = max(dbsize_list)
        n_need = max_dsize - min(dbsize_list)
        n_extra_avail = len(sample.confusor_pool) - n_need
        assert len(sample.confusor_pool) > n_need, 'not enough confusors'
        padded_daids_list = []
        padded_info_list_ = []
        for daids_, info_ in zip(target_daids_list, target_info_list_):
            num_take = max_dsize - len(daids_)
            pad_aids = sample.confusor_pool[:num_take]
            new_aids = daids_ + pad_aids
            info_ = info_.copy()
            info_['n_pad'] = len(pad_aids)
            info_['pad_dsize'] = len(new_aids)
            padded_info_list_.append(info_)
            padded_daids_list.append(new_aids)

        # Vary the dbsize by appending extra confusors
        if extra_dbsize_fracs is None:
            extra_dbsize_fracs = [1.0]
        extra_fracs = np.array(extra_dbsize_fracs)
        n_extra_list = np.unique(extra_fracs * n_extra_avail).astype(int)
        daids_list = []
        info_list = []
        for n in n_extra_list:
            for daids_, info_ in zip(padded_daids_list, padded_info_list_):
                extra_aids = sample.confusor_pool[len(sample.confusor_pool) -
                                                  n:]
                daids = sorted(daids_ + extra_aids)
                daids_list.append(daids)
                info = info_.copy()
                info['n_extra'] = len(extra_aids)
                info['dsize'] = len(daids)
                info_list.append(info)

        import pandas as pd

        verbose = 0
        if verbose:
            logger.info(pd.DataFrame.from_records(info_list))
            logger.info('#qaids = %r' % (len(sample.qaids), ))
            logger.info('num_need = %r' % (n_need, ))
            logger.info('max_dsize = %r' % (max_dsize, ))
        return sample.qaids, daids_list, info_list
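
The padding loop above keeps every database sample at the same size by topping candidates up from a shared confusor pool. Reduced to its core (pad_to_max is a hypothetical name):

def pad_to_max(daids_lists, confusor_pool):
    # pad each list from the pool until all lists match the largest one
    max_dsize = max(map(len, daids_lists))
    return [daids + confusor_pool[:max_dsize - len(daids)]
            for daids in daids_lists]

assert pad_to_max([[1, 2], [3]], confusor_pool=[9, 8]) == [[1, 2], [3, 9]]
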
Example #12
def compute_residual_assignments(depc, fid_list, vocab_id_list, config):
    r"""
    CommandLine:
        python -m ibeis.control.IBEISControl show_depc_annot_table_input \
                --show --tablename=residuals

    Ignore:
        ibs.depc['vocab'].print_table()

    Ignore:
        data = ibs.depc.get('inverted_agg_assign', ([1, 2473], qreq_.daids), config=qreq_.config)
        wxs1 = data[0][0]
        wxs2 = data[1][0]

        # Lev Example
        import ibeis
        ibs = ibeis.opendb('Oxford')
        depc = ibs.depc
        table = depc['inverted_agg_assign']
        table.print_table()
        table.print_internal_info()

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.smk.inverted_index import *  # NOQA
        >>> # Test depcache access
        >>> import ibeis
        >>> ibs, aid_list = ibeis.testdata_aids('testdb1')
        >>> depc = ibs.depc_annot
        >>> config = {'num_words': 1000, 'nAssign': 1}
        >>> #input_tuple = (aid_list, [aid_list] * len(aid_list))
        >>> daids = aid_list
        >>> input_tuple = (daids, [daids])
        >>> rowid_kw = {}
        >>> tablename = 'inverted_agg_assign'
        >>> target_tablename = tablename
        >>> input_ids = depc.get_parent_rowids(tablename, input_tuple, config)
        >>> fid_list = ut.take_column(input_ids, 0)
        >>> vocab_id_list = ut.take_column(input_ids, 1)
        >>> data = depc.get(tablename, input_tuple, config)
        >>> tup = data[1]

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.smk.inverted_index import *  # NOQA
        >>> import ibeis
        >>> qreq_ = ibeis.testdata_qreq_(defaultdb='Oxford', a='oxford', p='default:proot=smk,nAssign=1,num_words=64000')
        >>> config = {'num_words': 64000, 'nAssign': 1, 'int_rvec': True}
        >>> depc = qreq_.ibs.depc
        >>> daids = qreq_.daids
        >>> input_tuple = (daids, [daids])
        >>> rowid_kw = {}
        >>> tablename = 'inverted_agg_assign'
        >>> target_tablename = tablename
        >>> input_ids = depc.get_parent_rowids(tablename, input_tuple, config)
        >>> fid_list = ut.take_column(input_ids, 0)
        >>> vocab_id_list = ut.take_column(input_ids, 1)
    """
    #print('[IBEIS] ASSIGN RESIDUALS:')
    assert ut.allsame(vocab_id_list)
    vocabid = vocab_id_list[0]

    # NEED HACK TO NOT LOAD INDEXER EVERY TIME
    this_table = depc['inverted_agg_assign']
    vocab_table = depc['vocab']
    if this_table._hack_chunk_cache is not None and vocabid in this_table._hack_chunk_cache:
        vocab = this_table._hack_chunk_cache[vocabid]
    else:
        vocab = vocab_table.get_row_data([vocabid], 'words')[0]
        if this_table._hack_chunk_cache is not None:
            this_table._hack_chunk_cache[vocabid] = vocab

    print('Grab Vecs')
    vecs_list = depc.get_native('feat', fid_list, 'vecs')
    nAssign = config['nAssign']
    int_rvec = config['int_rvec']

    from concurrent import futures
    print('Building residual args')
    worker = residual_worker
    args_gen = gen_residual_args(vocab, vecs_list, nAssign, int_rvec)
    args_gen = [
        args for args in ut.ProgIter(
            args_gen, length=len(vecs_list), lbl='building args')
    ]
    # nprocs = ut.num_unused_cpus(thresh=10) - 1
    nprocs = ut.num_cpus()
    print('Creating %d processes' % (nprocs, ))
    executor = futures.ProcessPoolExecutor(nprocs)
    try:
        print('Submitting workers')
        fs_chunk = [
            executor.submit(worker, args)
            for args in ut.ProgIter(args_gen, lbl='submit proc')
        ]
        for fs in ut.ProgIter(fs_chunk, lbl='getting phi result'):
            tup = fs.result()
            yield tup
    finally:
        executor.shutdown(wait=True)
Example #13

    def are_nodes_connected(self, u, v):
        return ut.allsame(self.node_labels(u, v))
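
Every example in this collection leans on ut.allsame; a minimal equivalent, assuming only that the items support ==, is:

def allsame(items):
    # True for empty and single-item inputs, like a vacuous truth
    items = list(items)
    return all(x == items[0] for x in items[1:])

assert allsame([3, 3, 3]) and allsame([]) and not allsame([1, 2])
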
Example #14
def understanding_pseudomax_props(mode=2):
    """
    Function showing some properties of distances between normalized pseudomax vectors

    CommandLine:
        python -m vtool.distance --test-understanding_pseudomax_props

    Example:
        >>> # ENABLE_DOCTEST
        >>> from vtool.distance import *  # NOQA
        >>> for mode in [0, 1, 2, 3]:
        ...     print('+---')
        ...     print('mode = %r' % (mode,))
        ...     result = understanding_pseudomax_props(mode)
        ...     print('L___')
        >>> print(result)
    """
    import vtool as vt
    pseudo_max = 512
    rng = np.random.RandomState(0)
    num = 10
    if mode == 0:
        dim = 2
        p1_01 = (vt.normalize_rows(rng.rand(num, dim)))
        p2_01 = (vt.normalize_rows(rng.rand(num, dim)))
    elif mode == 1:
        p1_01 = vt.dummy.testdata_dummy_sift(num, rng) / pseudo_max
        p2_01 = vt.dummy.testdata_dummy_sift(num, rng) / pseudo_max
    elif mode == 2:
        # Build theoretically maximally distant normalized vectors (type 1)
        dim = 128
        p1_01 = np.zeros((1, dim))
        p2_01 = np.zeros((1, dim))
        p2_01[:, 0::2] = 1
        p1_01[:, 1::2] = 1
        p1_01 = vt.normalize_rows(p1_01)
        p2_01 = vt.normalize_rows(p2_01)
    elif mode == 3:
        # Build theoretically maximally distant vectors (type 2)
        # This mode will clip if cast to uint8, thus failing the test
        dim = 128
        p1_01 = np.zeros((1, dim))
        p2_01 = np.zeros((1, dim))
        p2_01[:, 0] = 1
        p1_01[:, 1:] = 1
        p1_01 = vt.normalize_rows(p1_01)
        p2_01 = vt.normalize_rows(p2_01)
        pass
    print('ndims = %r' % (p1_01.shape[1],))

    p1_01 = p1_01.astype(TEMP_VEC_DTYPE)
    p2_01 = p2_01.astype(TEMP_VEC_DTYPE)

    p1_256 = p1_01 * pseudo_max
    p2_256 = p2_01 * pseudo_max

    dist_sqrd_01 = vt.L2_sqrd(p1_01, p2_01)
    dist_sqrd_256 = vt.L2_sqrd(p1_256, p2_256)

    dist_01 = np.sqrt(dist_sqrd_01)
    dist_256 = np.sqrt(dist_sqrd_256)

    print('dist_sqrd_01  = %s' % (ut.numpy_str(dist_sqrd_01, precision=2), ))
    print('dist_sqrd_256 = %s' % (ut.numpy_str(dist_sqrd_256, precision=2), ))
    print('dist_01       = %s' % (ut.numpy_str(dist_01, precision=2), ))
    print('dist_256      = %s' % (ut.numpy_str(dist_256, precision=2), ))

    print('--')
    print('sqrt(2)       = %f' % (np.sqrt(2)))
    print('--')

    assert np.all(dist_01 == vt.L2(p1_01, p2_01))
    assert np.all(dist_256 == vt.L2(p1_256, p2_256))

    const_sqrd = dist_sqrd_256 / dist_sqrd_01
    const = dist_256 / dist_01

    print('const = %r' % (const[0], ))
    print('const_sqrd = %r' % (const_sqrd[0], ))
    print('1 / const = %r' % (1 / const[0], ))
    print('1 / const_sqrd = %r' % (1 / const_sqrd[0], ))

    assert ut.allsame(const)
    assert ut.allsame(const_sqrd)

    assert np.all(const == np.sqrt(const_sqrd))

    # Assert that distance conversions work
    assert np.all(dist_256 / const == dist_01)
    assert np.all(dist_sqrd_256 / const_sqrd == dist_sqrd_01)
    print('Conversions work')

    print('Maximal L2 distance between any two NON-NEGATIVE L2-NORMALIZED'
          ' vectors should always be sqrt(2)')
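
A quick numpy check of that closing claim: two non-negative unit vectors with disjoint support are orthogonal, so their L2 distance is exactly sqrt(2):

import numpy as np
u = np.array([1.0, 0.0])
v = np.array([0.0, 1.0])
assert np.isclose(np.linalg.norm(u - v), np.sqrt(2))
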
Example #16
    def new_cpd(self, parents=None, pmf_func=None):
        """
        Makes a new random variable that is an instance of this template

        parents : only used to define the name of this node.
        """
        if pmf_func is None:
            pmf_func = self.pmf_func

        # --- MAKE VARIABLE ID
        def _getid(obj):
            if isinstance(obj, int):
                return str(obj)
            elif isinstance(obj, six.string_types):
                return obj
            else:
                return obj._template_id

        if not ut.isiterable(parents):
            parents = [parents]

        template_ids = [_getid(cpd) for cpd in parents]
        HACK_SAME_IDS = True
        # TODO: keep track of parent index inheritance
        # then rectify uniqueness based on that
        if HACK_SAME_IDS and ut.allsame(template_ids):
            _id = template_ids[0]
        else:
            _id = ''.join(template_ids)
        variable = ''.join([self.varpref, _id])
        # variable = '_'.join([self.varpref, '{' + _id + '}'])
        # variable = '$%s$' % (variable,)

        evidence_cpds = [cpd for cpd in parents if hasattr(cpd, 'ttype')]
        if len(evidence_cpds) == 0:
            evidence_cpds = None

        variable_card = len(self.basis)
        statename_dict = {
            variable: self.basis,
        }
        if self.evidence_ttypes is not None:
            # compare each evidence cpd against its template counterpart
            # (the original zipped evidence_cpds with itself, a vacuous check)
            if any(cpd.ttype != tcpd.ttype
                   for cpd, tcpd in zip(evidence_cpds, self.evidence_ttypes)):
                raise ValueError('Evidence is not of appropriate type')
            evidence_bases = [cpd.variable_statenames for cpd in evidence_cpds]
            evidence_card = list(map(len, evidence_bases))
            evidence_states = list(ut.iprod(*evidence_bases))

            for cpd in evidence_cpds:
                _dict = ut.dict_subset(cpd.statename_dict, [cpd.variable])
                statename_dict.update(_dict)

            evidence = [cpd.variable for cpd in evidence_cpds]
        else:
            if evidence_cpds is not None:
                raise ValueError('Gave evidence for evidence-less template')
            evidence = None
            evidence_card = None

        # --- MAKE TABLE VALUES
        if pmf_func is not None:
            if isinstance(pmf_func, list):
                values = np.array(pmf_func)
            else:
                values = np.array([[
                    pmf_func(vstate, *estates) for estates in evidence_states
                ] for vstate in self.basis])
            ensure_normalized = True
            if ensure_normalized:
                values = values / values.sum(axis=0)
        else:
            # assume uniform
            fill_value = 1.0 / variable_card
            if evidence_card is None:
                values = np.full((1, variable_card), fill_value)
            else:
                values = np.full([variable_card] + list(evidence_card),
                                 fill_value)

        try:
            cpd = pgmpy.factors.TabularCPD(
                variable=variable,
                variable_card=variable_card,
                values=values,
                evidence=evidence,
                evidence_card=evidence_card,
                # statename_dict=statename_dict,
                state_names=statename_dict,
            )
        except Exception as ex:
            ut.printex(
                ex,
                'Failed to create TabularCPD',
                keys=[
                    'variable',
                    'variable_card',
                    'statename_dict',
                    'evidence_card',
                    'evidence',
                    'values.shape',
                ],
            )
            ut.embed()
            raise

        cpd.ttype = self.ttype
        cpd._template_ = self
        cpd._template_id = _id
        return cpd
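
The ensure_normalized step above divides the table by its column sums so that, for every evidence state, the distribution over the variable's states sums to one. The numpy broadcasting it relies on, in isolation:

import numpy as np
values = np.array([[2.0, 1.0],
                   [2.0, 3.0]])
values = values / values.sum(axis=0)  # normalize each column
assert np.allclose(values.sum(axis=0), 1.0)
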
Example #18
def demo2():
    """
    CommandLine:
        python -m wbia.algo.graph.demo demo2 --viz
        python -m wbia.algo.graph.demo demo2

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.graph.demo import *  # NOQA
        >>> result = demo2()
        >>> print(result)
    """
    import wbia.plottool as pt

    from wbia.scripts.thesis import TMP_RC
    import matplotlib as mpl

    mpl.rcParams.update(TMP_RC)

    # ---- Synthetic data params
    params = {
        'redun.pos': 2,
        'redun.neg': 2,
    }
    # oracle_accuracy = .98
    # oracle_accuracy = .90
    # oracle_accuracy = (.8, 1.0)
    oracle_accuracy = (0.85, 1.0)
    # oracle_accuracy = 1.0

    # --- draw params

    VISUALIZE = ut.get_argflag('--viz')
    # QUIT_OR_EMBED = 'embed'
    QUIT_OR_EMBED = 'quit'
    TARGET_REVIEW = ut.get_argval('--target', type_=int, default=None)
    START = ut.get_argval('--start', type_=int, default=None)
    END = ut.get_argval('--end', type_=int, default=None)

    # ------------------

    # rng = np.random.RandomState(42)

    # infr = demodata_infr(num_pccs=4, size=3, size_std=1, p_incon=0)
    # infr = demodata_infr(num_pccs=6, size=7, size_std=1, p_incon=0)
    # infr = demodata_infr(num_pccs=3, size=5, size_std=.2, p_incon=0)
    infr = demodata_infr(pcc_sizes=[5, 2, 4])
    infr.verbose = 100
    # apply_dummy_viewpoints(infr)
    # infr.ensure_cliques()
    infr.ensure_cliques()
    infr.ensure_full()
    # infr.apply_edge_truth()
    # Dummy scoring

    infr.init_simulation(oracle_accuracy=oracle_accuracy, name='demo2')

    # infr_gt = infr.copy()

    dpath = ut.ensuredir(ut.truepath('~/Desktop/demo'))
    ut.remove_files_in_dir(dpath)

    fig_counter = it.count(0)

    def show_graph(infr, title, final=False, selected_edges=None):
        if not VISUALIZE:
            return
        # TODO: rich colored text?
        latest = '\n'.join(infr.latest_logs())
        showkw = dict(
            # fontsize=infr.graph.graph['fontsize'],
            # fontname=infr.graph.graph['fontname'],
            show_unreviewed_edges=True,
            show_inferred_same=False,
            show_inferred_diff=False,
            outof=(len(infr.aids)),
            # show_inferred_same=True,
            # show_inferred_diff=True,
            selected_edges=selected_edges,
            show_labels=True,
            simple_labels=True,
            # show_recent_review=not final,
            show_recent_review=False,
            # splines=infr.graph.graph['splines'],
            reposition=False,
            # with_colorbar=True
        )
        verbose = infr.verbose
        infr.verbose = 0
        # an infr.copy() here was immediately discarded; draw on the live infr
        infr_ = infr
        infr_.verbose = verbose
        infr_.show(pickable=True, verbose=0, **showkw)
        infr.verbose = verbose
        # logger.info('status ' + ut.repr4(infr_.status()))
        # infr.show(**showkw)
        ax = pt.gca()
        pt.set_title(title, fontsize=20)
        fig = pt.gcf()
        fontsize = 22
        if True:
            # postprocess xlabel
            lines = []
            for line in latest.split('\n'):
                if False and line.startswith('ORACLE ERROR'):
                    lines += ['ORACLE ERROR']
                else:
                    lines += [line]
            latest = '\n'.join(lines)
            if len(lines) > 10:
                fontsize = 16
            if len(lines) > 12:
                fontsize = 14
            if len(lines) > 14:
                fontsize = 12
            if len(lines) > 18:
                fontsize = 10

            if len(lines) > 23:
                fontsize = 8

        if True:
            pt.adjust_subplots(top=0.95, left=0, right=1, bottom=0.45, fig=fig)
            ax.set_xlabel('\n' + latest)
            xlabel = ax.get_xaxis().get_label()
            xlabel.set_horizontalalignment('left')
            # xlabel.set_x(.025)
            xlabel.set_x(-0.6)
            # xlabel.set_fontname('CMU Typewriter Text')
            xlabel.set_fontname('Inconsolata')
            xlabel.set_fontsize(fontsize)
        ax.set_aspect('equal')

        # ax.xaxis.label.set_color('red')

        from os.path import join

        fpath = join(dpath, 'demo_{:04d}.png'.format(next(fig_counter)))
        fig.savefig(
            fpath,
            dpi=300,
            # transparent=True,
            edgecolor='none',
        )

        # pt.save_figure(dpath=dpath, dpi=300)
        infr.latest_logs()

    if VISUALIZE:
        infr.update_visual_attrs(groupby='name_label')
        infr.set_node_attrs('pin', 'true')
        node_dict = ut.nx_node_dict(infr.graph)
        logger.info(ut.repr4(node_dict[1]))

    if VISUALIZE:
        infr.latest_logs()
        # Pin Nodes into the target groundtruth position
        show_graph(infr, 'target-gt')

    logger.info(ut.repr4(infr.status()))
    infr.clear_feedback()
    infr.clear_name_labels()
    infr.clear_edges()
    logger.info(ut.repr4(infr.status()))
    infr.latest_logs()

    if VISUALIZE:
        infr.update_visual_attrs()

    infr.prioritize('prob_match')
    if VISUALIZE or TARGET_REVIEW is None or TARGET_REVIEW == 0:
        show_graph(infr, 'initial state')

    def on_new_candidate_edges(infr, edges):
        # hack updateing visual attrs as a callback
        infr.update_visual_attrs()

    infr.on_new_candidate_edges = on_new_candidate_edges

    infr.params.update(**params)
    infr.refresh_candidate_edges()

    VIZ_ALL = VISUALIZE and TARGET_REVIEW is None and START is None
    logger.info('VIZ_ALL = %r' % (VIZ_ALL, ))

    if VIZ_ALL or TARGET_REVIEW == 0:
        show_graph(infr, 'find-candidates')

    # _iter2 = enumerate(infr.generate_reviews(**params))
    # _iter2 = list(_iter2)
    # assert len(_iter2) > 0

    # prog = ut.ProgIter(_iter2, label='demo2', bs=False, adjust=False,
    #                    enabled=False)
    count = 1
    first = 1
    for edge, priority in infr._generate_reviews(data=True):
        msg = 'review #%d, priority=%.3f' % (count, priority)
        logger.info('\n----------')
        infr.print('pop edge {} with priority={:.3f}'.format(edge, priority))
        # logger.info('remaining_reviews = %r' % (infr.remaining_reviews()),)
        # Make the next review

        if START is not None:
            VIZ_ALL = count >= START

        if END is not None and count >= END:
            break

        infr.print(msg)
        if ut.allsame(infr.pos_graph.node_labels(*edge)) and first:
            # Have oracle make a mistake early
            feedback = infr.request_oracle_review(edge, accuracy=0)
            first -= 1
        else:
            feedback = infr.request_oracle_review(edge)

        AT_TARGET = TARGET_REVIEW is not None and count >= TARGET_REVIEW - 1

        SHOW_CANDIDATE_POP = True
        if SHOW_CANDIDATE_POP and (VIZ_ALL or AT_TARGET):
            # import utool
            # utool.embed()
            infr.print(
                ut.repr2(infr.task_probs['match_state'][edge],
                         precision=4,
                         si=True))
            infr.print('len(queue) = %r' % (len(infr.queue)))
            # Show edge selection
            infr.print('Oracle will predict: ' + feedback['evidence_decision'])
            show_graph(infr, 'pre' + msg, selected_edges=[edge])

        if count == TARGET_REVIEW:
            infr.EMBEDME = QUIT_OR_EMBED == 'embed'
        infr.add_feedback(edge, **feedback)
        infr.print('len(queue) = %r' % (len(infr.queue)))
        # infr.apply_nondynamic_update()
        # Show the result
        if VIZ_ALL or AT_TARGET:
            show_graph(infr, msg)
            # import sys
            # sys.exit(1)
        if count == TARGET_REVIEW:
            break
        count += 1

    infr.print('status = ' + ut.repr4(infr.status(extended=False)))
    show_graph(infr, 'post-review (#reviews={})'.format(count), final=True)

    # ROUND 2 FIGHT
    # if TARGET_REVIEW is None and round2_params is not None:
    #     # HACK TO GET NEW THINGS IN QUEUE
    #     infr.params = round2_params

    #     _iter2 = enumerate(infr.generate_reviews(**params))
    #     prog = ut.ProgIter(_iter2, label='round2', bs=False, adjust=False,
    #                        enabled=False)
    #     for count, (aid1, aid2) in prog:
    #         msg = 'reviewII #%d' % (count)
    #         logger.info('\n----------')
    #         logger.info(msg)
    #         logger.info('remaining_reviews = %r' % (infr.remaining_reviews()),)
    #         # Make the next review evidence_decision
    #         feedback = infr.request_oracle_review(edge)
    #         if count == TARGET_REVIEW:
    #             infr.EMBEDME = QUIT_OR_EMBED == 'embed'
    #         infr.add_feedback(edge, **feedback)
    #         # Show the result
    #         if PRESHOW or TARGET_REVIEW is None or count >= TARGET_REVIEW - 1:
    #             show_graph(infr, msg)
    #         if count == TARGET_REVIEW:
    #             break

    #     show_graph(infr, 'post-re-review', final=True)

    if not getattr(infr, 'EMBEDME', False):
        if ut.get_computer_name().lower() in ['hyrule', 'ooo']:
            pt.all_figures_tile(monitor_num=0, percent_w=0.5)
        else:
            pt.all_figures_tile()
        ut.show_if_requested()
Example #19
def convert_hsdb_to_ibeis(hsdir, dbdir=None, **kwargs):
    r"""
    Args:
        hsdir (str): Directory to folder *containing* _hsdb
        dbdir (str): Output directory (defaults to same as hsdb)

    CommandLine:
        python -m ibeis convert_hsdb_to_ibeis --dbdir ~/work/Frogs
        python -m ibeis convert_hsdb_to_ibeis --hsdir "/raid/raw/RotanTurtles/Roatan HotSpotter Nov_21_2016"

    Ignore:
        from ibeis.dbio.ingest_hsdb import *  # NOQA
        hsdir = "/raid/raw/RotanTurtles/Roatan HotSpotter Nov_21_2016"
        dbdir = "~/work/RotanTurtles"

    Example:
        >>> # SCRIPT
        >>> from ibeis.dbio.ingest_hsdb import *  # NOQA
        >>> dbdir = ut.get_argval('--dbdir', type_=str, default=None)
        >>> hsdir = ut.get_argval('--hsdir', type_=str, default=dbdir)
        >>> result = convert_hsdb_to_ibeis(hsdir)
        >>> print(result)
    """
    from ibeis.control import IBEISControl
    import utool as ut

    if dbdir is None:
        dbdir = hsdir
    print('[ingest] Ingesting hsdb: %r -> %r' % (hsdir, dbdir))

    assert is_hsdb(
        hsdir
    ), 'not a hotspotter database. cannot even force convert: hsdir=%r' % (
        hsdir, )
    assert not is_succesful_convert(dbdir), 'dbdir=%r is already converted' % (
        dbdir, )
    #print('FORCE DELETE: %r' % (hsdir,))
    #ibsfuncs.delete_ibeis_database(hsdir)
    imgdir = join(hsdir, 'images')

    internal_dir = get_hsinternal(hsdir)
    nametbl_fpath = join(internal_dir, 'name_table.csv')
    imgtbl_fpath = join(internal_dir, 'image_table.csv')
    chiptbl_fpath = join(internal_dir, 'chip_table.csv')

    # READ NAME TABLE
    name_text_list = ['____']
    name_hs_nid_list = [0]
    with open(nametbl_fpath, 'r') as nametbl_file:
        name_reader = csv.reader(nametbl_file)
        for ix, row in enumerate(name_reader):
            #if ix >= 3:
            if len(row) == 0 or row[0].strip().startswith('#'):
                continue
            else:
                hs_nid = int(row[0])
                name = row[1].strip()
                name_text_list.append(name)
                name_hs_nid_list.append(hs_nid)

    # READ IMAGE TABLE
    image_hs_gid_list = []
    image_gname_list = []
    image_reviewed_list = []
    with open(imgtbl_fpath, 'r') as imgtb_file:
        image_reader = csv.reader(imgtb_file)
        for ix, row in enumerate(image_reader):
            if len(row) == 0 or row[0].strip().startswith('#'):
                continue
            else:
                hs_gid = int(row[0])
                gname_ = row[1].strip()
                # aif in hotspotter is equivalent to reviewed in IBEIS
                reviewed = bool(row[2])
                image_hs_gid_list.append(hs_gid)
                image_gname_list.append(gname_)
                image_reviewed_list.append(reviewed)

    image_gpath_list = [join(imgdir, gname) for gname in image_gname_list]

    ut.debug_duplicate_items(image_gpath_list)
    #print(image_gpath_list)
    image_exist_flags = list(map(exists, image_gpath_list))
    missing_images = []
    for image_gpath, flag in zip(image_gpath_list, image_exist_flags):
        if not flag:
            missing_images.append(image_gpath)
            print('Image does not exist: %s' % image_gpath)

    if not all(image_exist_flags):
        print('Only %d / %d images exist' %
              (sum(image_exist_flags), len(image_exist_flags)))

    SEARCH_FOR_IMAGES = False
    if SEARCH_FOR_IMAGES:
        # Hack to try and find the missing images
        from os.path import basename
        subfiles = ut.glob(hsdir,
                           '*',
                           recursive=True,
                           fullpath=True,
                           with_files=True)
        basename_to_existing = ut.group_items(subfiles,
                                              ut.lmap(basename, subfiles))

        can_copy_list = []
        for gpath in missing_images:
            gname = basename(gpath)
            if gname not in basename_to_existing:
                print('gname = %r' % (gname, ))
                pass
            else:
                existing = basename_to_existing[gname]
                can_choose = True
                if len(existing) > 1:
                    if not ut.allsame(ut.lmap(ut.get_file_uuid, existing)):
                        can_choose = False
                if can_choose:
                    found = existing[0]
                    can_copy_list.append((found, gpath))
                else:
                    print(existing)

        src, dst = ut.listT(can_copy_list)
        ut.copy_list(src, dst)

    # READ CHIP TABLE
    chip_bbox_list = []
    chip_theta_list = []
    chip_hs_nid_list = []
    chip_hs_gid_list = []
    chip_note_list = []
    with open(chiptbl_fpath, 'r') as chiptbl_file:
        chip_reader = csv.reader(chiptbl_file)
        for ix, row in enumerate(chip_reader):
            if len(row) == 0 or row[0].strip().startswith('#'):
                continue
            else:
                hs_gid = int(row[1])
                hs_nid = int(row[2])
                bbox_text = row[3]
                theta = float(row[4])
                notes = '<COMMA>'.join([item.strip() for item in row[5:]])

                bbox_text = bbox_text.replace('[', '').replace(']', '').strip()
                bbox_text = re.sub('  *', ' ', bbox_text)
                bbox_strlist = bbox_text.split(' ')
                bbox = tuple(map(int, bbox_strlist))
                #bbox = [int(item) for item in bbox_strlist]
                chip_hs_nid_list.append(hs_nid)
                chip_hs_gid_list.append(hs_gid)
                chip_bbox_list.append(bbox)
                chip_theta_list.append(theta)
                chip_note_list.append(notes)

    names = ut.ColumnLists({
        'hs_nid': name_hs_nid_list,
        'text': name_text_list,
    })

    images = ut.ColumnLists({
        'hs_gid': image_hs_gid_list,
        'gpath': image_gpath_list,
        'reviewed': image_reviewed_list,
        'exists': image_exist_flags,
    })

    chips = ut.ColumnLists({
        'hs_gid': chip_hs_gid_list,
        'hs_nid': chip_hs_nid_list,
        'bbox': chip_bbox_list,
        'theta': chip_theta_list,
        'note': chip_note_list,
    })

    IGNORE_MISSING_IMAGES = True
    if IGNORE_MISSING_IMAGES:
        # Ignore missing information
        print('pre')
        print('chips = %r' % (chips, ))
        print('images = %r' % (images, ))
        print('names = %r' % (names, ))
        missing_gxs = ut.where(ut.not_list(images['exists']))
        missing_gids = ut.take(images['hs_gid'], missing_gxs)
        gid_to_cxs = ut.dzip(*chips.group_indicies('hs_gid'))
        missing_cxs = ut.flatten(ut.take(gid_to_cxs, missing_gids))
        # Remove missing images and dependant chips
        images = images.remove(missing_gxs)
        chips = chips.remove(missing_cxs)
        valid_nids = set(chips['hs_nid'] + [0])
        isvalid = [nid in valid_nids for nid in names['hs_nid']]
        names = names.compress(isvalid)
        print('post')
        print('chips = %r' % (chips, ))
        print('images = %r' % (images, ))
        print('names = %r' % (names, ))

    assert all(images['exists']), "some images don't exist"

    # if gid is None:
    #     print('Not adding the ix=%r-th Chip. Its image is corrupted image.' % (ix,))
    #     # continue
    # # Build mappings to new indexes
    # names_nid_to_nid  = {names_nid: nid for (names_nid, nid) in zip(hs_nid_list, nid_list)}
    # names_nid_to_nid[1] = names_nid_to_nid[0]  # hsdb unknown is 0 or 1
    # images_gid_to_gid = {images_gid: gid for (images_gid, gid) in zip(hs_gid_list, gid_list)}

    ibs = IBEISControl.request_IBEISController(dbdir=dbdir,
                                               check_hsdb=False,
                                               **kwargs)
    assert len(ibs.get_valid_gids()) == 0, 'target database is not empty'

    # Add names, images, and annotations
    names['ibs_nid'] = ibs.add_names(names['text'])
    images['ibs_gid'] = ibs.add_images(
        images['gpath'])  # any failed gids will be None

    if True:
        # Remove corrupted images
        print('pre')
        print('chips = %r' % (chips, ))
        print('images = %r' % (images, ))
        print('names = %r' % (names, ))
        missing_gxs = ut.where(ut.flag_None_items(images['ibs_gid']))
        missing_gids = ut.take(images['hs_gid'], missing_gxs)
        gid_to_cxs = ut.dzip(*chips.group_indicies('hs_gid'))
        missing_cxs = ut.flatten(ut.take(gid_to_cxs, missing_gids))
        # Remove missing images and dependant chips
        chips = chips.remove(missing_cxs)
        images = images.remove(missing_gxs)
        print('post')
        print('chips = %r' % (chips, ))
        print('images = %r' % (images, ))
        print('names = %r' % (names, ))

    # Index chips using new ibs rowids
    ibs_gid_lookup = ut.dzip(images['hs_gid'], images['ibs_gid'])
    ibs_nid_lookup = ut.dzip(names['hs_nid'], names['ibs_nid'])
    try:
        chips['ibs_gid'] = ut.take(ibs_gid_lookup, chips['hs_gid'])
    except KeyError:
        chips['ibs_gid'] = [
            ibs_gid_lookup.get(index, None) for index in chips['hs_gid']
        ]
    try:
        chips['ibs_nid'] = ut.take(ibs_nid_lookup, chips['hs_nid'])
    except KeyError:
        chips['ibs_nid'] = [
            ibs_nid_lookup.get(index, None) for index in chips['hs_nid']
        ]

    ibs.add_annots(chips['ibs_gid'],
                   bbox_list=chips['bbox'],
                   theta_list=chips['theta'],
                   nid_list=chips['ibs_nid'],
                   notes_list=chips['note'])

    # aid_list = ibs.get_valid_aids()
    # flag_list = [True] * len(aid_list)
    # ibs.set_annot_exemplar_flags(aid_list, flag_list)
    # assert(all(ibs.get_annot_exemplar_flags(aid_list))), 'exemplars not set correctly'

    # Write file flagging successful conversion
    with open(join(ibs.get_ibsdir(), SUCCESS_FLAG_FNAME), 'w') as file_:
        file_.write('Successfully converted hsdir=%r' % (hsdir, ))
    print('finished ingest')
    return ibs
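
The chip-table loop above recovers bounding boxes from HotSpotter's stringified format. That parsing step, isolated into a standalone helper (parse_bbox is a hypothetical name):

import re

def parse_bbox(bbox_text):
    # '[10  20 300 400]' -> (10, 20, 300, 400)
    bbox_text = bbox_text.replace('[', '').replace(']', '').strip()
    bbox_text = re.sub('  *', ' ', bbox_text)
    return tuple(map(int, bbox_text.split(' ')))

assert parse_bbox('[10  20 300 400]') == (10, 20, 300, 400)
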