Example #1
    def group_pfiles(cls, pfiles, step_idx=None):
        """
        Creates groups of pfiles that *might* be the same.

        Example:
            >>> fpaths = _demodata_files()
            >>> pfiles = [ProgressiveFile(f) for f in fpaths]
            >>> groups1 = ProgressiveFile.group_pfiles(pfiles)
            >>> for pfile in pfiles:
            >>>     pfile.refine()
            >>> groups2 = ProgressiveFile.group_pfiles(pfiles)
            >>> for pfile in pfiles[0::2]:
            >>>     pfile.refine()
            >>> groups3 = ProgressiveFile.group_pfiles(pfiles)
            >>> for pfile in pfiles[1::2]:
            >>>     pfile.refine()
            >>> groups4 = ProgressiveFile.group_pfiles(pfiles)
        """
        if step_idx is not None:
            # We are given the step idx to use, so do that
            final_groups = ub.group_items(pfiles,
                                          key=lambda x: x.step_id(step_idx))
        else:
            # Otherwise do something reasonable
            size_groups = ub.group_items(pfiles, key=lambda x: x.size)
            final_groups = ub.ddict(list)
            for group in size_groups.values():
                # we have to use the minimum refine step available
                # for any unfinished pfile to ensure consistency
                step_idx = ProgressiveFile.compatible_step_idx(group)
                step_groups = ub.group_items(group,
                                             key=lambda x: x.step_id(step_idx))
                for key, val in step_groups.items():
                    final_groups[key].extend(val)
        return final_groups
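
The same two-stage pattern can be sketched on plain strings so it runs without ProgressiveFile or _demodata_files: group by a cheap coarse key first, then regroup each bucket with a finer callable key and merge the buckets into one dictionary (a minimal, hypothetical analogue of size and step_id).

import ubelt as ub

words = ['apple', 'apply', 'angle', 'bat', 'bad', 'cat']

# coarse grouping first (analogous to grouping pfiles by size)
size_groups = ub.group_items(words, key=lambda w: len(w))

# then a finer grouping within each coarse bucket (analogous to step_id)
final_groups = ub.ddict(list)
for group in size_groups.values():
    step_groups = ub.group_items(group, key=lambda w: w[:2])
    for key, val in step_groups.items():
        final_groups[key].extend(val)

print(dict(final_groups))
# keys 'ap', 'an', 'ba', 'ca' map to ['apple', 'apply'], ['angle'],
# ['bat', 'bad'], and ['cat'] respectively (key order may vary)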
Example #2
    def init_test_mode(infr):
        from graphid.core import nx_dynamic_graph
        infr.print('init_test_mode')
        infr.test_mode = True
        # infr.edge_truth = {}
        infr.metrics_list = []
        infr.test_state = {
            'n_decision': 0,
            'n_algo': 0,
            'n_manual': 0,
            'n_true_merges': 0,
            'n_error_edges': 0,
            'confusion': None,
        }
        infr.test_gt_pos_graph = nx_dynamic_graph.DynConnGraph()
        infr.test_gt_pos_graph.add_nodes_from(infr.aids)
        infr.nid_to_gt_cc = ub.group_items(infr.aids, infr.orig_name_labels)
        infr.node_truth = ub.dzip(infr.aids, infr.orig_name_labels)

        # infr.real_n_pcc_mst_edges = sum(
        #     len(cc) - 1 for cc in infr.nid_to_gt_cc.values())
        # util.cprint('real_n_pcc_mst_edges = %r' % (
        #     infr.real_n_pcc_mst_edges,), 'red')

        infr.metrics_list = []
        infr.nid_to_gt_cc = ub.group_items(infr.aids, infr.orig_name_labels)
        infr.real_n_pcc_mst_edges = sum(
            len(cc) - 1 for cc in infr.nid_to_gt_cc.values())
        infr.print('real_n_pcc_mst_edges = %r' % (infr.real_n_pcc_mst_edges, ),
                   color='red')
Example #3
def test_group_items_sorted():
    pairs = [
        ('ham', 'protein'),
        ('jam', 'fruit'),
        ('spam', 'protein'),
        ('eggs', 'protein'),
        ('cheese', 'dairy'),
        ('banana', 'fruit'),
    ]
    item_list, groupid_list = zip(*pairs)
    result1 = ub.group_items(item_list, groupid_list, sorted_=False)
    result2 = ub.group_items(item_list, groupid_list, sorted_=True)
    result1 = ub.map_vals(set, result1)
    result2 = ub.map_vals(set, result2)
    assert result1 == result2
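
For reference, a minimal sketch of the mapping either call above should produce (ignoring ordering, just as the test does):

import ubelt as ub

items = ['ham', 'jam', 'spam', 'eggs', 'cheese', 'banana']
groupids = ['protein', 'fruit', 'protein', 'protein', 'dairy', 'fruit']
grouped = ub.map_vals(set, ub.group_items(items, groupids))
assert grouped == {'protein': {'ham', 'spam', 'eggs'},
                   'fruit': {'jam', 'banana'},
                   'dairy': {'cheese'}}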
Example #4
    def fix_conference_places(bibman):

        pubman = constants_tex_fixes.PubManager()

        needed = set()

        for entry in bibman.cleaned.values():
            if entry['pub_type'] == 'conference':
                accro, year = (entry['pub_accro'], entry['year'])
                pub = pubman.find(accro)
                if pub.places is None or int(year) not in pub.places:
                    needed.add((accro, year))
                else:
                    place = pub.places[int(year)]
                    print('place = {!r}'.format(place))
                    entry['address'] = place

        if needed:
            needed = list(needed)
            used_years = ub.group_items(needed, ut.take_column(needed, 0))
            for k, v in list(used_years.items()):
                used_years[k] = sorted(v)

            sortby = ub.map_vals(lambda vs: (len(vs), max(e[1] for e in vs)),
                                 used_years)
            used_years = ut.order_dict_by(used_years, ub.argsort(sortby))
            print('NEED CONFERENCE LOCATIONS')
            print(ub.repr2(used_years, nl=2))
Example #5
 def _print_previous_loop_statistics(infr, count):
     # Print stats about what happened in this loop
     history = infr.metrics_list[-count:]
     # group consecutive run lengths of the 'recovering' flag by its value
     runlengths = [
         (flag, sum(1 for _ in grp))
         for flag, grp in it.groupby(util.take_column(history, 'recovering'))
     ]
     recover_blocks = ub.group_items(
         [n for _, n in runlengths],
         [flag for flag, _ in runlengths]).get(True, [])
     infr.print(
         ('Recovery mode entered {} times, '
          'made {} recovery decisions.').format(len(recover_blocks),
                                                sum(recover_blocks)),
         color='green')
     testaction_hist = ub.dict_hist(util.take_column(
         history, 'test_action'))
     infr.print('Test Action Histogram: {}'.format(
         ub.repr2(testaction_hist, si=True)),
                color='yellow')
     if infr.params['inference.enabled']:
         action_hist = ub.dict_hist(
             util.emap(frozenset, util.take_column(history, 'action')))
         infr.print('Inference Action Histogram: {}'.format(
             ub.repr2(action_hist, si=True)),
                    color='yellow')
     infr.print('Decision Histogram: {}'.format(
         ub.repr2(ub.dict_hist(util.take_column(history, 'pred_decision')),
                  si=True)),
                color='yellow')
     infr.print('User Histogram: {}'.format(
         ub.repr2(ub.dict_hist(util.take_column(history, 'user_id')),
                  si=True)),
                color='yellow')
Example #6
    def find_connecting_edges(infr):
        """
        Searches for a small set of edges which, if reviewed as positive,
        would ensure that each PCC is k-connected. Note that in some cases
        this is not possible.
        """
        label = 'name_label'
        node_to_label = infr.get_node_attrs(label)
        label_to_nodes = ub.group_items(node_to_label.keys(),
                                        node_to_label.values())

        # k = infr.params['redun.pos']
        k = 1
        new_edges = []
        prog = ub.ProgIter(list(label_to_nodes.keys()),
                           desc='finding connecting edges',
                           enabled=infr.verbose > 0)
        for nid in prog:
            nodes = set(label_to_nodes[nid])
            G = infr.pos_graph.subgraph(nodes, dynamic=False)
            impossible = nxu.edges_inside(infr.neg_graph, nodes)
            impossible |= nxu.edges_inside(infr.incomp_graph, nodes)

            candidates = set(nx.complement(G).edges())
            candidates.difference_update(impossible)

            aug_edges = nxu.k_edge_augmentation(G, k=k, avail=candidates)
            new_edges += aug_edges
        prog.ensure_newline()
        return new_edges
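
A standalone sketch of the augmentation step, assuming a reasonably recent networkx (k_edge_augmentation lives in networkx.algorithms.connectivity.edge_augmentation) and hand-writing the "impossible" edges that nxu.edges_inside would normally supply:

import networkx as nx
from networkx.algorithms.connectivity.edge_augmentation import k_edge_augmentation

# one PCC whose positive edges currently form two pieces: {1, 2, 3} and {4, 5}
G = nx.Graph([(1, 2), (2, 3), (4, 5)])
impossible = {(3, 4)}  # e.g. a pair already reviewed as negative

candidates = set(nx.complement(G).edges())
candidates.difference_update(impossible)

aug_edges = list(k_edge_augmentation(G, k=1, avail=candidates))
print(aug_edges)  # a single edge bridging the two pieces, e.g. [(1, 4)]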
Example #7
def master():
    master_fpath = ub.grabdata(
        'https://raw.githubusercontent.com/pokemongo-dev-contrib/pokemongo-game-master/master/versions/latest/V2_GAME_MASTER.json',
        expires=24 * 60 * 60)
    with open(master_fpath) as file:
        master = json.load(file)

    master.keys()

    def item_type(item):
        data = item['data']
        if 'move' in data:
            return 'move'
        if 'pokemon' in data:
            return 'pokemon'

    type_to_items = ub.group_items(master['template'], key=item_type)

    pokemon_items = type_to_items['pokemon']  # NOQA
    move_items = type_to_items['move']

    for item in move_items:
        uid = item['data']['move']['uniqueId']
        if 'MOONBLAST' in uid:
            print('item = {}'.format(ub.repr2(item, nl=3)))
Example #8
def test_group_items_callable():
    pairs = [
        ('ham', 'protein'),
        ('jam', 'fruit'),
        ('spam', 'protein'),
        ('eggs', 'protein'),
        ('cheese', 'dairy'),
        ('banana', 'fruit'),
    ]
    items, groupids = zip(*pairs)
    lut = dict(zip(items, groupids))

    result1 = ub.group_items(items, groupids)
    result2 = ub.group_items(items, lut.__getitem__)

    result1 = ub.map_vals(set, result1)
    result2 = ub.map_vals(set, result2)
    assert result1 == result2
Example #9
    def draw(self, color='blue', ax=None, alpha=None, coord_axes=[1, 0],
             radius=1):
        """
        Note:
            unlike other methods, the defaults assume x/y internal data

        Args:
            coord_axes (Tuple): specify which image axes each coordinate dim
                corresponds to.  For 2D images,
                    if you are storing r/c data, set to [0,1],
                    if you are storing x/y data, set to [1,0].

        Example:
            >>> # xdoc: +REQUIRES(module:kwplot)
            >>> from kwimage.structs.coords import *  # NOQA
            >>> self = Coords.random(10)
            >>> # xdoc: +REQUIRES(--show)
            >>> self.draw(radius=3.0)
            >>> import kwplot
            >>> kwplot.autompl()
            >>> self.draw(radius=3.0)
        """
        import matplotlib as mpl
        import kwimage
        from matplotlib import pyplot as plt
        if ax is None:
            ax = plt.gca()
        data = self.data

        if self.dim != 2:
            raise NotImplementedError('need 2d for mpl')

        # More grouped patches == more efficient runtime
        if alpha is None:
            alpha = [1.0] * len(data)
        elif not ub.iterable(alpha):
            alpha = [alpha] * len(data)

        ptcolors = [kwimage.Color(color, alpha=a).as01('rgba') for a in alpha]
        color_groups = ub.group_items(range(len(ptcolors)), ptcolors)

        default_centerkw = {
            'radius': radius,
            'fill': True
        }
        centerkw = default_centerkw.copy()
        collections = []
        for pcolor, idxs in color_groups.items():
            yx_list = [row[coord_axes] for row in data[idxs]]
            patches = [
                mpl.patches.Circle((x, y), ec=None, fc=pcolor, **centerkw)
                for y, x in yx_list
            ]
            col = mpl.collections.PatchCollection(patches, match_original=True)
            collections.append(col)
            ax.add_collection(col)
        return collections
Example #10
def test_group_items_sorted_mixed_types():
    import random
    groupid_list = [
        1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
        '1', '2', '3', '1', '2', '3', '1', '2', '3', '1', '2', '3',
    ]
    item_list = list(range(len(groupid_list)))

    # Randomize the order
    random.Random(947043).shuffle(groupid_list)
    random.Random(947043).shuffle(item_list)

    result1 = ub.group_items(item_list, groupid_list, sorted_=True)
    result2 = ub.group_items(item_list, groupid_list, sorted_=False)

    result1 = ub.map_vals(set, result1)
    result2 = ub.map_vals(set, result2)
    assert result1 == result2

    assert '1' in result1
    assert 1 in result1
Example #11
def _devcheck_manage_monitor(workdir, dry=True):

    all_sessions = collect_sessions(workdir)

    # Get all the images in the monitor directories
    # (this is a convention and not something netharn does by default)

    all_files = []
    # factor = 100
    max_keep = 300

    def _choose_action(file_infos):
        import kwarray
        file_infos = kwarray.shuffle(file_infos, rng=0)
        n_keep = max_keep
        # n_keep = (len(file_infos) // factor) + 1
        # n_keep = min(max_keep, n_keep)

        for info in file_infos[:n_keep]:
            info['action'] = 'keep'
        for info in file_infos[n_keep:]:
            info['action'] = 'delete'

    for session in ub.ProgIter(all_sessions, desc='checking monitor files'):
        dpaths = [
            join(session.dpath, 'monitor', 'train', 'batch'),
            join(session.dpath, 'monitor', 'vali', 'batch'),
            join(session.dpath, 'monitor', 'train'),
            join(session.dpath, 'monitor', 'vali'),
        ]
        exts = ['*.jpg', '*.png']
        for dpath in dpaths:
            for ext in exts:
                fpaths = list(glob.glob(join(dpath, ext)))
                file_infos = [{
                    'size': os.stat(p).st_size,
                    'fpath': p
                } for p in fpaths]
                _choose_action(file_infos)
                all_files.extend(file_infos)

    grouped_actions = ub.group_items(all_files, lambda x: x['action'])

    for key, group in grouped_actions.items():
        size = byte_str(sum([s['size'] for s in group]))
        print('{:>4} images:  {:>4}, size={}'.format(key.capitalize(),
                                                     len(group), size))

    if dry:
        print('Dry run')
    else:
        delete = grouped_actions.get('delete', [])
        delete_fpaths = [item['fpath'] for item in delete]
        for p in ub.ProgIter(delete_fpaths, desc='deleting'):
            ub.delete(p)
Example #12
def test_class_torch():
    import numpy as np
    import torch
    import netharn as nh
    import ubelt as ub
    # from netharn.util.nms.torch_nms import torch_nms
    # from netharn.util import non_max_supression

    thresh = .5

    num = 500
    rng = nh.util.ensure_rng(0)
    cpu_boxes = nh.util.Boxes.random(num,
                                     scale=400.0,
                                     rng=rng,
                                     format='tlbr',
                                     tensor=True)
    cpu_tlbr = cpu_boxes.to_tlbr().data
    # cpu_scores = torch.Tensor(rng.rand(len(cpu_tlbr)))
    # make all scores unique to ensure comparability
    cpu_scores = torch.Tensor(np.linspace(0, 1, len(cpu_tlbr)))
    cpu_cls = torch.LongTensor(rng.randint(0, 10, len(cpu_tlbr)))

    tlbr = cpu_boxes.to_tlbr().data.to('cuda')
    scores = cpu_scores.to('cuda')
    classes = cpu_cls.to('cuda')

    keep1 = []
    for idxs in ub.group_items(range(len(classes)),
                               classes.cpu().numpy()).values():
        # cls_tlbr = tlbr.take(idxs, axis=0)
        # cls_scores = scores.take(idxs, axis=0)
        cls_tlbr = tlbr[idxs]
        cls_scores = scores[idxs]
        cls_keep = torch_nms(cls_tlbr, cls_scores, thresh=thresh, bias=0)
        keep1.extend(list(ub.compress(idxs, cls_keep.cpu().numpy())))
    keep1 = sorted(keep1)

    keep_ = torch_nms(tlbr, scores, classes=classes, thresh=thresh, bias=0)
    keep2 = np.where(keep_.cpu().numpy())[0].tolist()

    keep3 = nh.util.non_max_supression(tlbr.cpu().numpy(),
                                       scores.cpu().numpy(),
                                       classes=classes.cpu().numpy(),
                                       thresh=thresh,
                                       bias=0,
                                       impl='gpu')

    print(len(keep1))
    print(len(keep2))
    print(len(keep3))

    print(set(keep1) - set(keep2))
    print(set(keep2) - set(keep1))
Example #13
def test_class_torch():
    import numpy as np
    import torch
    import ubelt as ub
    import kwarray
    import kwimage

    thresh = .5

    num = 500
    rng = kwarray.ensure_rng(0)
    cpu_boxes = kwimage.Boxes.random(num,
                                     scale=400.0,
                                     rng=rng,
                                     format='ltrb',
                                     tensor=True)
    cpu_ltrb = cpu_boxes.to_ltrb().data
    # cpu_scores = torch.Tensor(rng.rand(len(cpu_ltrb)))
    # make all scores unique to ensure comparability
    cpu_scores = torch.Tensor(np.linspace(0, 1, len(cpu_ltrb)))
    cpu_cls = torch.LongTensor(rng.randint(0, 10, len(cpu_ltrb)))

    ltrb = cpu_boxes.to_ltrb().data.to('cuda')
    scores = cpu_scores.to('cuda')
    classes = cpu_cls.to('cuda')

    keep1 = []
    for idxs in ub.group_items(range(len(classes)),
                               classes.cpu().numpy()).values():
        # cls_ltrb = ltrb.take(idxs, axis=0)
        # cls_scores = scores.take(idxs, axis=0)
        cls_ltrb = ltrb[idxs]
        cls_scores = scores[idxs]
        cls_keep = torch_nms(cls_ltrb, cls_scores, thresh=thresh, bias=0)
        keep1.extend(list(ub.compress(idxs, cls_keep.cpu().numpy())))
    keep1 = sorted(keep1)

    keep_ = torch_nms(ltrb, scores, classes=classes, thresh=thresh, bias=0)
    keep2 = np.where(keep_.cpu().numpy())[0].tolist()

    keep3 = kwimage.non_max_supression(ltrb.cpu().numpy(),
                                       scores.cpu().numpy(),
                                       classes=classes.cpu().numpy(),
                                       thresh=thresh,
                                       bias=0,
                                       impl='gpu')

    print(len(keep1))
    print(len(keep2))
    print(len(keep3))

    print(set(keep1) - set(keep2))
    print(set(keep2) - set(keep1))
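
The grouping idiom in the two NMS tests above needs no torch at all: group integer indices by class label, run a per-class routine on each subset, then recombine. A minimal sketch where keeping the best-scoring index per class stands in for torch_nms:

import ubelt as ub

scores = [0.9, 0.8, 0.7, 0.6, 0.5]
classes = [0, 1, 0, 1, 2]

keep = []
for cls, idxs in ub.group_items(range(len(classes)), classes).items():
    # stand-in for the per-class NMS call above
    keep.append(max(idxs, key=lambda i: scores[i]))
print(sorted(keep))  # [0, 1, 4]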
Example #14
def randomized_ibeis_dset(dbname, dim=224):
    """
    Ignore:
        >>> from clab.live.siam_train import *
        >>> datasets = randomized_ibeis_dset('PZ_MTEST')
        >>> ut.qtensure()
        >>> self = datasets['train']
        >>> self.augment = True
        >>> self.show_sample()
    """
    # from clab.live.siam_train import *
    # dbname = 'PZ_MTEST'
    import utool as ut
    from ibeis.algo.verif import vsone
    # pblm = vsone.OneVsOneProblem.from_empty('PZ_MTEST')
    pblm = vsone.OneVsOneProblem.from_empty(dbname)

    pccs = list(pblm.infr.positive_components())
    pcc_freq = list(map(len, pccs))
    freq_grouped = ub.group_items(pccs, pcc_freq)

    # Simpler very randomized sample strategy
    train_pccs = []
    vali_pccs = []
    test_pccs = []
    import math

    # vali_frac = .1
    test_frac = .1
    vali_frac = 0

    for i, group in freq_grouped.items():
        group = ut.shuffle(group, rng=432232 + i)
        n_test = 0 if len(group) == 1 else math.ceil(len(group) * test_frac)
        test, learn = group[:n_test], group[n_test:]
        n_vali = 0 if len(group) == 1 else math.ceil(len(learn) * vali_frac)
        vali, train = group[:n_vali], group[-n_vali:]
        train_pccs.extend(train)
        test_pccs.extend(test)
        vali_pccs.extend(vali)

    test_dataset = RandomBalancedIBEISSample(pblm, test_pccs, dim=dim)
    train_dataset = RandomBalancedIBEISSample(pblm, train_pccs, dim=dim)
    vali_dataset = RandomBalancedIBEISSample(pblm, vali_pccs, dim=dim)
    train_dataset.augment = True

    datasets = {
        'train': train_dataset,
        # 'vali': vali_dataset,
        'test': test_dataset,
    }
    return datasets
Example #15
def randomized_ibeis_dset(dbname, dim=224):
    """
    CommandLine:
        xdoctest ~/code/netharn/netharn/examples/siam_ibeis.py randomized_ibeis_dset --show

    Example:
        >>> datasets = randomized_ibeis_dset('PZ_MTEST')
        >>> # xdoctest: +REQUIRES(--show)
        >>> nh.util.qtensure()
        >>> self = datasets['train']
        >>> self.show_sample()
        >>> nh.util.show_if_requested()
    """
    import math
    from ibeis.algo.verif import vsone
    pblm = vsone.OneVsOneProblem.from_empty(dbname)

    pccs = list(pblm.infr.positive_components())
    pcc_freq = list(map(len, pccs))
    freq_grouped = ub.group_items(pccs, pcc_freq)

    # Simpler very randomized sample strategy
    train_pccs = []
    vali_pccs = []
    test_pccs = []

    vali_frac = .1
    test_frac = .1

    for i, group in freq_grouped.items():
        group = nh.util.shuffle(group, rng=432232 + i)
        n_test = 0 if len(group) == 1 else math.ceil(len(group) * test_frac)
        test, learn = group[:n_test], group[n_test:]
        n_vali = 0 if len(group) == 1 else math.ceil(len(learn) * vali_frac)
        vali, train = group[:n_vali], group[-n_vali:]
        train_pccs.extend(train)
        test_pccs.extend(test)
        vali_pccs.extend(vali)

    test_dataset = RandomBalancedIBEISSample(pblm, test_pccs, dim=dim)
    train_dataset = RandomBalancedIBEISSample(pblm, train_pccs, dim=dim,
                                              augment=False)
    vali_dataset = RandomBalancedIBEISSample(pblm, vali_pccs, dim=dim,
                                             augment=False)

    datasets = {
        'train': train_dataset,
        'vali': vali_dataset,
        'test': test_dataset,
    }
    datasets.pop('test', None)  # don't test for now (speed consideration)
    return datasets
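
A hedged sketch of the stratification idea above without the ibeis dependency: group the PCCs by size, then split each size bucket independently so train and test see a similar size distribution (the toy pccs, test_frac, and seed are made up):

import math
import random
import ubelt as ub

pccs = [{1, 2}, {3, 4}, {5, 6}, {7, 8, 9}, {10, 11, 12}, {13}]
freq_grouped = ub.group_items(pccs, map(len, pccs))

test_frac = 0.25
train_pccs, test_pccs = [], []
for size, group in freq_grouped.items():
    random.Random(432232 + size).shuffle(group)
    n_test = 0 if len(group) == 1 else math.ceil(len(group) * test_frac)
    test_pccs.extend(group[:n_test])
    train_pccs.extend(group[n_test:])
print(len(train_pccs), len(test_pccs))  # 4 2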
Example #16
def main():
    grouped = ub.group_items(options, lambda x: x['type'])

    build = {}

    for key, values in grouped.items():
        print('key = {!r}'.format(key))
        values = sorted(values, key=lambda x: x['price'])
        chosen = values[-1]
        print('chosen = {!r}'.format(chosen))
        build[key] = chosen

    print('build = {}'.format(ub.repr2(build, nl=2)))
Example #17
def rank_inventory(inventory):
    candidates = list(ub.flatten(list(pkmn.family(ancestors=False, node=True))
                                 for pkmn in inventory))

    groups = ub.group_items(candidates, key=lambda p: p.name)

    leages = {
        'master': {'max_cp': float('inf')},
        'ultra': {'max_cp': 2500},
        'great': {'max_cp': 1500},
        'little': {'max_cp': 500},
    }

    max_level = 45  # for XL candy
    # max_level = 40  # normal

    all_dfs = []

    for name, group in groups.items():
        print('\n\n------------\n\n')
        print('name = {!r}'.format(name))
        for leage_name, leage_filters in leages.items():
            max_cp = leage_filters['max_cp']
            print('')
            print(' ========== ')
            print(' --- {} in {} --- '.format(name, leage_name))
            not_eligible = [p for p in group if p.cp is not None and p.cp > max_cp]
            eligible = [p for p in group if p.cp is None or p.cp <= max_cp]
            print('not_eligible = {!r}'.format(not_eligible))
            if len(eligible) > 0:
                first = ub.peek(eligible)
                have_ivs = eligible
                df = first.leage_rankings_for(have_ivs, max_cp=max_cp,
                                              max_level=max_level)
                all_dfs.append(df)
            else:
                print('none eligible')

    # Print out the best ranks for each set of IVs over all possible forms
    # (lets you know which ones can be transferred safely)

    iv_to_rank = ub.ddict(list)
    for df in all_dfs:
        if df is not None:
            df = df.set_index(['iva', 'ivd', 'ivs'])
            for iv, rank in zip(df.index, df['rank']):
                iv_to_rank[iv].append(rank)

    iv_to_best_rank = ub.map_vals(sorted, iv_to_rank)
    iv_to_best_rank = ub.sorted_vals(iv_to_best_rank)
    print('iv_to_best_rank = {}'.format(ub.repr2(iv_to_best_rank, nl=1, align=':')))
Example #18
 def finalize_dets(ready_dets, ready_gids):
     gid_to_ready_dets = ub.group_items(ready_dets, ready_gids)
     for gid, dets_list in gid_to_ready_dets.items():
         if len(dets_list) == 0:
             dets = kwimage.Detections.concatenate([])
         elif len(dets_list) == 1:
             dets = dets_list[0]
         elif len(dets_list) > 1:
             dets = kwimage.Detections.concatenate(dets_list)
             keep = dets.non_max_supression(
                 thresh=self.config['nms_thresh'],
             )
             dets = dets.take(keep)
         yield (gid, dets)
Example #19
 def progressive_duplicates(pfiles, idx=1):
     step_ids = [pfile.refined_to(idx) for pfile in ub.ProgIter(pfiles)]
     final_groups = {}
     grouped = ub.group_items(pfiles, step_ids)
     for key, group in grouped.items():
         if len(group) > 1:
             if all(not g.can_refine for g in group):
                 # Group is ~100% a real duplicate
                 final_groups[key] = group
             else:
                 pfiles = group
                 deduped = progressive_duplicates(pfiles, idx=idx + 1)
                 final_groups.update(deduped)
         else:
             final_groups[key] = group
     return final_groups
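
A hedged analogue of progressive_duplicates on plain strings, where a growing prefix plays the role of refined_to and can_refine: only groups that still collide and can still be refined are recursed into.

import ubelt as ub

def progressive_prefix_groups(words, idx=1):
    # group by the first `idx` characters, refining only where needed
    final_groups = {}
    grouped = ub.group_items(words, [w[:idx] for w in words])
    for key, group in grouped.items():
        can_refine = any(len(w) > idx for w in group)
        if len(group) > 1 and can_refine:
            final_groups.update(progressive_prefix_groups(group, idx=idx + 1))
        else:
            final_groups[key] = group
    return final_groups

print(progressive_prefix_groups(['apple', 'apply', 'ant', 'bat', 'bat']))
# e.g. {'apple': ['apple'], 'apply': ['apply'], 'an': ['ant'],
#       'bat': ['bat', 'bat']}  (key order may vary)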
Example #20
    def find_clique_edges(infr, label='name_label'):
        """
        Finds augmenting edges that would complete each of the specified
        cliques (based on the groups inferred from `label`).

        Args:
            label (str): node attribute to use as the group id to form the
                cliques.
        """
        node_to_label = infr.get_node_attrs(label)
        label_to_nodes = ub.group_items(node_to_label.keys(),
                                        node_to_label.values())
        new_edges = []
        for label, nodes in label_to_nodes.items():
            for edge in it.combinations(nodes, 2):
                if infr.edge_decision(edge) == UNREV:
                    new_edges.append(edge)
        return new_edges
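
The core of find_clique_edges as a runnable sketch, minus the decision filtering: group node ids by their label and emit every within-group pair as a candidate edge (the infr.edge_decision check above then drops pairs that were already reviewed).

import itertools as it
import ubelt as ub

node_to_label = {1: 'a', 2: 'a', 3: 'b', 4: 'a', 5: 'b'}
label_to_nodes = ub.group_items(node_to_label.keys(), node_to_label.values())

candidate_edges = []
for label, nodes in label_to_nodes.items():
    candidate_edges.extend(it.combinations(sorted(nodes), 2))
print(candidate_edges)  # [(1, 2), (1, 4), (2, 4), (3, 5)]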
Example #21
        def internal_deduplicate(self):
            hash_groups = ub.group_items(self.all_fpaths, self.all_hashes)
            hash_groups_dup = {
                k: v
                for k, v in hash_groups.items() if len(v) > 1
            }

            from os.path import dirname, basename

            hash_groups_dup['ef46db3751d8e999']

            for key, values in hash_groups_dup.items():
                for v in values:
                    if v.endswith('.avi'):
                        break

                [basename(v) for v in values]
                [dirname(v) for v in values]
Example #22
    def print_graph_connections(infr, label='orig_name_label'):
        """
        label = 'orig_name_label'
        """
        node_to_label = infr.get_node_attrs(label)
        label_to_nodes = ub.group_items(node_to_label.keys(),
                                        node_to_label.values())
        print('CC info')
        for name, cc in label_to_nodes.items():
            print('\nname = %r' % (name, ))
            edges = list(nxu.edges_between(infr.graph, cc))
            print(infr.get_edge_df_text(edges))

        print('CC pair info')
        for (n1, cc1), (n2, cc2) in it.combinations(label_to_nodes.items(), 2):
            if n1 == n2:
                continue
            print('\nname_pair = {}-vs-{}'.format(n1, n2))
            edges = list(nxu.edges_between(infr.graph, cc1, cc2))
            print(infr.get_edge_df_text(edges))
Example #23
    def predict_proba_df(verif, edges):
        """
        CommandLine:
            python -m graphid.demo DummyVerif.predict_edges

        Example:
            >>> from graphid import demo
            >>> kwargs = dict(num_pccs=40, size=2)
            >>> infr = demo.demodata_infr(**kwargs)
            >>> verif = infr.dummy_verif
            >>> edges = list(infr.graph.edges())
            >>> probs = verif.predict_proba_df(edges)
        """
        infr = verif.infr
        edges = list(it.starmap(verif.infr.e_, edges))
        prob_cache = infr.task_probs['match_state']
        is_miss = np.array([e not in prob_cache for e in edges])
        # is_hit = ~is_miss
        if np.any(is_miss):
            miss_edges = list(ub.compress(edges, is_miss))
            miss_truths = [verif._get_truth(edge) for edge in miss_edges]
            grouped_edges = ub.group_items(miss_edges, miss_truths)
            # Need to make this deterministic too
            states = [POSTV, NEGTV, INCMP]
            for key in sorted(grouped_edges.keys()):
                group = grouped_edges[key]
                probs0 = util.randn(shape=[len(group)], rng=verif.rng, a_max=1,
                                    a_min=0, **verif.dummy_params[key])
                # Just randomly assign other probs
                probs1 = verif.rng.rand(len(group)) * (1 - probs0)
                probs2 = 1 - (probs0 + probs1)
                for edge, probs in zip(group, zip(probs0, probs1, probs2)):
                    prob_cache[edge] = ub.dzip(states, probs)

        probs = pd.DataFrame(
            list(ub.take(prob_cache, edges)),
            index=util.ensure_multi_index(edges, ('aid1', 'aid2'))
        )
        return probs
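
A hedged sketch of the probability trick above: draw one probability for the assumed-true state, give a random share of the remainder to a second state, and the third state gets whatever is left, so each row sums to one by construction.

import numpy as np

rng = np.random.RandomState(0)
probs0 = 0.5 + 0.5 * rng.rand(4)     # stand-in for the "true state" probability
probs1 = rng.rand(4) * (1 - probs0)  # a random share of what remains
probs2 = 1 - (probs0 + probs1)       # the rest
print(np.allclose(probs0 + probs1 + probs2, 1.0))  # True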
Example #24
def _dump_measures(tb_data,
                   out_dpath,
                   mode=None,
                   smoothing=0.0,
                   ignore_outliers=True):
    """
    This is its own function in case we need to modify formatting

    CommandLine:
        xdoctest -m netharn.mixins _dump_measures --out_dpath=.

    Example:
        >>> # SCRIPT
        >>> # Reread a dumped pickle file
        >>> from netharn.mixins import *  # NOQA
        >>> from netharn.mixins import _dump_monitor_tensorboard, _dump_measures
        >>> import json
        >>> from os.path import join
        >>> import ubelt as ub
        >>> try:
        >>>     import seaborn as sns
        >>>     sns.set()
        >>> except ImportError:
        >>>     pass
        >>> out_dpath = ub.expandpath('~/work/project/fit/nice/nicename/monitor/tensorboard/')
        >>> out_dpath = ub.argval('--out_dpath', default=out_dpath)
        >>> mode = ['epoch', 'iter']
        >>> fpath = join(out_dpath, 'tb_data.json')
        >>> tb_data = json.load(open(fpath, 'r'))
        >>> import kwplot
        >>> kwplot.autompl()
        >>> _dump_measures(tb_data,  out_dpath, smoothing=0)
    """
    import ubelt as ub
    from os.path import join
    import numpy as np
    import kwplot
    import matplotlib as mpl
    from kwplot.auto_backends import BackendContext

    with BackendContext('agg'):
        # kwplot.autompl()

        # TODO: Is it possible to get htop to show this process with some name that
        # distinguishes it from the dataloader workers?
        # import sys
        # import multiprocessing
        # if multiprocessing.current_process().name != 'MainProcess':
        #     if sys.platform.startswith('linux'):
        #         import ctypes
        #         libc = ctypes.cdll.LoadLibrary('libc.so.6')
        #         title = 'Netharn MPL Dump Measures'
        #         libc.prctl(len(title), title, 0, 0, 0)

        # NOTE: This causes warnings when executed as a daemon process
        # try:
        #     import seaborn as sbn
        #     sbn.set()
        # except ImportError:
        #     pass

        valid_modes = ['epoch', 'iter']
        if mode is None:
            mode = valid_modes
        if ub.iterable(mode):
            # Hack: Call with all modes
            for mode_ in mode:
                _dump_measures(tb_data,
                               out_dpath,
                               mode=mode_,
                               smoothing=smoothing,
                               ignore_outliers=ignore_outliers)
            return
        else:
            assert mode in valid_modes

        meta = tb_data.get('meta', {})
        nice = meta.get('nice', '?nice?')
        special_groupers = meta.get('special_groupers', ['loss'])

        fig = kwplot.figure(fnum=1)

        plot_keys = [
            key for key in tb_data
            if ('train_' + mode in key or 'vali_' + mode in key or 'test_' +
                mode in key or mode + '_' in key)
        ]
        y01_measures = [
            '_acc',
            '_ap',
            '_mAP',
            '_auc',
            '_mcc',
            '_brier',
            '_mauc',
        ]
        y0_measures = ['error', 'loss']

        keys = set(tb_data.keys()).intersection(set(plot_keys))

        # print('mode = {!r}'.format(mode))
        # print('tb_data.keys() = {!r}'.format(tb_data.keys()))
        # print('plot_keys = {!r}'.format(plot_keys))
        # print('keys = {!r}'.format(keys))

        def smooth_curve(ydata, beta):
            """
            Curve smoothing algorithm used by tensorboard
            """
            import pandas as pd
            alpha = 1.0 - beta
            if alpha <= 0:
                return ydata
            ydata_smooth = pd.Series(ydata).ewm(alpha=alpha).mean().values
            return ydata_smooth

        def inlier_ylim(ydatas):
            """
            outlier removal used by tensorboard
            """
            low, high = None, None
            for ydata in ydatas:
                q1 = 0.05
                q2 = 0.95
                low_, high_ = np.quantile(ydata, [q1, q2])

                # Extrapolate how big the entire span should be based on inliers
                inner_q = q2 - q1
                inner_extent = high_ - low_
                extrap_total_extent = inner_extent / inner_q

                # amount of padding to add to either side
                missing_p1 = q1
                missing_p2 = 1 - q2
                frac1 = missing_p1 / (missing_p2 + missing_p1)
                frac2 = missing_p2 / (missing_p2 + missing_p1)
                missing_extent = extrap_total_extent - inner_extent

                pad1 = missing_extent * frac1
                pad2 = missing_extent * frac2

                low_ = low_ - pad1
                high_ = high_ + pad2

                low = low_ if low is None else min(low_, low)
                high = high_ if high is None else max(high_, high)
            return (low, high)

        # Hack values that we don't apply smoothing to
        HACK_NO_SMOOTH = ['lr', 'momentum']

        def tag_grouper(k):
            # parts = ['train_epoch', 'vali_epoch', 'test_epoch']
            # parts = [p.replace('epoch', 'mode') for p in parts]
            parts = [p + mode for p in ['train_', 'vali_', 'test_']]
            for p in parts:
                if p in k:
                    return p.split('_')[0]
            return 'unknown'

        GROUP_LOSSES = True
        GROUP_AND_INDIVIDUAL = False
        INDIVIDUAL_PLOTS = True
        GROUP_SPECIAL = True

        if GROUP_LOSSES:
            # Group all losses in one plot for comparison
            loss_keys = [k for k in keys if 'loss' in k]
            tagged_losses = ub.group_items(loss_keys, tag_grouper)
            tagged_losses.pop('unknown', None)
            kw = {}
            kw['ymin'] = 0.0
            # print('tagged_losses = {!r}'.format(tagged_losses))
            for tag, losses in tagged_losses.items():

                min_abs_y = .01
                min_y = 0
                xydata = ub.odict()
                for key in sorted(losses):
                    ydata = tb_data[key]['ydata']

                    if not any(p in HACK_NO_SMOOTH for p in key.split('_')):
                        ydata = smooth_curve(ydata, smoothing)

                    try:
                        min_y = min(min_y, ydata.min())
                        pos_ys = ydata[ydata > 0]
                        min_abs_y = min(min_abs_y, pos_ys.min())
                    except Exception:
                        pass

                    xydata[key] = (tb_data[key]['xdata'], ydata)

                kw['ymin'] = min_y

                if ignore_outliers:
                    low, kw['ymax'] = inlier_ylim(
                        [t[1] for t in xydata.values()])

                yscales = ['symlog', 'linear']
                for yscale in yscales:
                    fig.clf()
                    ax = fig.gca()
                    title = nice + '\n' + tag + '_' + mode + ' losses'
                    kwplot.multi_plot(xydata=xydata,
                                      ylabel='loss',
                                      xlabel=mode,
                                      yscale=yscale,
                                      title=title,
                                      fnum=1,
                                      ax=ax,
                                      **kw)
                    if yscale == 'symlog':
                        if LooseVersion(
                                mpl.__version__) >= LooseVersion('3.3'):
                            ax.set_yscale('symlog', linthresh=min_abs_y)
                        else:
                            ax.set_yscale('symlog', linthreshy=min_abs_y)
                    fname = '_'.join([tag, mode, 'multiloss', yscale]) + '.png'
                    fpath = join(out_dpath, fname)
                    ax.figure.savefig(fpath)

            # don't dump losses individually if we dump them in a group
            if not GROUP_AND_INDIVIDUAL:
                keys.difference_update(set(loss_keys))
                # print('keys = {!r}'.format(keys))

        if GROUP_SPECIAL:
            tag_groups = ub.group_items(keys, tag_grouper)
            tag_groups.pop('unknown', None)
            # Group items matching these strings
            kw = {}
            for tag, tag_keys in tag_groups.items():
                for groupname in special_groupers:
                    group_keys = [
                        k for k in tag_keys if groupname in k.split('_')
                    ]
                    if len(group_keys) > 1:
                        # Gather data for this group
                        xydata = ub.odict()
                        for key in sorted(group_keys):
                            ydata = tb_data[key]['ydata']
                            if not any(p in HACK_NO_SMOOTH
                                       for p in key.split('_')):
                                ydata = smooth_curve(ydata, smoothing)
                            xydata[key] = (tb_data[key]['xdata'], ydata)

                        if ignore_outliers:
                            low, kw['ymax'] = inlier_ylim(
                                [t[1] for t in xydata.values()])

                        yscales = ['linear']
                        for yscale in yscales:
                            fig.clf()
                            ax = fig.gca()
                            title = nice + '\n' + tag + '_' + mode + ' ' + groupname
                            kwplot.multi_plot(xydata=xydata,
                                              ylabel=groupname,
                                              xlabel=mode,
                                              yscale=yscale,
                                              title=title,
                                              fnum=1,
                                              ax=ax,
                                              **kw)
                            if yscale == 'symlog':
                                ax.set_yscale('symlog', linthreshy=min_abs_y)
                            fname = '_'.join([
                                tag, mode, 'group-' + groupname, yscale
                            ]) + '.png'
                            fpath = join(out_dpath, fname)
                            ax.figure.savefig(fpath)

                        if not GROUP_AND_INDIVIDUAL:
                            keys.difference_update(set(group_keys))

        if INDIVIDUAL_PLOTS:
            # print('keys = {!r}'.format(keys))
            for key in keys:
                d = tb_data[key]

                ydata = d['ydata']
                ydata = smooth_curve(ydata, smoothing)

                kw = {}
                if any(m.lower() in key.lower() for m in y01_measures):
                    kw['ymin'] = 0.0
                    kw['ymax'] = 1.0
                elif any(m.lower() in key.lower() for m in y0_measures):
                    kw['ymin'] = min(0.0, ydata.min())
                    if ignore_outliers:
                        low, kw['ymax'] = inlier_ylim([ydata])

                # NOTE: this is actually pretty slow
                fig.clf()
                ax = fig.gca()
                title = nice + '\n' + key
                kwplot.multi_plot(d['xdata'],
                                  ydata,
                                  ylabel=key,
                                  xlabel=mode,
                                  title=title,
                                  fnum=1,
                                  ax=ax,
                                  **kw)

                # png is slightly smaller than jpg for this kind of plot
                fpath = join(out_dpath, key + '.png')
                # print('save fpath = {!r}'.format(fpath))
                ax.figure.savefig(fpath)
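
A hedged sketch of just the tag_grouper + group_items step used for the multi-loss plots above, on made-up tensorboard keys:

import ubelt as ub

keys = ['train_epoch_loss', 'train_epoch_box_loss', 'vali_epoch_loss', 'epoch_lr']

def tag_grouper(k):
    for p in ['train_epoch', 'vali_epoch', 'test_epoch']:
        if p in k:
            return p.split('_')[0]
    return 'unknown'

tagged_losses = ub.group_items([k for k in keys if 'loss' in k], tag_grouper)
tagged_losses.pop('unknown', None)
print(dict(tagged_losses))
# {'train': ['train_epoch_loss', 'train_epoch_box_loss'],
#  'vali': ['vali_epoch_loss']}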
Example #25
def detection_confusions(true_boxes, true_cxs, true_weights, pred_boxes,
                         pred_scores, pred_cxs, bg_weight=1.0, ovthresh=0.5,
                         bg_cls=-1):
    """
    Given predictions and truth for an image return (y_pred, y_true,
    y_score), which is suitable for sklearn classification metrics

    Args:
        true_boxes (ndarray): boxes in tlbr format
        true_cxs (ndarray): classes of each box
        true_weights (ndarray): weight of this each groundtruth item
        pred_boxes (ndarray): predicted boxes in tlbr format
        pred_scores (ndarray): scores for each prediction
        pred_cxs (ndarray): class predictions
        ovthresh (float): overlap threshold

        bg_weight (ndarray): weight of background predictions
          (default=1)

    Returns:
        pd.DataFrame: with relevant clf information

    Example:
        >>> true_boxes = np.array([[ 0,  0, 10, 10],
        >>>                        [10,  0, 20, 10],
        >>>                        [10,  0, 20, 10],
        >>>                        [20,  0, 30, 10]])
        >>> true_weights = np.array([1, 0, .9, 1])
        >>> bg_weight = 1.0
        >>> true_cxs = np.array([0, 0, 1, 1])
        >>> pred_boxes = np.array([[6, 2, 20, 10],
        >>>                        [3,  2, 9, 7],
        >>>                        [20,  0, 30, 10]])
        >>> pred_scores = np.array([.5, .5, .5])
        >>> pred_cxs = np.array([0, 0, 1])
        >>> y = detection_confusions(true_boxes, true_cxs, true_weights,
        >>>                          pred_boxes, pred_scores, pred_cxs,
        >>>                          bg_weight=bg_weight, ovthresh=.5)
        >>> pd.DataFrame(y)
        >>> print(y)  # xdoc: +IGNORE_WANT
           cx  pred  score  true  weight
        0   1     1 0.5000     1       1.0
        1   0     0 0.5000    -1       1.0
        2   0    -1 0.0000     0       1.0
        3   1    -1 0.0000     1       0.9
    """
    y_pred = []
    y_true = []
    y_score = []
    y_weight = []
    cxs = []

    if bg_weight is None:
        bg_weight = 1.0

    # Group true boxes by class
    # Keep track of which true boxes are unused / not assigned
    cx_to_idxs = ub.group_items(range(len(true_cxs)), true_cxs)
    cx_to_unused = {cx: [True] * len(idxs)
                    for cx, idxs in cx_to_idxs.items()}

    # cx_to_boxes = ub.group_items(true_boxes, true_cxs)
    # cx_to_boxes = ub.map_vals(np.array, cx_to_boxes)

    # sort predictions by score
    sortx = pred_scores.argsort()[::-1]
    pred_boxes  = pred_boxes.take(sortx, axis=0)
    pred_cxs    = pred_cxs.take(sortx, axis=0)
    pred_scores = pred_scores.take(sortx, axis=0)
    for cx, box, score in zip(pred_cxs, pred_boxes, pred_scores):
        cls_true_idxs = cx_to_idxs.get(cx, [])

        ovmax = -np.inf
        ovidx = None
        weight = bg_weight

        if len(cls_true_idxs):
            cls_true_boxes = true_boxes.take(cls_true_idxs, axis=0)
            ovmax, ovidx = iou_overlap(cls_true_boxes, box)
            if true_weights is None:
                weight = 1.0
            else:
                true_idx = cls_true_idxs[ovidx]
                weight = true_weights[true_idx]
            unused = cx_to_unused[cx]

        if ovmax > ovthresh and unused[ovidx]:
            # Mark this prediction as a true positive
            if weight > 0:
                # Ignore matches to truth with weight 0 (difficult cases)
                y_pred.append(cx)
                y_true.append(cx)
                y_score.append(score)
                y_weight.append(weight)
                cxs.append(cx)
                unused[ovidx] = False
        else:
            # Mark this prediction as a false positive
            y_pred.append(cx)
            y_true.append(bg_cls)  # use -1 as background ignore class
            y_score.append(score)
            y_weight.append(weight)
            cxs.append(cx)

    # Mark true boxes we failed to predict as false negatives
    for cx, unused in cx_to_unused.items():
        for ovidx, flag in enumerate(unused):
            if flag:
                if true_weights is None:
                    weight = 1.0
                else:
                    cls_true_idxs = cx_to_idxs.get(cx, [])
                    true_idx = cls_true_idxs[ovidx]
                    weight = true_weights[true_idx]
                # if it has a nonzero weight
                if weight > 0:
                    # Mark this prediction as a false negative
                    y_pred.append(-1)
                    y_true.append(cx)
                    y_score.append(0.0)
                    y_weight.append(weight)
                    cxs.append(cx)

    y = {
        'pred': y_pred,
        'true': y_true,
        'score': y_score,
        'weight': y_weight,
        'cx': cxs,
    }
    # y = pd.DataFrame()
    return y
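
The truth-side bookkeeping above reduces to two small dictionaries; a minimal sketch on toy class labels:

import ubelt as ub

true_cxs = [0, 0, 1, 1]
cx_to_idxs = ub.group_items(range(len(true_cxs)), true_cxs)
cx_to_unused = {cx: [True] * len(idxs) for cx, idxs in cx_to_idxs.items()}
print(dict(cx_to_idxs))  # {0: [0, 1], 1: [2, 3]}
print(cx_to_unused)      # {0: [True, True], 1: [True, True]}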
Example #26
def autogen_imports(fpath_or_text):
    """
    Generate import statements for python code

    Example:
        >>> import vimtk
        >>> source = ub.codeblock(
            '''
            math
            it
            ''')
        >>> text = vimtk.autogen_imports(source)
        >>> print(text)
        import itertools as it
        import math
    """
    try:
        import xinspect
    except Exception:
        print('UNABLE TO IMPORT XINSPECT')
        print('sys.prefix = {!r}'.format(sys.prefix))
        raise
    from os.path import exists
    from xinspect.autogen import Importables
    importable = Importables()
    importable._use_recommended_defaults()

    base = {
        'it': 'import itertools as it',
        'nh': 'import netharn as nh',
        'np': 'import numpy as np',
        'pd': 'import pandas as pd',
        'ub': 'import ubelt as ub',
        'nx': 'import networkx as nx',
        'Image': 'from PIL import Image',
        'mpl': 'import matplotlib as mpl',
        'nn': 'from torch import nn',
        'torch_data': 'import torch.utils.data as torch_data',
        'F': 'import torch.nn.functional as F',
        'math': 'import math',
    }
    importable.known.update(base)

    user_importable = None
    try:
        user_importable = CONFIG.get('vimtk_auto_importable_modules')
        importable.known.update(user_importable)
    except Exception as ex:
        logger.info('ex = {!r}'.format(ex))
        logger.info('ERROR user_importable = {!r}'.format(user_importable))

    kw = {'importable': importable}
    if exists(fpath_or_text):
        kw['fpath'] = fpath_or_text
    else:
        kw['source'] = fpath_or_text
    lines = xinspect.autogen_imports(**kw)

    x = ub.group_items(lines, [x.startswith('from ') for x in lines])
    ordered_lines = []
    ordered_lines += sorted(x.get(False, []))
    ordered_lines += sorted(x.get(True, []))
    import_block = '\n'.join(ordered_lines)
    return import_block
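
The final ordering step groups plain imports separately from from-imports; a minimal standalone sketch:

import ubelt as ub

lines = ['import math', 'from PIL import Image', 'import itertools as it']
x = ub.group_items(lines, [ln.startswith('from ') for ln in lines])
ordered = sorted(x.get(False, [])) + sorted(x.get(True, []))
print('\n'.join(ordered))
# import itertools as it
# import math
# from PIL import Image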
Example #27
def main():
    """
    Run password security analysis

    Example:
        >>> import sys, ubelt
        >>> sys.path.append(ubelt.expandpath('~/misc/notes'))
        >>> from password_model import *  # NOQA
        >>> main()
    """
    import itertools as it
    from fractions import Fraction
    import pandas as pd
    # Build our adversary and our strategies
    devices, scales = build_threat_models()

    password_schemes = build_password_strategy()

    # Other estimates or assumptions
    estimates = {
        # estimated cost of using a kilowatt for an hour
        # http://www.wrecc.com/what-uses-watts-in-your-home/
        # https://www.coinwarz.com/mining/ethereum/calculator
        'dollars_per_kwh': 0.10,
    }

    rows = []
    for device, scheme, scale in it.product(devices, password_schemes, scales):
        for benchmark in device['benchmarks']:

            states = Fraction(scheme['states'])
            num_devices = Fraction(scale['num_devices'])
            dollars_per_kwh = Fraction(estimates['dollars_per_kwh'])

            hashmode_attempts_per_second = benchmark['attempts_per_second']
            attempts_per_second = num_devices * Fraction(
                int(hashmode_attempts_per_second))

            seconds = states / Fraction(attempts_per_second)

            hours = seconds / Fraction(3600)
            device_kilowatts = Fraction(device['watts']) / Fraction(1000)
            device_dollars_per_hour = device_kilowatts * dollars_per_kwh
            dollars_per_device = device_dollars_per_hour * hours
            dollars = dollars_per_device * num_devices

            total_kilowatts = device_kilowatts * num_devices * hours

            row = {
                'scheme': scheme['name'],
                'entropy': scheme['entropy'],
                'hashmode': benchmark['hashmode'],
                'hashmode_attempts_per_second':
                int(hashmode_attempts_per_second),
                'device': device['name'],
                'scale': scale['name'],
                'num_devices': scale['num_devices'],
                'seconds': seconds,
                'dollars': dollars,
                'kilowatts': total_kilowatts,
                'hours': hours,
                'dollars_per_kwh': estimates['dollars_per_kwh'],
            }
            rows.append(row)

    df = pd.DataFrame(rows)
    df = df.sort_values('entropy')

    chosen_device = 'RTX_3090'
    df = df[df['device'] == chosen_device]
    df['time'] = df['seconds'].apply(humanize_seconds)
    df['cost'] = df['dollars'].apply(partial(humanize_dollars, colored=1))
    df['entropy'] = df['entropy'].round(2)
    df['num_devices'] = df['num_devices'].apply(int)

    hashmodes = sorted([d['hashmode'] for d in device['benchmarks']])

    # https://github.com/pandas-dev/pandas/issues/18066
    monkeypatch_pandas_colored_stdout()

    # Output our assumptions
    print('\n---')
    print('Assumptions:')
    device_info = ub.group_items(devices,
                                 lambda x: x['name'])[chosen_device][0]
    print('estimates = {!r}'.format(estimates))
    print('device_info = {}'.format(ub.repr2(device_info, nl=2)))

    # For each hashmode, print the scheme-vs-num_devices-vs-time matrix
    hashmode_to_pivots = {}
    for hashmode in hashmodes:
        subdf = df
        subdf = subdf[subdf['hashmode'] == hashmode]
        subdf = subdf.sort_values(['entropy', 'num_devices'])
        piv = subdf.pivot(['entropy', 'cost', 'scheme'],
                          ['num_devices', 'scale'], 'time')
        # piv.style.applymap(color_cases)
        hashmode_to_pivots[hashmode] = piv

    for hashmode in hashmodes:
        print('\n---')
        print('hashmode = {!r}'.format(hashmode))
        piv = hashmode_to_pivots[hashmode]
        print(piv)

    # Print the scheme-vs-hashmode-vs-cost matrix
    print('\n---')
    print('Cost Matrix:')
    subdf = df[df['scale'] == df['scale'].iloc[0]]
    piv = subdf.pivot(['entropy', 'scheme'],
                      ['hashmode_attempts_per_second', 'hashmode'], 'cost')
    piv = piv.sort_index(axis=1, ascending=False)
    piv.columns = piv.columns.droplevel(0)
    print(piv)

    # Make the visualizations
    if ub.argflag('--show'):
        import kwplot
        from matplotlib.colors import LogNorm
        import matplotlib as mpl
        plt = kwplot.autoplt()
        sns = kwplot.autosns()

        use_latex = ub.argflag('--latex')
        if use_latex:
            mpl.rcParams['text.usetex'] = True

        def time_labelize(x):
            text = humanize_seconds(x, colored=False, named=True, precision=2)
            parts = text.split(' ')
            if use_latex:
                text = r'{\huge ' + parts[0] + '}' + '\n' + ' '.join(parts[1:])
            else:
                text = parts[0] + '\n' + ' '.join(parts[1:])
            return text

        def dollar_labelize(dollars):
            cost = humanize_dollars(dollars, named=(dollars > 1))
            if use_latex:
                cost = cost.replace('$', r'\$')
            return cost

        hashmode_to_notes = {}
        for dev in devices[0]['benchmarks']:
            hashmode_to_notes[dev['hashmode']] = dev['notes']

        if 1:
            # Independent of the adversary scale we can plot cost versus scheme
            # cost vs hashmod?
            subdf = df[df['scale'] == df['scale'].iloc[0]]
            piv = subdf.pivot(['entropy', 'scheme'],
                              ['hashmode_attempts_per_second', 'hashmode'],
                              'dollars')
            piv = piv.sort_index(axis=1, ascending=False)

            # https://stackoverflow.com/questions/64234474/cust-y-lbls-seaborn
            ax: mpl.axes.Axes = plt.subplots(figsize=(15, 10))[1]

            annot = piv.applymap(dollar_labelize)
            piv = piv.applymap(float)

            sns.heatmap(piv,
                        annot=annot,
                        ax=ax,
                        fmt='s',
                        norm=LogNorm(vmin=1, vmax=100_000_000_000_000_000),
                        annot_kws={'size': 16},
                        cmap='cividis',
                        cbar_kws={
                            'label': 'dollars',
                            'pad': 0.001
                        })

            # Find colorbar
            for subax in ax.figure.axes:
                if subax.get_label() == '<colorbar>':
                    subax.set_ylabel('dollars', labelpad=0)
                    break

            new_ytick_labels = []
            for ent, scheme in piv.index.to_list():
                if use_latex:
                    scheme = r'{\LARGE ' + scheme + '}'
                _ = '{scheme}\nEntropy={ent}bits'.format(scheme=scheme,
                                                         ent=ent)
                new_ytick_labels.append(_)

            new_xtick_labels = []
            for _, hashmode in piv.columns.to_list():
                notes = ''
                if hashmode in hashmode_to_notes:
                    notes = '\n(' + hashmode_to_notes[hashmode] + ')'
                new_xtick_labels.append(hashmode + notes)

            ax.set_xticklabels(new_xtick_labels, rotation=0)
            ax.set_yticklabels(new_ytick_labels, rotation=0)

            ax.set_ylabel('Password Scheme, Entropy', labelpad=24)
            ax.set_xlabel('Hashmode', labelpad=16)

            if use_latex:
                title = '{{\\Huge Password Cost Security}}'
                ax.set_title(title)
            else:
                ax.set_title('Password Cost Security')

            ax.figure.subplots_adjust(bottom=0.1,
                                      left=0.20,
                                      right=1.0,
                                      top=0.90,
                                      wspace=0.001)

            if ub.argflag('--save'):
                fname = 'passwd_cost_security.png'
                ax.figure.savefig(fname)

        if 1:
            # For each hashmode plot (scheme versus adversary scale)
            for hashmode in ub.ProgIter(hashmodes, desc='plotting'):
                subdf = df
                subdf = subdf[subdf['hashmode'] == hashmode]
                subdf = subdf.sort_values(['entropy', 'num_devices'])

                piv = subdf.pivot(['entropy', 'dollars', 'scheme'],
                                  ['num_devices', 'scale'], 'seconds')
                piv = piv.applymap(float)

                # https://stackoverflow.com/questions/64234474/cust-y-lbls-seaborn
                ax: mpl.axes.Axes = plt.subplots(figsize=(15, 10))[1]

                annot = piv.applymap(time_labelize)
                sns.heatmap(piv,
                            annot=annot,
                            ax=ax,
                            fmt='s',
                            norm=LogNorm(vmin=1, vmax=8640000000),
                            annot_kws={'size': 10},
                            cbar_kws={
                                'label': 'seconds',
                                'pad': 0.001
                            })

                # Find colorbar
                for subax in ax.figure.axes:
                    if subax.get_label() == '<colorbar>':
                        subax.set_ylabel('seconds', labelpad=0)
                        break

                new_ytick_labels = []
                for ent, dollars, scheme in piv.index.to_list():
                    cost = dollar_labelize(dollars)
                    if use_latex:
                        scheme = r'{\LARGE ' + scheme + '}'
                    _ = '{scheme}\nEntropy={ent}bits\nCost={cost}'.format(
                        scheme=scheme, cost=cost, ent=ent)
                    new_ytick_labels.append(_)

                new_xtick_labels = []
                for n, name in piv.columns.to_list():
                    if use_latex:
                        name = r'{\LARGE ' + name + '}'
                    _ = name + '\n' + named_large_number(n,
                                                         precision=0) + ' GPUs'
                    new_xtick_labels.append(_)

                ax.set_xticklabels(new_xtick_labels, rotation=0)
                # ax.set_yticklabels(new_ytick_labels, horizontalalignment='left', pad=30)
                ax.set_yticklabels(new_ytick_labels)

                ax.set_ylabel('Password Scheme, Entropy, and Cost to Crack',
                              labelpad=24)
                ax.set_xlabel('Adversary Resources', labelpad=16)

                notes = ''
                if hashmode in hashmode_to_notes:
                    notes = ' (' + hashmode_to_notes[hashmode] + ')'

                if use_latex:
                    title = '{{\\Huge Password Time Security}}\nhashmode={}{}'.format(
                        hashmode, notes)
                    ax.set_title(title)
                else:
                    ax.set_title(
                        'Password Time Security\n(hashmode={}{})'.format(
                            hashmode, notes))

                ax.figure.subplots_adjust(bottom=0.1,
                                          left=0.20,
                                          right=1.0,
                                          top=0.90,
                                          wspace=0.001)

                if ub.argflag('--save'):
                    fname = 'passwd_robustness_{}.png'.format(hashmode)
                    ax.figure.savefig(fname)
        plt.show()
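The plotting code above depends on variables defined earlier in the example (df, hashmodes, time_labelize, dollar_labelize, and so on). As a rough, self-contained sketch of the same pivot-then-annotate pattern, the snippet below uses made-up column names and label text purely for illustration:

    # Minimal sketch of the pivot + annotated-heatmap pattern used above.
    # The dataframe columns and values here are hypothetical stand-ins.
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    from matplotlib.colors import LogNorm

    df = pd.DataFrame({
        'scheme':   ['words', 'words', 'chars', 'chars'],
        'hashmode': ['md5', 'sha512', 'md5', 'sha512'],
        'seconds':  [1e2, 1e5, 1e4, 1e7],
    })
    piv = df.pivot(index='scheme', columns='hashmode', values='seconds')

    fig, ax = plt.subplots(figsize=(6, 4))
    # human-readable text for each cell; the colors still follow the raw values
    annot = piv.applymap(lambda s: '{:.0e} s'.format(s))
    sns.heatmap(piv, annot=annot, fmt='s', ax=ax,
                norm=LogNorm(vmin=piv.values.min(), vmax=piv.values.max()),
                cbar_kws={'label': 'seconds'})
    # relabel ticks after the heatmap is drawn, as the example does
    ax.set_xticklabels([t.get_text().upper() for t in ax.get_xticklabels()],
                       rotation=0)
    plt.show()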
Example #28
0
def draw_points(xy,
                color='blue',
                class_idxs=None,
                classes=None,
                ax=None,
                alpha=None,
                radius=1,
                **kwargs):
    """

    Args:
        xy (ndarray): of points.

    Example:
        >>> from kwplot.mpl_draw import *  # NOQA
        >>> import kwimage
        >>> xy = kwimage.Points.random(10).xy
        >>> draw_points(xy, radius=0.01)
        >>> draw_points(xy, class_idxs=np.random.randint(0, 3, 10),
        >>>         radius=0.01, classes=['a', 'b', 'c'], color='classes')

    Ignore:
        >>> import kwplot
        >>> kwplot.autompl()
    """
    import kwimage
    import matplotlib as mpl
    from matplotlib import pyplot as plt
    if ax is None:
        ax = plt.gca()

    xy = xy.reshape(-1, 2)

    # More grouped patches == more efficient runtime
    if alpha is None:
        alpha = [1.0] * len(xy)
    elif not ub.iterable(alpha):
        alpha = [alpha] * len(xy)

    if color == 'distinct':
        colors = kwimage.Color.distinct(len(alpha))
    elif color == 'classes':
        # TODO: read colors from categories if they exist
        if class_idxs is None or classes is None:
            raise Exception(
                'cannot draw class colors without class_idxs and classes')
        cls_colors = kwimage.Color.distinct(len(classes))
        # map each point to the distinct color of its class
        colors = list(ub.take(cls_colors, class_idxs))
    else:
        colors = [color] * len(alpha)

    ptcolors = [
        kwimage.Color(c, alpha=a).as01('rgba') for c, a in zip(colors, alpha)
    ]
    color_groups = ub.group_items(range(len(ptcolors)), ptcolors)

    circlekw = {
        'radius': radius,
        'fill': True,
        'ec': None,
    }
    if 'fc' in kwargs:
        import warnings
        warnings.warn('specifying fc to draw_points overrides '
                      'the color argument. Use color instead')
    circlekw.update(kwargs)
    fc = circlekw.pop('fc', None)  # hack

    collections = []
    for pcolor, idxs in color_groups.items():

        # hack for fc
        if fc is not None:
            pcolor = fc

        patches = [
            mpl.patches.Circle((x, y), fc=pcolor, **circlekw)
            for x, y in xy[idxs]
        ]
        col = mpl.collections.PatchCollection(patches, match_original=True)
        collections.append(col)
        ax.add_collection(col)
    return collections
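The comment "More grouped patches == more efficient runtime" is the key idea here: rather than one artist per point, points are bucketed by their RGBA color with ub.group_items and each bucket becomes a single PatchCollection. Stripped of the kwimage dependency, the pattern looks roughly like this (random data and colors are illustrative only):

    # Sketch of the group-by-color pattern: one PatchCollection per color
    # instead of one artist per point.
    import numpy as np
    import ubelt as ub
    import matplotlib as mpl
    from matplotlib import pyplot as plt

    xy = np.random.rand(100, 2)
    colors = np.random.choice(['red', 'green', 'blue'], size=len(xy)).tolist()

    # group point indices by color so each color becomes a single artist
    color_groups = ub.group_items(range(len(xy)), colors)

    ax = plt.gca()
    for color, idxs in color_groups.items():
        patches = [mpl.patches.Circle((x, y), radius=0.01, fc=color)
                   for x, y in xy[idxs]]
        ax.add_collection(
            mpl.collections.PatchCollection(patches, match_original=True))
    ax.autoscale_view()
    plt.show()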
Example #29
0
def draw_boxes(boxes,
               alpha=None,
               color='blue',
               labels=None,
               centers=False,
               fill=False,
               ax=None,
               lw=2):
    """
    Args:
        boxes (kwimage.Boxes):
        labels (List[str]): of labels
        alpha (List[float]): alpha for each box
        centers (bool): draw centers or not
        lw (float): linewidth

    Example:
        >>> import kwimage
        >>> bboxes = kwimage.Boxes([[.1, .1, .6, .3], [.3, .5, .5, .6]], 'xywh')
        >>> draw_boxes(bboxes)
        >>> #kwplot.autompl()
    """
    import kwplot
    import matplotlib as mpl
    from matplotlib import pyplot as plt
    if ax is None:
        ax = plt.gca()

    xywh = boxes.to_xywh().data

    transparent = kwplot.Color((0, 0, 0, 0)).as01('rgba')

    # More grouped patches == more efficient runtime
    if alpha is None:
        alpha = [1.0] * len(xywh)
    elif not ub.iterable(alpha):
        alpha = [alpha] * len(xywh)

    edgecolors = [kwplot.Color(color, alpha=a).as01('rgba') for a in alpha]
    color_groups = ub.group_items(range(len(edgecolors)), edgecolors)
    for edgecolor, idxs in color_groups.items():
        if fill:
            fc = edgecolor
        else:
            fc = transparent
        rectkw = dict(ec=edgecolor, fc=fc, lw=lw, linestyle='solid')
        patches = [
            mpl.patches.Rectangle((x, y), w, h, **rectkw)
            for x, y, w, h in xywh[idxs]
        ]
        col = mpl.collections.PatchCollection(patches, match_original=True)
        ax.add_collection(col)

    if centers not in [None, False]:
        default_centerkw = {
            # 'radius': 1,
            'fill': True
        }
        centerkw = default_centerkw.copy()
        if isinstance(centers, dict):
            centerkw.update(centers)
        xy_centers = boxes.xy_center
        for fcolor, idxs in color_groups.items():
            # TODO: radius based on size of bbox
            # if 'radius' not in centerkw:
            #     boxes.area[idxs]

            patches = [
                mpl.patches.Circle((x, y), ec=None, fc=fcolor, **centerkw)
                for x, y in xy_centers[idxs]
            ]
            col = mpl.collections.PatchCollection(patches, match_original=True)
            ax.add_collection(col)

    if labels:
        texts = []
        default_textkw = {
            'horizontalalignment': 'left',
            'verticalalignment': 'top',
            'backgroundcolor': (0, 0, 0, .8),
            'color': 'white',
            'fontproperties': mpl.font_manager.FontProperties(
                size=6, family='monospace'),
        }
        tkw = default_textkw.copy()
        for (x1, y1, w, h), label in zip(xywh, labels):
            texts.append((x1, y1, label, tkw))
        for (x1, y1, catname, tkw) in texts:
            ax.text(x1, y1, catname, **tkw)
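A possible usage sketch, assuming kwimage and kwplot are installed and the function above is importable (box coordinates and labels are arbitrary):

    # Hypothetical call to draw_boxes with labels and center markers.
    import kwimage
    from matplotlib import pyplot as plt

    boxes = kwimage.Boxes([[10, 10, 60, 30], [30, 50, 50, 60]], 'xywh')
    draw_boxes(boxes, labels=['cat', 'dog'], centers={'radius': 2}, alpha=0.8)
    plt.gca().set_xlim(0, 100)
    plt.gca().set_ylim(0, 120)
    plt.show()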
Example #30
0
def main():
    # TODO: progressive hashing data structure
    inv1 = Inventory('/media/joncrall/raid/', blocklist)
    inv2 = Inventory('/media/joncrall/media', blocklist)

    # inv1 = Inventory('/media/joncrall/raid/Applications/NotGames', blocklist)
    # inv2 = Inventory('/media/joncrall/media/Applications/NotGames', blocklist)
    # inv1 = Inventory('/media/joncrall/raid/Applications', blocklist)
    # inv2 = Inventory('/media/joncrall/media/Applications', blocklist)

    self = inv1  # NOQA

    inv1.build()
    inv2.build()

    thresh = {
        'frac': 0.5,
        # only use the first few MB to determine overlap
        'byte': 100 * int(2**20),
    }
    verbose = 1
    pfiles1 = inv1.pfiles
    pfiles2 = inv2.pfiles
    overlap, only1, only2 = ProgressiveFile.likely_overlaps(pfiles1,
                                                            pfiles2,
                                                            thresh=thresh,
                                                            verbose=verbose)

    stats = {
        'overlap': len(overlap),
        'only1': len(only1),
        'only2': len(only2),
    }
    print('stats = {}'.format(ub.repr2(stats, nl=1)))
    only2_list = sorted([p.fpath for group in only2.values() for p in group])
    print('only2_list = {}'.format(ub.repr2(only2_list, nl=1)))

    # for pfile in inv1.pfiles:
    #     pfile._check_integrity()

    import numpy as np
    mb_read = np.array([
        pfile._parts[-1][1] / int(2**20) for pfile in ub.ProgIter(inv2.pfiles)
    ])
    mb_read.max()
    mb_read.min()

    # Build all hashes up to a reasonable degree
    inv1.build_hashes(max_workers=0)

    maybe_dups = inv1.likely_duplicates(thresh=0.2)
    len(maybe_dups)

    maybe_dups = ub.sorted_keys(maybe_dups, key=lambda x: x[2])

    import networkx as nx
    import itertools as it
    # Check which directories are most likely to be duplicates
    graph = nx.Graph()

    for key, group in ub.ProgIter(maybe_dups.items(),
                                  total=len(maybe_dups),
                                  desc='build dup dir graph'):
        if key[0] == '':
            continue
        dpaths = [dirname(pfile.fpath) for pfile in group]
        for d1, d2 in it.combinations(dpaths, 2):
            graph.add_edge(d1, d2)
            edge = graph.edges[(d1, d2)]
            if 'dups' not in edge:
                edge['dups'] = 0
            edge['dups'] += 1

    edge_data = list(graph.edges(data=True))

    for dpath in ub.ProgIter(graph.nodes, desc='find lens'):
        num_children = len(os.listdir(dpath))
        graph.nodes[dpath]['num_children'] = num_children

    for d1, d2, dat in edge_data:
        nc1 = graph.nodes[d1]['num_children']
        nc2 = graph.nodes[d2]['num_children']
        ndups = dat['dups']
        dup_score = (dat['dups'] / min(nc1, nc2))
        dat['dup_score'] = dup_score
        if dup_score > 0.9:
            print('dup_score = {!r}'.format(dup_score))
            print('d1 = {!r}'.format(d1))
            print('d2 = {!r}'.format(d2))
            print('nc1 = {!r}'.format(nc1))
            print('nc2 = {!r}'.format(nc2))
            print('ndups = {!r}'.format(ndups))

    print('edge_data = {}'.format(ub.repr2(edge_data, nl=2)))

    print('maybe_dups = {}'.format(ub.repr2(maybe_dups.keys(), nl=3)))
    for key, group in maybe_dups.items():
        if key[0] == '':
            continue
        print('key = {!r}'.format(key))
        print('group = {}'.format(ub.repr2(group, nl=1)))
        for pfile in group:
            pfile.refined_to(float('inf'))

        print('key = {!r}'.format(key))

    inv2.build_hashes(max_workers=6, mode='thread')

    inv1.pfiles = [
        p for p in ub.ProgIter(inv1.pfiles, desc='exist check')
        if exists(p.fpath)
    ]
    inv2.pfiles = [
        p for p in ub.ProgIter(inv2.pfiles, desc='exist check')
        if exists(p.fpath)
    ]

    pfiles1 = inv1.pfiles
    pfiles2 = inv2.pfiles

    def compute_likely_overlaps(pfiles1, pfiles2):
        step_idx1 = ProgressiveFile.compatible_step_idx(pfiles1)
        step_idx2 = ProgressiveFile.compatible_step_idx(pfiles2)
        step_idx = min(step_idx1, step_idx2)
        grouped1 = ProgressiveFile.group_pfiles(pfiles1, step_idx=step_idx)
        grouped2 = ProgressiveFile.group_pfiles(pfiles2, step_idx=step_idx)

        thresh = 0.2
        verbose = 1

        # TODO: it would be nice if we didn't have to care about internal
        # deduplication when we attempt to find cross-set overlaps
        dups1 = ProgressiveFile.likely_duplicates(pfiles1,
                                                  thresh=thresh,
                                                  verbose=verbose)
        dups2 = ProgressiveFile.likely_duplicates(pfiles2,
                                                  thresh=thresh,
                                                  verbose=verbose)

        pfiles = pfiles1 + pfiles2
        dups3 = ProgressiveFile.likely_duplicates(pfiles,
                                                  thresh=thresh,
                                                  verbose=verbose)

        only_on_inv2 = {}
        for key, group in dups3.items():
            if not any(
                    item.fpath.startswith(inv1.root_fpath) for item in group):
                only_on_inv2[key] = group

        for p1 in inv1.pfiles:
            if 'Chase HQ 2 (JUE) [!].zip' in p1.fpath:
                break

        for p2 in inv2.pfiles:
            if 'Chase HQ 2 (JUE) [!].zip' in p2.fpath:
                break

        look = list(ub.flatten(only_on_inv2.values()))
        takealook = sorted([p.fpath for p in look])
        print('takealook = {}'.format(ub.repr2(takealook, nl=1)))

        keys1 = set(grouped1)
        keys2 = set(grouped2)

        missing_keys2 = keys2 - keys1
        missing_groups2 = ub.dict_subset(grouped2, missing_keys2)

        missing_fpaths2 = []
        for key, values in missing_groups2.items():
            print('key = {!r}'.format(key))
            print('values = {}'.format(ub.repr2(values, nl=1)))
            missing_fpaths2.extend(values)

        missing_fpaths2 = sorted([p.fpath for p in missing_fpaths2])
        print('missing_fpaths2 = {}'.format(ub.repr2(missing_fpaths2, nl=1)))
        # pass

        import xdev
        set_overlaps = xdev.set_overlaps(keys1, keys2)
        print('set_overlaps = {}'.format(ub.repr2(set_overlaps, nl=1)))
        # We want to know what files in set2 do not exist in set1

    if 0:
        fpath = inv1.all_fpaths[0]
        pfile = ProgressiveFile(fpath)

        fpath1 = '/media/joncrall/raid/unsorted/yet-another-backup/card-usb-drive/Transfer/Zebras/DownloadedLibraries/lightspeed/solve_triu.m'
        fpath2 = '/media/joncrall/raid/unsorted/yet-another-backup/card-usb-drive/Zebras/downloaded_libraries/lightspeed/solve_triu.m'

        fpath1 = '/media/joncrall/raid/Applications/Wii/WiiHacksAndStuff/CurrentHacks/Falco/DarkFalco02.pcs'
        fpath2 = '/media/joncrall/raid/Applications/Wii/WiiHacksAndStuff/CurrentHacks/Ivysaur/Kraid-v2-Ivy.pcs'

        pfile = pfile1 = ProgressiveFile(fpath1)
        pfile2 = ProgressiveFile(fpath2)

        pfile.maybe_equal(pfile2, thresh=0.1)

        fpath_demodata = inv1.all_fpaths[::len(inv1.all_fpaths) // 500]
        # fpaths = hash_groups1_dup['ef46db3751d8e999']
        pfiles_demodata = [ProgressiveFile(f) for f in fpath_demodata]

        def progressive_duplicates(pfiles, idx=1):
            step_ids = [pfile.refined_to(idx) for pfile in ub.ProgIter(pfiles)]
            final_groups = {}
            grouped = ub.group_items(pfiles, step_ids)
            for key, group in grouped.items():
                if len(group) > 1:
                    if all(not g.can_refine for g in group):
                        # Group is ~100% a real duplicate
                        final_groups[key] = group
                    else:
                        # group is still ambiguous; hash a bit more and recurse
                        deduped = progressive_duplicates(group, idx=idx + 1)
                        final_groups.update(deduped)
                else:
                    final_groups[key] = group
            return final_groups

        pfiles = pfiles_demodata
        final_groups = progressive_duplicates(pfiles)

        for key, group in final_groups.items():
            if len(group) > 1:
                print('key = {!r}'.format(key))
                print('group = {}'.format(ub.repr2(group, nl=1)))

        inv1.build_hashes()
        inv2.build_hashes()

        hash_groups1 = ub.group_items(inv1.all_fpaths, inv1.all_hashes)
        hash_groups2 = ub.group_items(inv2.all_fpaths, inv2.all_hashes)

        hash_groups1_dup = {
            k: v
            for k, v in hash_groups1.items() if len(v) > 1
        }
        hash_groups2_dup = {
            k: v
            for k, v in hash_groups2.items() if len(v) > 1
        }
        len(hash_groups1_dup)
        len(hash_groups2_dup)

        # common = set(hash_groups1) & set(hash_groups2)
        # xdev.set_overlaps(hash_groups1, hash_groups2)

        fnames1 = ub.group_items(inv1.all_fpaths, key=basename)
        fnames2 = ub.group_items(inv2.all_fpaths, key=basename)

        missing = ub.dict_diff(fnames2, fnames1)
        sorted(ub.flatten(missing.values()))
        len(missing)

        fpath_demodata = inv1.all_fpaths[::len(inv1.all_fpaths) // 500]

        def internal_deduplicate(self):
            hash_groups = ub.group_items(self.all_fpaths, self.all_hashes)
            hash_groups_dup = {
                k: v
                for k, v in hash_groups.items() if len(v) > 1
            }

            from os.path import dirname

            hash_groups_dup['ef46db3751d8e999']

            for key, values in hash_groups_dup.items():
                for v in values:
                    if v.endswith('.avi'):
                        break

                [basename(v) for v in values]
                [dirname(v) for v in values]
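The hash-grouping idiom that recurs throughout this example (group file paths by a content hash, then keep only groups with more than one member) can be written as a small standalone helper. The sketch below uses a plain sha256 of the whole file as an illustrative stand-in for the ProgressiveFile machinery:

    # Standalone sketch of the duplicate-detection idiom used above.
    import hashlib
    import ubelt as ub


    def find_exact_duplicates(fpaths):
        """ Map each content hash to the paths that share it (dups only). """
        def content_hash(fpath, blocksize=2 ** 20):
            hasher = hashlib.sha256()
            with open(fpath, 'rb') as file:
                while True:
                    block = file.read(blocksize)
                    if not block:
                        break
                    hasher.update(block)
            return hasher.hexdigest()

        hashes = [content_hash(f) for f in fpaths]
        # group paths by hash; groups with len > 1 are exact duplicates
        groups = ub.group_items(fpaths, hashes)
        return {k: v for k, v in groups.items() if len(v) > 1}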