Example #1
def test_multispectral_sql():
    try:
        import sqlalchemy
    except Exception:
        import pytest
        pytest.skip('requires sqlalchemy')

    import numpy as np
    import kwcoco
    import ubelt as ub
    dset1 = kwcoco.CocoDataset.demo('vidshapes1-multispectral')
    dset2 = dset1.view_sql(force_rewrite=True)

    dset2.basic_stats()

    name = ub.peek(dset1.index.name_to_img)
    img_dict = dset2.index.name_to_img[name]
    assert img_dict['name'] == name

    # file_name = ub.peek(dset1.index.file_name_to_img)
    # img_dict = dset2.index.name_to_img[name]
    # assert img_dict['name'] == name

    img1 = dset1.load_image(1, channels='B1')
    img2 = dset2.load_image(1, channels='B1')

    assert np.all(img1 == img2)
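
All of these snippets revolve around ub.peek, which returns the first item of an iterable without materializing it. A minimal illustration of its semantics (assuming only that ubelt is installed):

import ubelt as ub

assert ub.peek([3, 2, 1]) == 3            # first element of a list
assert ub.peek({'a': 1, 'b': 2}) == 'a'   # first key of a dict
it = iter(range(5))
assert ub.peek(it) == 0  # NOTE: peeking an iterator consumes the item
assert next(it) == 1     # the iterator has advanced past 0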
Example #2
def motherboard_info():
    """
    REQUIRES SUDO

    xdoctest -m ~/misc/notes/buildapc.py motherboard_info
    """
    import re
    info = ub.cmd('sudo dmidecode -t 9')
    pcie_slots = []
    chunks = info['out'].split('\n\n')
    for chunk in chunks:
        item = {}
        for line in chunk.split('\n'):
            # doesn't get all data correctly (e.g. characteristics)
            parts = re.split('\t*:', line, maxsplit=1)
            if len(parts) == 2:
                key, val = parts
                key = key.strip()
                val = val.strip()
                if key in item:
                    raise KeyError(f'key={key} already exists')
                item[key] = val
        if item:
            item = ub.map_keys(slugify_key, item)
            pcie_slots.append(item)

    pcie_usage = ub.dict_hist(item['current_usage'] for item in pcie_slots)

    _varied = varied_values(pcie_slots, min_variations=0)
    _varied = ub.map_keys(slugify_key, _varied)
    unvaried = {k: ub.peek(v) for k, v in _varied.items() if len(v) == 1}
    varied = {k: v for k, v in _varied.items() if len(v) > 1}

    print(info['out'])
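
This snippet assumes module-level helpers slugify_key and varied_values that are not shown (a stand-in for varied_values appears after Example #8). Given the "pip install python-slugify" requirement noted in Example #8, a plausible stand-in for slugify_key (hypothetical; the real helper may differ) is:

def slugify_key(text):
    # Normalize a dmidecode-style key like 'Current Usage' into a
    # dict-friendly slug like 'current_usage'.
    from slugify import slugify  # provided by python-slugify
    return slugify(text, separator='_')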
Example #3
    def difference(self, other):
        """
        Set difference

        Example:
            >>> self = ChannelSpec('rgb|disparity,flowx|flowy')
            >>> other = ChannelSpec('rgb')
            >>> self.difference(other)
            >>> other = ChannelSpec('flowx')
            >>> self.difference(other)
        """
        assert len(list(other.keys())) == 1, 'can take diff with one stream'
        other_norm = ub.oset(ub.peek(other.normalize().values()))
        self_norm = self.normalize()

        new_streams = []
        for key, parts in self_norm.items():
            new_parts = ub.oset(parts) - ub.oset(other_norm)
            # shrink the representation of a complex r|g|b to an alias if
            # possible.
            # TODO: make this more efficient
            for alias, alias_spec in self._known.items():
                alias_parts = ub.oset(alias_spec.split('|'))
                index = subsequence_index(new_parts, alias_parts)
                if index is not None:
                    oset_delitem(new_parts, index)
                    oset_insert(new_parts, index.start, alias)
            new_stream = '|'.join(new_parts)
            new_streams.append(new_stream)
        new_spec = ','.join(new_streams)
        new = self.__class__(new_spec)
        return new
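
subsequence_index, oset_delitem, and oset_insert are helpers not shown here. From the usage above, subsequence_index must return a slice locating one ordered set as a contiguous run inside another; a hypothetical reimplementation:

def subsequence_index(seq, subseq):
    # Return slice(i, i + len(subseq)) if subseq occurs contiguously in
    # seq, else None. (Sketch only; the real helper may differ.)
    seq, subseq = list(seq), list(subseq)
    m = len(subseq)
    if m == 0:
        return None
    for i in range(len(seq) - m + 1):
        if seq[i:i + m] == subseq:
            return slice(i, i + m)
    return None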
Example #4
    def forward(self, inputs):
        """
        Args:
            inputs (Tensor | dict): Either the input images (as a regular
                pytorch BxCxHxW Tensor) or a dictionary mapping input
                modalities to the input images.

        Returns:
             Dict[str, Tensor]: model output wrapped in a dictionary so it's
                 clear what the return type is. In this case "class_energy"
                 holds the class scores **before** softmax / normalization is
                 applied.
        """
        if isinstance(inputs, dict):
            # TODO: handle channel modalities later
            assert len(inputs) == 1, (
                'only support one fused stream (e.g. rgb) for now')
            im = ub.peek(inputs.values())
        else:
            im = inputs

        im = self.input_norm(im)
        class_energy = self.model(im)
        outputs = {
            'class_energy': class_energy,
        }
        return outputs
Example #5
def parse_mscoco():
    # Test that our implementation can handle the real mscoco data
    import numpy as np
    import ubelt as ub
    from os.path import join, normpath
    root = ub.expandpath('~/data/standard_datasets/mscoco/')

    fpath = join(root, 'annotations/instances_val2014.json')
    img_root = normpath(ub.ensuredir((root, 'images', 'val2014')))

    # fpath = join(root, 'annotations/stuff_val2017.json')
    # img_root = normpath(ub.ensuredir((root, 'images', 'val2017')))

    import ujson
    dataset = ujson.load(open(fpath, 'rb'))

    import ndsampler
    dset = ndsampler.CocoDataset(dataset)
    dset.img_root = img_root

    gid_iter = iter(dset.imgs.keys())

    gid = ub.peek(gid_iter)

    for gid in ub.ProgIter(gid_iter):
        img = dset.imgs[gid]
        ub.grabdata(img['coco_url'], dpath=img_root, verbose=0)
        anns = [dset.anns[aid] for aid in dset.gid_to_aids[gid]]
        dset.show_image(gid=gid)

    ann = anns[0]

    segmentation = ann['segmentation']

    from PIL import Image
    gpath = join(dset.img_root, img['file_name'])
    with Image.open(gpath) as pil_img:
        np_img = np.array(pil_img)
Example #6
 def _resolve(_types):
     if len(_types) == 1:
         return ub.peek(_types)
     else:
         if unions:
             return ' | '.join(sorted(_types))
         else:
             return 'Any'
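
Note that _resolve closes over a `unions` flag from its enclosing scope. A usage sketch, assuming _resolve is defined at a scope where a global `unions` is visible:

unions = True
assert _resolve({'int'}) == 'int'
assert _resolve({'int', 'float'}) == 'float | int'  # sorted, pipe-joined
unions = False
assert _resolve({'int', 'float'}) == 'Any'  # mixed types collapse to Any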
Example #7
def rank_inventory(inventory):
    candidates = list(ub.flatten(list(pkmn.family(ancestors=False, node=True))
                                 for pkmn in inventory))

    groups = ub.group_items(candidates, key=lambda p: p.name)

    leages = {
        'master': {'max_cp': float('inf')},
        'ultra': {'max_cp': 2500},
        'great': {'max_cp': 1500},
        'little': {'max_cp': 500},
    }

    max_level = 45  # for XL candy
    # max_level = 40  # normal

    all_dfs = []

    for name, group in groups.items():
        print('\n\n------------\n\n')
        print('name = {!r}'.format(name))
        for leage_name, leage_filters in leages.items():
            max_cp = leage_filters['max_cp']
            print('')
            print(' ========== ')
            print(' --- {} in {} --- '.format(name, leage_name))
            not_eligible = [p for p in group if p.cp is not None and p.cp > max_cp]
            eligible = [p for p in group if p.cp is None or p.cp <= max_cp]
            print('not_eligible = {!r}'.format(not_eligible))
            if len(eligible) > 0:
                first = ub.peek(eligible)
                have_ivs = eligible
                df = first.leage_rankings_for(have_ivs, max_cp=max_cp,
                                              max_level=max_level)
                all_dfs.append(df)
            else:
                print('none eligible')

    # Print out the best ranks for each set of IVs over all possible forms
    # (lets you know which ones can be transferred safely)

    iv_to_rank = ub.ddict(list)
    for df in all_dfs:
        if df is not None:
            df = df.set_index(['iva', 'ivd', 'ivs'])
            for iv, rank in zip(df.index, df['rank']):
                iv_to_rank[iv].append(rank)

    iv_to_best_rank = ub.map_vals(sorted, iv_to_rank)
    iv_to_best_rank = ub.sorted_vals(iv_to_best_rank)
    print('iv_to_best_rank = {}'.format(ub.repr2(iv_to_best_rank, nl=1, align=':')))
Example #8
def parse_cpu_info(percore=False):
    """
    Get a nice summary of CPU information

    Requirements:
        pip install python-slugify

    Ignore:
        cpu_info = parse_cpu_info()
        print(cpu_info['varied']['cpu_mhz'])
        print('cpu_info = {}'.format(ub.repr2(cpu_info, nl=3)))

    Notes:
        * lscpu
    """
    # ALSO available via the py-cpuinfo package (result is currently unused
    # and overwritten below)
    import cpuinfo
    _ = cpuinfo.get_cpu_info()

    import re
    info = ub.cmd('cat /proc/cpuinfo')
    cpu_lines = info['out'].split('\n\n')
    cores = []
    for lines in cpu_lines:
        core = {}
        for line in lines.split('\n'):
            parts = re.split('\t*:', line, maxsplit=1)
            if len(parts) == 2:
                key, val = parts
                key = key.strip()
                val = val.strip()
                if key in core:
                    raise KeyError(f'key={key} already exists')
                core[key] = val
        if len(core):
            core = ub.map_keys(slugify_key, core)
            cores.append(core)
    _varied = varied_values(cores, min_variations=0)
    unvaried = {k: ub.peek(v) for k, v in _varied.items() if len(v) == 1}
    varied = {k: v for k, v in _varied.items() if len(v) > 1}

    cpu_info = {
        'varied': varied,
        'unvaried': unvaried,
    }
    if percore:
        cpu_info['cores'] = cores
    return cpu_info
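
varied_values (used here and in Example #2) is another assumed helper; ubelt provides ub.varied_values with a similar contract. A hypothetical stand-in consistent with how the result is consumed above:

def varied_values(dict_list, min_variations=0):
    # Map each key to the set of values it takes across the dicts,
    # keeping keys with more than min_variations distinct values.
    all_keys = set()
    for d in dict_list:
        all_keys.update(d.keys())
    _varied = {}
    for key in all_keys:
        values = {d.get(key, None) for d in dict_list}
        if len(values) > min_variations:
            _varied[key] = values
    return _varied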
Example #9
def _broadcast_colors(color, num, img, colorspace):
    """
    Determine if color applies a single color to all ``num`` items, or if it is
    a list of colors for each item. Return as a list of colors for each item.

    TODO:
        - [ ] add as classmethod of kwimage.Color

    Example:
        >>> img = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)
        >>> colorspace = 'rgb'
        >>> color = color_str_list = ['red', 'green', 'blue']
        >>> color_str = 'red'
        >>> num = 3
        >>> print(_broadcast_colors(color_str_list, num, img, colorspace))
        >>> print(_broadcast_colors(color_str, num, img, colorspace))
        >>> colors_tuple_list = _broadcast_colors(color_str_list, num, img, colorspace)
        >>> print(_broadcast_colors(colors_tuple_list, num, img, colorspace))
        >>> #
        >>> # FIXME: This case seems broken
        >>> colors_ndarray_list = np.array(_broadcast_colors(color_str_list, num, img, colorspace))
        >>> print(_broadcast_colors(colors_ndarray_list, num, img, colorspace))
    """
    # Note there is an ambiguity when num=3 and color=[int, int, int]
    # that must be resolved by checking num channels in the image
    import kwimage
    import ubelt as ub
    import numbers

    needs_broadcast = True  # by default, assume a per-item list was not given
    if ub.iterable(color):
        first = ub.peek(color)
        if len(color) == num:
            if len(color) <= 4 and isinstance(first, numbers.Number):
                # ambiguous case, interpret as a single broadcastable color
                needs_broadcast = True
            else:
                # This is the only case where we don't need to broadcast
                needs_broadcast = False

    if needs_broadcast:
        color = kwimage.Color(color)._forimage(img, colorspace)
        colors = [color] * num
    else:
        colors = [kwimage.Color(c)._forimage(img, colorspace) for c in color]
    return colors
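
A short sketch of the ambiguous case the comments mention (hypothetical call, assuming kwimage and numpy are installed and _broadcast_colors is defined as above):

import numpy as np
img = (np.random.rand(8, 8, 3) * 255).astype(np.uint8)
# num=3 with a 3-element numeric sequence is ambiguous; it resolves to a
# single color broadcast to all three items rather than three colors.
colors = _broadcast_colors([255, 0, 0], 3, img, 'rgb')
assert len(colors) == 3
assert all(tuple(c) == tuple(colors[0]) for c in colors)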
Example #10
    def __init__(repo, **kwargs):
        repo.name = kwargs.pop('name', None)
        repo.dpath = kwargs.pop('dpath', None)
        repo.code_dpath = kwargs.pop('code_dpath', None)
        repo.remotes = kwargs.pop('remotes', None)
        repo.remote = kwargs.pop('remote', None)
        repo.branch = kwargs.pop('branch', 'master')

        repo._logged_lines = []
        repo._logged_cmds = []

        if repo.remote is None:
            if repo.remotes is None:
                raise ValueError('must specify some remote')
            else:
                if len(repo.remotes) > 1:
                    raise ValueError('remotes are ambiguous, specify one')
                else:
                    repo.remote = ub.peek(repo.remotes)
        else:
            if repo.remotes is None:
                _default_remote = 'origin'
                repo.remotes = {_default_remote: repo.remote}
                repo.remote = _default_remote

        repo.url = repo.remotes[repo.remote]

        if repo.name is None:
            suffix = repo.url.split('/')[-1]
            repo.name = suffix.split('.git')[0]

        if repo.dpath is None:
            repo.dpath = join(repo.code_dpath, repo.name)

        repo.pkg_dpath = join(repo.dpath, repo.name)

        for path_attr in ['dpath', 'code_dpath']:
            path = getattr(repo, path_attr)
            if path is not None:
                setattr(repo, path_attr, ub.expandpath(path))

        repo.verbose = kwargs.pop('verbose', 3)
        if kwargs:
            raise ValueError('unknown kwargs = {}'.format(kwargs.keys()))

        repo._pygit = None
Example #11
 def from_coco(cls, data, dims=None):
     """
     Accepts either new-style or old-style coco polygons
     """
     if isinstance(data, list):
         if len(data) > 0:
             assert isinstance(ub.peek(data), numbers.Number)
             exterior = np.array(data).reshape(-1, 2)
             self = cls(exterior=exterior)
         else:
             self = cls(exterior=[])
     elif isinstance(data, dict):
         assert 'exterior' in data
         self = cls(**data)
     else:
         raise TypeError(type(data))
     return self
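
A usage sketch contrasting the two accepted polygon formats (hypothetical, assuming this method is bound as kwimage.Polygon.from_coco):

import kwimage
import numpy as np
# old-style coco: a flat [x1, y1, x2, y2, ...] coordinate list
poly1 = kwimage.Polygon.from_coco([0, 0, 10, 0, 10, 10])
# new-style coco: a dict with an (N, 2) 'exterior' array
poly2 = kwimage.Polygon.from_coco(
    {'exterior': np.array([[0, 0], [10, 0], [10, 10]])})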
Example #12
    def __init__(self,
                 arch='resnet50',
                 classes=1000,
                 channels='rgb',
                 input_stats=None):
        super(ClfModel, self).__init__()

        import ndsampler
        if input_stats is None:
            input_stats = {}
        input_norm = nh.layers.InputNorm(**input_stats)

        self.classes = ndsampler.CategoryTree.coerce(classes)

        self.channels = ChannelSpec.coerce(channels)
        chann_norm = self.channels.normalize()
        assert len(chann_norm) == 1
        in_channels = len(ub.peek(chann_norm.values()))
        num_classes = len(self.classes)

        if arch == 'resnet50':
            from torchvision import models
            model = models.resnet50()
            new_conv1 = torch.nn.Conv2d(in_channels,
                                        64,
                                        kernel_size=7,
                                        stride=3,
                                        padding=3,
                                        bias=False)
            new_fc = torch.nn.Linear(2048, num_classes, bias=True)
            new_conv1.weight.data[:, 0:in_channels, :, :] = \
                model.conv1.weight.data[0:, 0:in_channels, :, :]
            new_fc.weight.data[0:num_classes, :] = model.fc.weight.data[
                0:num_classes, :]
            new_fc.bias.data[0:num_classes] = model.fc.bias.data[0:num_classes]
            model.fc = new_fc
            model.conv1 = new_conv1
        else:
            raise KeyError(arch)

        self.input_norm = input_norm
        self.model = model

        self.coder = ClfCoder(self.classes)
Example #13
def module_version_infos():
    """

    References:
        https://packaging.python.org/guides/single-sourcing-package-version/
    """
    try:
        from importlib import metadata
    except ImportError:
        # Running on pre-3.8 Python; use importlib-metadata package
        import importlib_metadata as metadata
    import sys
    modnames = ['torch', 'cv2', 'netharn', 'PIL', 'numpy']
    infos = []
    for modname in modnames:
        info = {'name': modname}

        try:
            module = sys.modules[modname]
            version_0 = getattr(module, '__version__', None)
        except Exception:
            version_0 = None

        try:
            version_1 = metadata.version(modname)
        except Exception:
            version_1 = None

        possible_versions = {version_1, version_0} - {None}
        if len(possible_versions) == 1:
            info['version'] = ub.peek(possible_versions)
        else:
            info['possible_versions'] = possible_versions

        if modname == 'torch':
            import torch
            info['torch.version.cuda'] = torch.version.cuda
            info['torch.cuda.is_available()'] = torch.cuda.is_available()

        infos.append(info)

    # The conda info step is too slow (3 seconds)
    from netharn.util.collect_env import get_env_info
    env_info = get_env_info()._asdict()
    info['__env__'] = env_info
    # return the collected per-module version info
    return infos
Example #14
def _kwiver_to_kwimage_detections(detected_objects):
    """
    Convert vital detected object sets to kwimage.Detections

    Args:
        detected_objects (kwiver.vital.types.DetectedObjectSet)

    Returns:
        kwimage.Detections
    """
    import ubelt as ub
    import kwimage
    import numpy as np
    boxes = []
    scores = []
    class_idxs = []

    classes = []
    if len(detected_objects) > 0:
        obj = ub.peek(detected_objects)
        classes = obj.type().all_class_names()

    for obj in detected_objects:
        box = obj.bounding_box()
        tlbr = [box.min_x(), box.min_y(), box.max_x(), box.max_y()]
        score = obj.confidence()
        cname = obj.type().get_most_likely_class()
        cidx = classes.index(cname)
        boxes.append(tlbr)
        scores.append(score)
        class_idxs.append(cidx)

    dets = kwimage.Detections(
        boxes=kwimage.Boxes(np.array(boxes), 'tlbr'),
        scores=np.array(scores),
        class_idxs=np.array(class_idxs),
        classes=classes,
    )
    return dets
Example #15
def _main_device_id_from_data(item):
    """
    Get device ids of a model

    Example:
        >>> device_ids = _main_device_id_from_data(torch.randn(3))
        >>> print('device_ids = {!r}'.format(device_ids))
        >>> if torch.cuda.is_available():
        >>>     device_ids = _main_device_id_from_data(torch.randn(3).to('cuda'))
        >>>     print('device_ids = {!r}'.format(device_ids))
        >>>     for i in range(torch.cuda.device_count()):
        >>>         device_ids = _main_device_id_from_data(torch.randn(3).to(i))
        >>>         print('device_ids = {!r}'.format(device_ids))
    """
    if hasattr(item, 'device'):
        return item.device.index
    if hasattr(item, 'is_cuda'):
        if item.is_cuda:
            # Tensor.get_device() already returns the integer device index
            return item.get_device()
        else:
            return None
    elif hasattr(item, 'state_dict'):
        devices = [item.device for item in item.state_dict().values()]
        _device_ids = set()
        for device in devices:
            if device.type == 'cuda':
                index = device.index or 0
                _device_ids.add(index)
            else:
                _device_ids.add(None)
        try:
            _device_ids = sorted(_device_ids)
        except TypeError:
            raise Exception('cannot currently mix CPU and GPU')
        _device_id = ub.peek(_device_ids)
        return _device_id
    else:
        raise TypeError(type(item))
Example #16
    def show_sample(self):
        """
        CommandLine:
            python ~/code/netharn/netharn/examples/ggr_matching.py RandomBalancedIBEISSample.show_sample --show

        Example:
            >>> import sys
            >>> sys.path.append('/home/joncrall/code/netharn/examples')
            >>> from ggr_matching import *
            >>> self = RandomBalancedIBEISSample.from_dbname('PZ_MTEST')
            >>> nh.util.autompl()
            >>> self.show_sample()
            >>> nh.util.show_if_requested()
        """
        vis_dataloader = torch.utils.data.DataLoader(self,
                                                     shuffle=True,
                                                     batch_size=8)
        example_batch = ub.peek(vis_dataloader)

        concatenated = torch.cat((example_batch[0], example_batch[1]), 0)
        tensor = torchvision.utils.make_grid(concatenated)
        im = tensor.numpy().transpose(1, 2, 0)
        nh.util.imshow(im)
Example #17
def benchmark_hash_file():
    """
    CommandLine:
        python ~/code/ubelt/dev/bench_hash.py --show
    """
    import ubelt as ub
    import random

    # dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp'))
    dpath = ub.ensuredir(ub.expandpath('$HOME/tmp'))

    rng = random.Random(0)
    # Create a pool of random chunks of data
    chunksize = int(2 ** 20)
    pool_size = 8
    part_pool = [_random_data(rng, chunksize) for _ in range(pool_size)]

    #ITEM = 'JUST A STRING' * 100
    HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3']

    scales = list(range(5, 10))
    import os

    results = ub.AutoDict()
    # Using json conversion is faster, or at least as fast, in most cases.
    # xxhash is also significantly faster than sha512.
    ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms')
    for s in ub.ProgIter(scales, desc='benchmark', verbose=3):
        N = 2 ** s
        print(' --- s={s}, N={N} --- '.format(s=s, N=N))
        # Write a big file
        size_pool = [N]
        fpath = _write_random_file(dpath, part_pool, size_pool, rng)

        megabytes = os.stat(fpath).st_size / (2 ** 20)
        print('megabytes = {!r}'.format(megabytes))

        for hasher in HASHERS:
            for timer in ti.reset(hasher):
                ub.hash_file(fpath, hasher=hasher)
            results[hasher].update({N: ti.mean()})
        col = {h: results[h][N] for h in HASHERS}
        sortx = ub.argsort(col)
        ranking = ub.dict_subset(col, sortx)
        print('walltime: ' + ub.repr2(ranking, precision=9, nl=0))
        best = next(iter(ranking))
        #pairs = list(ub.iter_window( 2))
        pairs = [(k, best) for k in ranking]
        ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs]
        nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs]
        relratios = ub.odict(zip(nicekeys, ratios))
        print('speedup: ' + ub.repr2(relratios, precision=4, nl=0))
    # xdoc +REQUIRES(--show)
    # import pytest
    # pytest.skip()
    import pandas as pd
    df = pd.DataFrame.from_dict(results)
    df.columns.name = 'hasher'
    df.index.name = 'N'
    ratios = df.copy().drop(columns=df.columns)
    for k1, k2 in [('sha512', 'xxh64'), ('sha1', 'xxh64'), ('xxh32', 'xxh64'), ('blake3', 'xxh64')]:
        ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2]
    print()
    print('Seconds per iteration')
    print(df.to_string(float_format='%.9f'))
    print()
    print('Ratios of seconds')
    print(ratios.to_string(float_format='%.2f'))
    print()
    print('Average Ratio (over all N)')
    print(ratios.mean().sort_values())
    if ub.argflag('--show'):
        import kwplot
        kwplot.autompl()
        xdata = sorted(ub.peek(results.values()).keys())
        ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results)
        kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds')
        kwplot.show_if_requested()
Example #18
    def get_pokemon_info(api, name, form=None):
        """
        Example:
            >>> from pypogo.pogo_api import *  # NOQA
            >>> api = PogoAPI()
            >>> name = 'stunfisk_galarian'
            >>> print(ub.repr2(api.get_pokemon_info(name)))
            >>> name = 'stunfisk'
            >>> print(ub.repr2(api.get_pokemon_info(name)))
            >>> name = 'umbreon'
            >>> print(ub.repr2(api.get_pokemon_info(name)))
            >>> name = 'eevee'
            >>> print(ub.repr2(api.get_pokemon_info(name)))
            >>> name = 'castform_snowy'
            >>> print(ub.repr2(api.get_pokemon_info(name)))

            >>> name = 'smeargle'
            >>> print(ub.repr2(api.get_pokemon_info(name)))

            >>> name = 'wormadam'
            >>> print(ub.repr2(api.get_pokemon_info(name)))

        """
        try:
            name_, form_ = api.normalize_name_and_form(name, form)
            form_ = form_.lower()
        except Exception:
            raise Exception('name={name}, form={form}'.format(**locals()))

        try:
            infos = {
                'stats': api.name_to_stats[name_],
                'evolutions': api.name_to_evolutions[name_],
                'type': api.name_to_type[name_],
                'moves': api.name_to_moves[name_],
            }
        except Exception:
            if True:
                all_names = list(api.name_to_stats.keys())
                suggest_spelling_correction(name, all_names, top=10)
            raise Exception(
                'name={name}, form={form}, name_={name_}, form_={form_}'.
                format(**locals()))

        info = {}
        for info_type, all_infos in infos.items():
            part = None
            form_to_info = ub.group_items(all_infos,
                                          lambda _info: _info['form'].lower())
            if form_ in form_to_info:
                parts = form_to_info[form_]
            else:
                if info_type != 'evolutions':
                    print('info_type = {!r}'.format(info_type))
                    print('form_to_info = {}'.format(
                        ub.repr2(form_to_info, nl=1)))
                    import warnings
                    msg = 'Unable to find name={} form_={} form={}, info_type={}'.format(
                        name, form_, form, info_type)
                    print(msg)
                    warnings.warn(msg)
                    parts = ub.peek(form_to_info.values())
                else:
                    parts = None

            if parts is None:
                part = []
            else:
                if len(parts) != 1:
                    print('parts = {!r}'.format(parts))
                    raise Exception
                part = parts[0]
            info.update(part)

        if 1:
            # TODO: remove
            fast_moves = set()
            charge_moves = set()

            for move in info['fast_moves']:
                fast_moves.add(normalize(move))
            for move in info['elite_fast_moves']:
                fast_moves.add(normalize(move))
            for move in info['charged_moves']:
                charge_moves.add(normalize(move))
            for move in info['elite_charged_moves']:
                charge_moves.add(normalize(move))

            if form_ == 'normal':
                if info['form'] == 'Shadow':
                    charge_moves.add('FRUSTRATION')
                    charge_moves.add('RETURN')

            if name_ not in api.learnable:
                api.learnable[name_] = {}
            api.learnable[name_]['fast'] = sorted(fast_moves)
            api.learnable[name_]['charge'] = sorted(charge_moves)

        api.LEVEL_CAP = 51
        return info
Example #19
def _lcs_iter_simple(full_seq1, full_seq2, open_to_close, node_affinity,
                     open_to_tok):
    """
    Converts _lcs_recursive to an iterative algorithm using a fairly
    straightforward method that effectively simulates the call stack
    """
    all_decomp1 = generate_all_decompositions(full_seq1, open_to_close,
                                              open_to_tok)
    all_decomp2 = generate_all_decompositions(full_seq2, open_to_close,
                                              open_to_tok)

    args0 = (full_seq1, full_seq2)
    frame0 = args0
    stack = [frame0]

    _results = {}
    # Populate base cases
    empty1 = type(ub.peek(all_decomp1.keys()))()
    empty2 = type(ub.peek(all_decomp2.keys()))()
    best = (empty1, empty2)
    base_result = (0, best)
    for seq1 in all_decomp1.keys():
        key1 = seq1
        t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[key1]
        _results[(seq1, empty2)] = base_result
        _results[(head1, empty2)] = base_result
        _results[(tail1, empty2)] = base_result
        _results[(head_tail1, empty2)] = base_result

    for seq2 in all_decomp2.keys():
        key2 = seq2
        t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[key2]
        _results[(empty1, seq2)] = base_result
        _results[(empty1, head2)] = base_result
        _results[(empty1, tail2)] = base_result
        _results[(empty1, head_tail2)] = base_result

    del args0
    del frame0
    del empty1
    del empty2
    del best
    del base_result

    missing_frames = []
    while stack:
        key = stack.pop()
        if key not in _results:
            seq1, seq2 = key
            missing_frames.clear()

            # try:
            t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[seq1]
            # except KeyError:
            #     a1, b1, head1, tail1 = balanced_decomp_unsafe(seq1, open_to_close)
            #     head_tail1 = head1 + tail1
            #     all_decomp1[seq1] = a1, b1, head1, tail1, head_tail1

            # try:
            t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[seq2]
            # except KeyError:
            #     a2, b2, head2, tail2 = balanced_decomp_unsafe(seq2, open_to_close)
            #     head_tail2 = head2 + tail2
            #     all_decomp2[seq2] = a2, b2, head2, tail2, head_tail2

            # Case 2: The current edge in sequence1 is deleted
            try:
                try_key = (head_tail1, seq2)
                cand1 = _results[try_key]
            except KeyError:
                missing_frames.append(try_key)

            # Case 3: The current edge in sequence2 is deleted
            try:
                try_key = (seq1, head_tail2)
                cand2 = _results[try_key]
            except KeyError:
                missing_frames.append(try_key)

            # Case 1: The LCS involves this edge
            affinity = node_affinity(t1, t2)
            if affinity:
                try:
                    try_key = (head1, head2)
                    pval_h, new_heads = _results[try_key]
                except KeyError:
                    missing_frames.append(try_key)

                try:
                    try_key = (tail1, tail2)
                    pval_t, new_tails = _results[try_key]
                except KeyError:
                    missing_frames.append(try_key)

                if not missing_frames:
                    new_head1, new_head2 = new_heads
                    new_tail1, new_tail2 = new_tails

                    subseq1 = a1 + new_head1 + b1 + new_tail1
                    subseq2 = a2 + new_head2 + b2 + new_tail2

                    res3 = (subseq1, subseq2)
                    val3 = pval_h + pval_t + affinity
                    cand3 = (val3, res3)
            else:
                cand3 = (-1, None)

            if missing_frames:
                # We did not solve this frame yet
                stack.append(key)
                stack.extend(missing_frames)
                # stack.extend(missing_frames[::-1])
            else:
                # We solved the frame
                _results[key] = max(cand1, cand2, cand3)

    val, best = _results[key]
    found = (best, val)
    return found
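
The control-flow trick above (pop a frame; if any of its subproblems are unsolved, push the frame back followed by the missing subproblems) is easier to see on a toy recurrence. A minimal sketch of the same pattern using only the standard library:

def fib_iter(n):
    # Iterative Fibonacci using the same "re-push the frame until its
    # dependencies are solved" pattern as _lcs_iter_simple.
    results = {0: 0, 1: 1}  # base cases, like the empty-sequence entries
    stack = [n]
    while stack:
        k = stack.pop()
        if k not in results:
            missing = [j for j in (k - 1, k - 2) if j not in results]
            if missing:
                stack.append(k)    # revisit once the children are solved
                stack.extend(missing)
            else:
                results[k] = results[k - 1] + results[k - 2]
    return results[n]

assert fib_iter(10) == 55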
Example #20
    def likely_overlaps(cls, pfiles1, pfiles2, thresh=0.2, verbose=1):
        """
        This is similar to finding duplicates, but between two sets of files

        Example:
            >>> fpaths = _demodata_files(num_files=100, rng=0)
            >>> fpaths1 = fpaths[0::2]
            >>> fpaths2 = fpaths[1::2]
            >>> pfiles1 = [ProgressiveFile(f) for f in fpaths1]
            >>> pfiles2 = [ProgressiveFile(f) for f in fpaths2]
            >>> overlap, only1, only2 = ProgressiveFile.likely_overlaps(pfiles1, pfiles2)
            >>> print(len(overlap))
            >>> print(len(only1))
            >>> print(len(only2))
        """
        final_groups = {}

        # Mark each set of files, so we only refine if a duplicate group
        # contains elements from multiple sets

        set1 = {id(p) for p in pfiles1}
        set2 = {id(p) for p in pfiles2}

        def _membership(p):
            partof = []
            pid = id(p)
            if pid in set1:
                partof.append(1)
            if pid in set2:
                partof.append(2)
            return partof

        pfiles = pfiles1 + pfiles2

        active_groups = [pfiles]
        mode = 'thread'
        max_workers = 6

        if isinstance(thresh, dict):
            frac_thresh = thresh.get('frac', None)
            byte_thresh = thresh.get('byte', None)
        else:
            frac_thresh = thresh
            byte_thresh = thresh

        while active_groups:
            group_sizes = list(map(len, active_groups))
            total_active = sum(group_sizes)
            print('Checking {} active groups with {} items'.format(
                len(active_groups), total_active))
            groups = ub.dict_union(
                *[ProgressiveFile.group_pfiles(g) for g in active_groups])

            # Mark all groups that need refinement
            refine_items = []
            next_groups = []
            for key, group in groups.items():
                membership = {m for p in group for m in _membership(p)}

                group_frac = key[3]
                group_byte = key[1]
                # Check if we have hashed enough of the file by fraction or
                # number of bytes.
                terms = []
                if frac_thresh is not None:
                    terms.append(group_frac >= frac_thresh)
                if byte_thresh is not None:
                    terms.append(group_byte >= byte_thresh)
                good_enough = any(terms) or len(terms) == 0

                if not good_enough and len(membership) > 1 and len(group) > 1:
                    next_groups.append(group)
                    needs_refine = [
                        item for item in group
                        if not item.complete_enough(frac_thresh=frac_thresh,
                                                    byte_thresh=byte_thresh)
                    ]
                    refine_items.extend(needs_refine)
                else:
                    # Any group that doesn't need refinement is added to the
                    # solution and will not appear in the next active group
                    final_groups[key] = group

            # Refine any item that needs it
            if len(refine_items):
                # TODO: if there are few enough items, just refine to the
                # threshold?
                ProgressiveFile.parallel_refine(refine_items,
                                                mode=mode,
                                                step_idx='next',
                                                max_workers=max_workers,
                                                verbose=verbose)

            # Continue refinement as long as there are active groups
            active_groups = next_groups

        only1 = {}
        only2 = {}
        overlap = {}
        for key, group in final_groups.items():
            membership = {m for p in group for m in _membership(p)}
            if len(membership) == 1:
                if ub.peek(membership) == 1:
                    only1[key] = group
                else:
                    only2[key] = group
            else:
                overlap[key] = group

        return overlap, only1, only2
Example #21
    def encode(self, item, axis=0, mode=1):
        """
        Given a dictionary containing preloaded components of the network
        inputs, build a concatenated (fused) network representations of each
        input stream.

        Args:
            item (Dict[str, Tensor]): a batch item containing unfused parts.
                each key should be a single-stream (optionally early fused)
                channel key.
            axis (int, default=0): concatenation dimension

        Returns:
            Dict[str, Tensor]:
                mapping between input stream and its early fused tensor input.

        Example:
            >>> from kwcoco.channel_spec import *  # NOQA
            >>> import numpy as np
            >>> dims = (4, 4)
            >>> item = {
            >>>     'rgb': np.random.rand(3, *dims),
            >>>     'disparity': np.random.rand(1, *dims),
            >>>     'flowx': np.random.rand(1, *dims),
            >>>     'flowy': np.random.rand(1, *dims),
            >>> }
            >>> # Complex Case
            >>> self = ChannelSpec('rgb,disparity,rgb|disparity|flowx|flowy,flowx|flowy')
            >>> fused = self.encode(item)
            >>> input_shapes = ub.map_vals(lambda x: x.shape, fused)
            >>> print('input_shapes = {}'.format(ub.repr2(input_shapes, nl=1)))
            >>> # Simpler case
            >>> self = ChannelSpec('rgb|disparity')
            >>> fused = self.encode(item)
            >>> input_shapes = ub.map_vals(lambda x: x.shape, fused)
            >>> print('input_shapes = {}'.format(ub.repr2(input_shapes, nl=1)))

        Example:
            >>> # Case where we have to break up early fused data
            >>> import numpy as np
            >>> dims = (40, 40)
            >>> item = {
            >>>     'rgb|disparity': np.random.rand(4, *dims),
            >>>     'flowx': np.random.rand(1, *dims),
            >>>     'flowy': np.random.rand(1, *dims),
            >>> }
            >>> # Complex Case
            >>> self = ChannelSpec('rgb,disparity,rgb|disparity,rgb|disparity|flowx|flowy,flowx|flowy,flowx,disparity')
            >>> inputs = self.encode(item)
            >>> input_shapes = ub.map_vals(lambda x: x.shape, inputs)
            >>> print('input_shapes = {}'.format(ub.repr2(input_shapes, nl=1)))

            >>> # xdoctest: +REQUIRES(--bench)
            >>> #self = ChannelSpec('rgb|disparity,flowx|flowy')
            >>> import timerit
            >>> ti = timerit.Timerit(100, bestof=10, verbose=2)
            >>> for timer in ti.reset('mode=simple'):
            >>>     with timer:
            >>>         inputs = self.encode(item, mode=0)
            >>> for timer in ti.reset('mode=minimize-concat'):
            >>>     with timer:
            >>>         inputs = self.encode(item, mode=1)

            import xdev
            _ = xdev.profile_now(self.encode)(item, mode=1)
        """
        import kwarray
        if len(item) == 0:
            raise ValueError('Cannot encode empty item')
        _impl = kwarray.ArrayAPI.coerce(ub.peek(item.values()))

        parsed = self.parse()
        # unique = self.unique()

        # TODO: This can be made much more efficient by determining if the
        # channels item can be directly translated to the result inputs. We
        # probably don't need to do the full decoding each and every time.

        if mode == 1:
            # Slightly more complex implementation that attempts to minimize
            # concat operations.
            item_keys = tuple(sorted(item.keys()))
            parsed_items = tuple(
                sorted([(k, tuple(v)) for k, v in parsed.items()]))
            new_fused_indices = _cached_single_fused_mapping(item_keys,
                                                             parsed_items,
                                                             axis=axis)

            fused = {}
            for key, idx_list in new_fused_indices.items():
                parts = [
                    item[item_key][item_sl] for item_key, item_sl in idx_list
                ]
                if len(parts) == 1:
                    fused[key] = parts[0]
                else:
                    fused[key] = _impl.cat(parts, axis=axis)
        elif mode == 0:
            # Simple implementation that always does the full break down of
            # item components.
            components = {}
            # Determine the layout of the channels in the input item
            key_specs = {key: ChannelSpec(key) for key in item.keys()}
            for key, spec in key_specs.items():
                decoded = spec.decode({key: item[key]}, axis=axis)
                for subkey, subval in decoded.items():
                    components[subkey] = subval

            fused = {}
            for key, parts in parsed.items():
                fused[key] = _impl.cat([components[k] for k in parts],
                                       axis=axis)
        else:
            raise KeyError(mode)

        return fused
Example #22
 def code_list(self):
     parsed = self.parse()
     if len(parsed) > 1:
         raise Exception('Can only work on single-streams. '
                         'TODO make class for single streams')
     return ub.peek(parsed.values())
Example #23
def _lcs_iter_prehash(full_seq1, full_seq2, open_to_close, node_affinity,
                      open_to_tok):
    """
    Version of the lcs iterative algorithm where we precompute hash values

    This is actually slower than the simple version
    """
    def decomp_info(seq, open_to_close):
        pop_open, pop_close, head, tail = balanced_decomp_unsafe(
            seq, open_to_close)
        head_tail = head + tail
        head_key = hash(head)
        tail_key = hash(tail)
        head_tail_key = hash(head_tail)
        tok = open_to_tok[pop_open[0]]
        a = pop_open
        b = pop_close
        info = (tok, seq, head, tail, head_tail, head_key, tail_key,
                head_tail_key, a, b)
        return info

    def gen_decomp_v2(seq, open_to_close):
        _genmemo = {}

        def _gen(seq):
            if seq:
                key = hash(seq)
                if key not in _genmemo:
                    info = decomp_info(seq, open_to_close)
                    head, tail, head_tail = info[2:5]
                    _genmemo[key] = info
                    yield (seq, _genmemo[key])
                    yield from _gen(head_tail)
                    yield from _gen(head)
                    yield from _gen(tail)

        all_decomp = dict(_gen(seq))
        return all_decomp

    all_decomp1 = gen_decomp_v2(full_seq1, open_to_close)
    all_decomp2 = gen_decomp_v2(full_seq2, open_to_close)

    key_decomp1 = {}
    key_decomp2 = {}
    _results = {}
    # Populate base cases
    empty1 = type(ub.peek(all_decomp1.keys()))()
    empty2 = type(ub.peek(all_decomp2.keys()))()
    empty1_key = hash(empty1)
    empty2_key = hash(empty2)
    best = (empty1, empty2)
    base_result = (0, best)
    for seq1, info1 in all_decomp1.items():
        seq1_key = hash(seq1)
        head1_key, tail1_key, head_tail1_key = all_decomp1[seq1][5:8]
        _results[(seq1_key, empty2_key)] = base_result
        _results[(head1_key, empty2_key)] = base_result
        _results[(tail1_key, empty2_key)] = base_result
        _results[(head_tail1_key, empty2_key)] = base_result
        key_decomp1[seq1_key] = info1

    for seq2, info2 in all_decomp2.items():
        seq2_key = hash(seq2)
        head2_key, tail2_key, head_tail2_key = all_decomp2[seq2][5:8]
        _results[(empty1_key, seq2_key)] = base_result
        _results[(empty1_key, head2_key)] = base_result
        _results[(empty1_key, tail2_key)] = base_result
        _results[(empty1_key, head_tail2_key)] = base_result
        key_decomp2[seq2_key] = info2

    full_seq1_key = hash(full_seq1)
    full_seq2_key = hash(full_seq2)
    key0 = (full_seq1_key, full_seq2_key)
    frame0 = key0, full_seq1, full_seq2
    stack = [frame0]
    missing_frames = []
    while stack:
        frame = stack.pop()
        key, seq1, seq2 = frame
        seq1_key, seq2_key = key
        if key not in _results:
            missing_frames.clear()

            try:
                info1 = key_decomp1[seq1_key]
            except KeyError:
                info1 = decomp_info(seq1, open_to_close)
                key_decomp1[seq1_key] = info1
            tok1, seq1, head1, tail1, head_tail1, head1_key, tail1_key, head_tail1_key, a1, b1 = info1

            try:
                info2 = key_decomp2[seq2_key]
            except KeyError:
                info2 = decomp_info(seq2, open_to_close)
                key_decomp2[seq2_key] = info2
            tok2, seq2, head2, tail2, head_tail2, head2_key, tail2_key, head_tail2_key, a2, b2 = info2

            affinity = node_affinity(tok1, tok2)

            # Case 2: The current edge in sequence1 is deleted
            try:
                try_key = (head_tail1_key, seq2_key)
                cand1 = _results[try_key]
            except KeyError:
                miss_frame = try_key, head_tail1, seq2
                missing_frames.append(miss_frame)

            # Case 3: The current edge in sequence2 is deleted
            try:
                try_key = (seq1_key, head_tail2_key)
                cand2 = _results[try_key]
            except KeyError:
                miss_frame = try_key, seq1, head_tail2
                missing_frames.append(miss_frame)

            # Case 1: The LCS involves this edge
            if affinity:
                try:
                    try_key = (head1_key, head2_key)
                    pval_h, new_heads = _results[try_key]
                except KeyError:
                    miss_frame = try_key, head1, head2
                    missing_frames.append(miss_frame)

                try:
                    try_key = (tail1_key, tail2_key)
                    pval_t, new_tails = _results[try_key]
                except KeyError:
                    miss_frame = try_key, tail1, tail2
                    missing_frames.append(miss_frame)

                if not missing_frames:
                    new_head1, new_head2 = new_heads
                    new_tail1, new_tail2 = new_tails

                    subseq1 = a1 + new_head1 + b1 + new_tail1
                    subseq2 = a2 + new_head2 + b2 + new_tail2

                    res3 = (subseq1, subseq2)
                    val3 = pval_h + pval_t + affinity
                    cand3 = (val3, res3)
            else:
                cand3 = (-1, None)

            if missing_frames:
                # We did not solve this frame yet
                stack.append(frame)
                stack.extend(missing_frames[::-1])
            else:
                # We solved the frame
                _results[key] = max(cand1, cand2, cand3)

    # The stack pop is our solution
    (val, best) = _results[key]
    found = (best, val)
    return found
Example #24
def _lcs_iter_simple_alt2(full_seq1, full_seq2, open_to_close, node_affinity,
                          open_to_tok):
    """
    Depth-first stack trajectory, with the try/except statements replaced by ifs
    """
    all_decomp1 = generate_all_decompositions(full_seq1, open_to_close,
                                              open_to_tok)
    all_decomp2 = generate_all_decompositions(full_seq2, open_to_close,
                                              open_to_tok)

    key0 = (full_seq1, full_seq2)
    frame0 = key0
    stack = [frame0]

    _results = {}
    # Populate base cases
    empty1 = type(ub.peek(all_decomp1.keys()))()
    empty2 = type(ub.peek(all_decomp2.keys()))()
    best = (empty1, empty2)
    base_result = (0, best)
    for seq1 in all_decomp1.keys():
        key1 = seq1
        t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[key1]
        _results[(seq1, empty2)] = base_result
        _results[(head1, empty2)] = base_result
        _results[(tail1, empty2)] = base_result
        _results[(head_tail1, empty2)] = base_result

    for seq2 in all_decomp2.keys():
        key2 = seq2
        t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[key2]
        _results[(empty1, seq2)] = base_result
        _results[(empty1, head2)] = base_result
        _results[(empty1, tail2)] = base_result
        _results[(empty1, head_tail2)] = base_result

    del frame0
    del empty1
    del empty2
    del best
    del base_result

    while stack:
        key = stack[-1]
        if key not in _results:
            seq1, seq2 = key

            t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[seq1]
            t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[seq2]

            # Case 2: The current edge in sequence1 is deleted
            try_key = (head_tail1, seq2)
            if try_key in _results:
                cand1 = _results[try_key]
            else:
                # stack.append(key)
                stack.append(try_key)
                continue

            # Case 3: The current edge in sequence2 is deleted
            try_key = (seq1, head_tail2)
            if try_key in _results:
                cand2 = _results[try_key]
            else:
                # stack.append(key)
                stack.append(try_key)
                continue

            # Case 1: The LCS involves this edge
            affinity = node_affinity(t1, t2)
            if affinity:
                try_key = (head1, head2)
                if try_key in _results:
                    pval_h, new_heads = _results[try_key]
                else:
                    # stack.append(key)
                    stack.append(try_key)
                    continue

                try_key = (tail1, tail2)
                if try_key in _results:
                    pval_t, new_tails = _results[try_key]
                else:
                    # stack.append(key)
                    stack.append(try_key)
                    continue

                new_head1, new_head2 = new_heads
                new_tail1, new_tail2 = new_tails

                subseq1 = a1 + new_head1 + b1 + new_tail1
                subseq2 = a2 + new_head2 + b2 + new_tail2

                res3 = (subseq1, subseq2)
                val3 = pval_h + pval_t + affinity
                cand3 = (val3, res3)
            else:
                cand3 = (-1, None)

            # We solved the frame
            _results[key] = max(cand1, cand2, cand3)
        stack.pop()

    val, best = _results[key0]
    found = (best, val)
    return found
Example #25
def benchmark_hash_data():
    """
    CommandLine:
        python ~/code/ubelt/dev/bench_hash.py --convert=True --show
        python ~/code/ubelt/dev/bench_hash.py --convert=False --show
    """
    import ubelt as ub
    #ITEM = 'JUST A STRING' * 100
    ITEM = [0, 1, 'a', 'b', ['JUST A STRING'] * 4]
    HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3']
    scales = list(range(5, 13))
    results = ub.AutoDict()
    # Using json conversion is faster, or at least as fast, in most cases.
    # xxhash is also significantly faster than sha512.
    # NOTE: compare against 'true' since the argument string was lowercased
    convert = ub.argval('--convert', default='True').lower() == 'true'
    print('convert = {!r}'.format(convert))
    ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms')
    for s in ub.ProgIter(scales, desc='benchmark', verbose=3):
        N = 2**s
        print(' --- s={s}, N={N} --- '.format(s=s, N=N))
        data = [ITEM] * N
        for hasher in HASHERS:
            for timer in ti.reset(hasher):
                ub.hash_data(data, hasher=hasher, convert=convert)
            results[hasher].update({N: ti.mean()})
        col = {h: results[h][N] for h in HASHERS}
        sortx = ub.argsort(col)
        ranking = ub.dict_subset(col, sortx)
        print('walltime: ' + ub.repr2(ranking, precision=9, nl=0))
        best = next(iter(ranking))
        #pairs = list(ub.iter_window( 2))
        pairs = [(k, best) for k in ranking]
        ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs]
        nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs]
        relratios = ub.odict(zip(nicekeys, ratios))
        print('speedup: ' + ub.repr2(relratios, precision=4, nl=0))
    # xdoc +REQUIRES(--show)
    # import pytest
    # pytest.skip()
    import pandas as pd
    df = pd.DataFrame.from_dict(results)
    df.columns.name = 'hasher'
    df.index.name = 'N'
    ratios = df.copy().drop(columns=df.columns)
    for k1, k2 in [('sha512', 'xxh32'), ('sha1', 'xxh32'), ('xxh64', 'xxh32')]:
        ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2]
    print()
    print('Seconds per iteration')
    print(df.to_string(float_format='%.9f'))
    print()
    print('Ratios of seconds')
    print(ratios.to_string(float_format='%.2f'))
    print()
    print('Average Ratio (over all N)')
    print('convert = {!r}'.format(convert))
    print(ratios.mean().sort_values())
    if ub.argflag('--show'):
        import kwplot
        kwplot.autompl()
        xdata = sorted(ub.peek(results.values()).keys())
        ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results)
        kwplot.multi_plot(xdata,
                          ydata,
                          xlabel='N',
                          ylabel='seconds',
                          title='convert = {}'.format(convert))
        kwplot.show_if_requested()
Example #26
def 数组_弹出(iterable):
    return ub.peek(iterable)
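
Despite the name (数组_弹出 roughly reads as "array pop"), nothing is removed here: ub.peek is essentially next(iter(iterable)). A quick check:

import ubelt as ub
data = [9, 8, 7]
assert ub.peek(data) == next(iter(data)) == 9
assert data == [9, 8, 7]  # the list is unchanged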
Example #27
def hard_drive_failure_analysis():
    """
    References:
        https://www.backblaze.com/blog/backblaze-hard-drive-stats-q2-2020/

       https://f001.backblazeb2.com/file/Backblaze_Blog/Q2_2020_Drive_Stats_Chart_Data.zip
       https://f001.backblazeb2.com/file/Backblaze_Blog/Q2_2019_Drive_Stats_Chart_Data.zip

    """
    import ubelt as ub
    import random
    import time

    url_template = 'https://f001.backblazeb2.com/file/Backblaze_Blog/{}_{}_Drive_Stats_Chart_Data.zip'
    success_urls = []
    failed_urls = []
    got_fpaths = []
    for year in range(2017, 2021):
        for q in [1, 2, 3, 4]:
            try:
                url = url_template.format('Q' + str(q), year)
                print('url = {!r}'.format(url))
                # Play nice, don't crash their servers
                fpath = ub.grabdata(url)
                print('Got fpath = {!r}'.format(fpath))
                success_urls.append(url)
                got_fpaths.append(fpath)
                if 0:
                    # only need to do this the first time
                    time.sleep(1 + random.random())
            except Exception:
                print('Failed to grab url = {!r}'.format(url))
                failed_urls.append(url)
                pass

    got_fpaths = [
        '/home/joncrall/.cache/ubelt/Q3_2017_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q1_2018_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q2_2018_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q3_2018_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q1_2019_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q2_2019_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q2_2020_Drive_Stats_Chart_Data.zip'
    ]

    from torch_liberator.util.util_zip import zopen, split_archive
    split_archive(fpath)

    import zipfile

    import pandas as pd

    rates = []

    for fpath in got_fpaths:
        myzip = zipfile.ZipFile(fpath, 'r')
        name = ub.peek(
            [name for name in myzip.namelist() if not name.startswith('_')])
        internal_fpath = fpath + '/' + name

        internal_file = zopen(internal_fpath, mode='rb')
        table = pd.read_excel(internal_file)

        found = None

        class BreakException(Exception):
            pass

        try:
            for rx, row in table.iterrows():
                for cx, col in enumerate(row):
                    if isinstance(col, str):
                        col = col.replace('\n', '').replace(' ', '').lower()
                        print('col = {!r}'.format(col))
                        if col in {
                                'afr', 'annualizedfailurerate', 'failurerate'
                        }:
                            found = (rx, cx)
                            raise BreakException

        except BreakException:
            pass

        if found is None:
            raise Exception

        rx, cx = found
        print('table = {!r}'.format(table))

        final_rate = table.iloc[-1].iloc[cx]
        rates.append(final_rate)

        drive_fails = table.iloc[-1].iloc[-2]
        drive_days = table.iloc[-1].iloc[-3]
        drive_count = table.iloc[-1].iloc[-4]
        print('final_rate = {!r}'.format(final_rate))

    # Let's say that, overall, your HDD has a ~1.45% chance of failing each year

    annualized_fail_rate = 0.0145
    """

    rate = expected # events in 1 time period

    P(k events in t timesteps) = exp(- rate * t) * ((rate * time) ** k) / k!


    The probability we wait more than t for an event is

    P(T > t) = exp(-rate * t)

    The probability that the even will happen before time t is:

    P(T <= t) = 1 - exp(-rate * t)
    """

    import scipy.stats
    import scipy.special
    import numpy as np
    # According to [1] There is a ~1.45% chance of a drive failing each year
    # .. [1] https://www.backblaze.com/blog/backblaze-hard-drive-stats-q2-2020/

    # We can model a Poisson distribution to ask some questions
    λ = 1.45 / 100  # probability of failure within a year
    y = 1  # number of years
    k = 1  # number of events (failures)

    def probabilities_for_y_years(y):
        ##
        ##
        # The PMF is the probability that exactly k failures occur in y years
        print('\nIn y={} years we can expect'.format(y))

        rv = scipy.stats.poisson(mu=λ * y)

        k = 1
        p_one_fail = rv.pmf(k)
        print('p_one_fail = {:.4f}%'.format(p_one_fail * 100))
        k = 2
        p_two_fail = rv.pmf(k)
        print('p_two_fail = {:.4f}%'.format(p_two_fail * 100))

        # CDF(k) is the probability that k or fewer failures occur in y years,
        # so the probability that k or more events occur is 1 - CDF(k - 1).
        k = 1
        p_atleast_one_fail = 1 - rv.cdf(k - 1)
        print('p_atleast_one_fail = {:.4f}%'.format(p_atleast_one_fail * 100))

        k = 2
        p_atleast_two_fail = 1 - rv.cdf(k - 1)
        print('p_atleast_two_fail = {:.4f}%'.format(p_atleast_two_fail * 100))

    probabilities_for_y_years(y=1)
    probabilities_for_y_years(y=5)
    probabilities_for_y_years(y=10)
    probabilities_for_y_years(y=15)
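
    # Two convenient closed-form summaries under this model (a sketch; the
    # drive count is an arbitrary example value):
    if 0:
        n_drives = 4
        # expected number of failures among n_drives drives in one year
        print('expected failures = {:.3f}'.format(n_drives * λ * 1))
        # mean waiting time until a single drive's first failure
        print('mean years to first failure = {:.1f}'.format(1 / λ))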

    # Probability that exactly k drives fail within y years, computed from
    # the Poisson PMF directly
    import scipy.special
    k = 1
    p_one_fail = ((λ * y) ** k) * np.exp(-λ * y) / scipy.special.factorial(k)
    print('p_one_fail = {:.4f}%'.format(p_one_fail * 100))

    k = 2
    p_two_fail = ((λ * y) ** k) * np.exp(-λ * y) / scipy.special.factorial(k)
    print('p_two_fail = {:.4f}%'.format(p_two_fail * 100))
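
    # Cross-check (a sketch): the manual PMF above should agree with scipy
    if 0:
        rv = scipy.stats.poisson(mu=λ * y)
        assert np.isclose(p_one_fail, rv.pmf(1))
        assert np.isclose(p_two_fail, rv.pmf(2))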
Beispiel #28
0
def convert_camvid_raw_to_coco(camvid_raw_info):
    """
    Converts the raw camvid format to an MSCOCO based format, ( which lets use
    use kwcoco's COCO backend).

    Example:
        >>> # xdoctest: +REQUIRES(--download)
        >>> camvid_raw_info = grab_raw_camvid()
        >>> # test with a reduced set of data
        >>> del camvid_raw_info['img_paths'][2:]
        >>> del camvid_raw_info['mask_paths'][2:]
        >>> dset = convert_camvid_raw_to_coco(camvid_raw_info)
        >>> # xdoctest: +REQUIRES(--show)
        >>> import kwplot
        >>> plt = kwplot.autoplt()
        >>> kwplot.figure(fnum=1, pnum=(1, 2, 1))
        >>> dset.show_image(gid=1)
        >>> kwplot.figure(fnum=1, pnum=(1, 2, 2))
        >>> dset.show_image(gid=2)
    """
    import re
    import kwimage
    import kwcoco
    print('Converting CamVid to MS-COCO format')

    dset_root, img_paths, label_path, mask_paths = ub.take(
        camvid_raw_info,
        'dset_root, img_paths, label_path, mask_paths'.split(', '))

    img_infos = {
        'img_fname': img_paths,
        'mask_fname': mask_paths,
    }
    keys = list(img_infos.keys())
    next_vals = list(zip(*img_infos.values()))
    # Transpose the dict-of-lists into a list of per-image dicts
    image_items = [{k: v for k, v in zip(keys, vals)} for vals in next_vals]

    dataset = {
        'img_root': dset_root,
        'images': [],
        'categories': [],
        'annotations': [],
    }

    lines = ub.readfrom(label_path).split('\n')
    lines = [line for line in lines if line]
    for line in lines:
        color_text, name = re.split('\t+', line)
        r, g, b = map(int, color_text.split(' '))
        color = (r, g, b)

        # Parse the special camvid format
        cid = (r << 16) + (g << 8) + (b << 0)
        cat = {
            'id': cid,
            'name': name,
            'color': color,
        }
        dataset['categories'].append(cat)
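
    # The RGB -> cid packing above is invertible; a hypothetical inverse
    # (a sketch mirroring the cid_to_rgb helper referenced later):
    if 0:
        def _cid_to_rgb(cid):
            return ((cid >> 16) & 255, (cid >> 8) & 255, cid & 255)
        assert _cid_to_rgb((12 << 16) + (34 << 8) + 56) == (12, 34, 56)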

    for gid, img_item in enumerate(image_items, start=1):
        img = {
            'id': gid,
            'file_name': img_item['img_fname'],
            # nonstandard image field
            'segmentation': img_item['mask_fname'],
        }
        dataset['images'].append(img)

    dset = kwcoco.CocoDataset(dataset)
    dset.rename_categories({'Void': 'background'})

    assert dset.name_to_cat['background']['id'] == 0
    dset.name_to_cat['background'].setdefault('alias', []).append('Void')

    if False:
        _define_camvid_class_hierarcy(dset)

    if 1:
        # TODO: Binarize CCs (and efficiently encode if possible)
        import numpy as np

        bad_info = []
        once = False

        # Add images
        dset.remove_annotations(list(dset.index.anns.keys()))
        for gid, img in ub.ProgIter(dset.imgs.items(),
                                    desc='parse label masks'):
            mask_fpath = join(dset_root, img['segmentation'])

            rgb_mask = kwimage.imread(mask_fpath, space='rgb')
            # Unpack the channels (transposing puts the channel axis first),
            # then pack each RGB color back into a single integer category id
            r, g, b = rgb_mask.T.astype(np.int64)
            cid_mask = np.ascontiguousarray(rgb_to_cid(r, g, b).T)

            cids = set(np.unique(cid_mask)) - {0}

            for cid in cids:
                if cid not in dset.cats:
                    if gid == 618:
                        # Handle a known issue with image 618
                        c_mask = (cid == cid_mask).astype(np.uint8)
                        total_bad = c_mask.sum()
                        if total_bad < 32:
                            if not once:
                                print(
                                    'gid 618 has a few known bad pixels, ignoring them'
                                )
                                once = True
                            continue
                        else:
                            raise Exception('more bad pixels than expected')
                    else:
                        raise Exception(
                            'UNKNOWN cid = {!r} in gid={!r}'.format(cid, gid))

                    # bad_rgb = cid_to_rgb(cid)
                    # print('bad_rgb = {!r}'.format(bad_rgb))
                    # print('WARNING UNKNOWN cid = {!r} in gid={!r}'.format(cid, gid))
                    # bad_info.append({
                    #     'gid': gid,
                    #     'cid': cid,
                    # })
                else:
                    ann = {
                        'category_id': cid,
                        'image_id': gid
                        # 'segmentation': mask.to_coco()
                    }
                    assert cid in dset.cats
                    c_mask = (cid == cid_mask).astype(np.uint8)
                    mask = kwimage.Mask(c_mask, 'c_mask')

                    box = kwimage.Boxes([mask.get_xywh()], 'xywh')
                    # box = mask.to_boxes()

                    ann['bbox'] = ub.peek(box.to_coco())
                    ann['segmentation'] = mask.to_coco()
                    dset.add_annotation(**ann)

        if 0:
            bad_cids = [i['cid'] for i in bad_info]
            print(sorted([c['color'] for c in dataset['categories']]))
            print(sorted(set([cid_to_rgb(i['cid']) for i in bad_info])))

            gid = 618
            img = dset.imgs[gid]
            mask_fpath = join(dset_root, img['segmentation'])
            rgb_mask = kwimage.imread(mask_fpath, space='rgb')
            r, g, b = rgb_mask.T.astype(np.int64)
            cid_mask = np.ascontiguousarray(rgb_to_cid(r, g, b).T)
            cid_hist = ub.dict_hist(cid_mask.ravel())

            bad_cid_hist = {}
            for cid in bad_cids:
                bad_cid_hist[cid] = cid_hist.pop(cid)

            import kwplot
            kwplot.autompl()
            kwplot.imshow(rgb_mask)

    if 0:
        import kwplot
        plt = kwplot.autoplt()
        plt.clf()
        dset.show_image(1)

        import xdev
        gid_list = list(dset.imgs)
        for gid in xdev.InteractiveIter(gid_list):
            dset.show_image(gid)
            xdev.InteractiveIter.draw()

    dset._build_index()
    dset._build_hashid()
    return dset
Beispiel #29
0
def _coerce_datasets(config):
    import netharn as nh
    import ndsampler
    import numpy as np
    import torch
    from torchvision import transforms
    coco_datasets = nh.api.Datasets.coerce(config)
    print('coco_datasets = {}'.format(ub.repr2(coco_datasets, nl=1)))
    for tag, dset in coco_datasets.items():
        dset._build_hashid(hash_pixels=False)

    workdir = ub.ensuredir(ub.expandpath(config['workdir']))
    samplers = {
        tag: ndsampler.CocoSampler(dset, workdir=workdir, backend=config['sampler_backend'])
        for tag, dset in coco_datasets.items()
    }

    for tag, sampler in ub.ProgIter(list(samplers.items()), desc='prepare frames'):
        sampler.frames.prepare(workers=config['workers'])

    # TODO: basic ndsampler torch dataset, likely has to support the transforms
    # API, bleh.

    transform = transforms.Compose([
        transforms.Resize(config['input_dims']),
        transforms.CenterCrop(config['input_dims']),
        # ToTensor scales to [0, 1]; rescale back to the [0, 255] range
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.mul(255))
    ])

    torch_datasets = {
        key: SamplerDataset(
            sampler, transform=transform,
            # input_dims=config['input_dims'],
            # augmenter=config['augmenter'] if key == 'train' else None,
        )
        for key, sampler in samplers.items()
    }
    # self = torch_dset = torch_datasets['train']

    if config['normalize_inputs']:
        # Get stats on the dataset (todo: turn off augmentation for this)
        import kwarray
        _dset = torch_datasets['train']
        stats_idxs = kwarray.shuffle(np.arange(len(_dset)), rng=0)[0:min(1000, len(_dset))]
        stats_subset = torch.utils.data.Subset(_dset, stats_idxs)

        cacher = ub.Cacher('dset_mean', cfgstr=_dset.input_id + 'v3')
        input_stats = cacher.tryload()

        from netharn.data.channel_spec import ChannelSpec
        channels = ChannelSpec.coerce(config['channels'])

        if input_stats is None:
            # Use parallel workers to load data faster
            from netharn.data.data_containers import container_collate
            from functools import partial
            collate_fn = partial(container_collate, num_devices=1)

            loader = torch.utils.data.DataLoader(
                stats_subset,
                collate_fn=collate_fn,
                num_workers=config['workers'],
                shuffle=True,
                batch_size=config['batch_size'])

            # Track moving average of each fused channel stream
            channel_stats = {key: nh.util.RunningStats()
                             for key in channels.keys()}
            assert len(channel_stats) == 1, (
                'only support one fused stream for now')
            for batch in ub.ProgIter(loader, desc='estimate mean/std'):
                if isinstance(batch, (tuple, list)):
                    inputs = {'rgb': batch[0]}  # make assumption
                else:
                    inputs = batch['inputs']

                for key, val in inputs.items():
                    try:
                        for part in val.numpy():
                            channel_stats[key].update(part)
                    except ValueError:  # final batch broadcast error
                        pass

            perchan_input_stats = {}
            for key, running in channel_stats.items():
                perchan_stats = running.simple(axis=(1, 2))
                perchan_input_stats[key] = {
                    'mean': perchan_stats['mean'].round(3),
                    'std': perchan_stats['std'].round(3),
                }

            input_stats = ub.peek(perchan_input_stats.values())
            cacher.save(input_stats)
    else:
        input_stats = {}
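
    # A sketch of how these stats are typically consumed downstream
    # (hypothetical; the actual consumer is the model's input normalizer):
    #     normed = (inputs - input_stats['mean']) / input_stats['std']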

    torch_loaders = {
        tag: dset.make_loader(
            batch_size=config['batch_size'],
            num_batches=config['num_batches'],
            num_workers=config['workers'],
            shuffle=(tag == 'train'),
            balance=(config['balance'] if tag == 'train' else None),
            pin_memory=True)
        for tag, dset in torch_datasets.items()
    }

    dataset_info = {
        'torch_datasets': torch_datasets,
        'torch_loaders': torch_loaders,
        'input_stats': input_stats
    }
    return dataset_info
Beispiel #30
0
def _define_camvid_class_hierarcy(dset):
    # add extra supercategories
    # NOTE: life-conscious and life-inanimate are disjoint in this
    # formulation because we are restricted to a tree structure. If
    # this changes, then we can try re-encoding with multiple parents.
    extra_structure = {
        # Break down the image into things that are part of the system, and
        # things that aren't
        'background': 'root',
        'system': 'root',

        # The system is made up of environmental components and actor
        # components.
        'environment': 'system',
        'actor': 'system',

        # Break actors (things with complex movement) into subtypes
        'life-conscious': 'actor',
        'vehicle-land': 'actor',
        'actor-other': 'actor',

        # Break the environment (things with simple movement) into subtypes
        'life-inanimate': 'environment',
        'civil-structure': 'environment',
        'civil-notice': 'environment',
        'transport-way': 'environment',

        # Subclassify transport mediums
        'drive-way': 'transport-way',
        'walk-way': 'transport-way',
    }
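
    # Sanity sketch (an added check, not part of the original logic): every
    # parent should itself appear as a child key or be 'root', so the
    # mapping encodes a proper tree.
    if 0:
        for _child, _parent in extra_structure.items():
            assert _parent == 'root' or _parent in extra_structure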

    for child, parent in extra_structure.items():
        if child in dset.name_to_cat:
            dset.name_to_cat[child]['supercategory'] = parent
        else:
            dset.add_category(name=child, supercategory=parent)

    dset.name_to_cat['background']['supercategory'] = 'root'

    dset.name_to_cat['Sky']['supercategory'] = 'environment'

    dset.name_to_cat['Animal']['supercategory'] = 'life-conscious'
    dset.name_to_cat['Bicyclist']['supercategory'] = 'life-conscious'
    dset.name_to_cat['Pedestrian']['supercategory'] = 'life-conscious'
    dset.name_to_cat['Child']['supercategory'] = 'life-conscious'

    dset.name_to_cat['OtherMoving']['supercategory'] = 'actor-other'
    dset.name_to_cat['CartLuggagePram']['supercategory'] = 'actor-other'

    dset.name_to_cat['Car']['supercategory'] = 'vehicle-land'
    dset.name_to_cat['Train']['supercategory'] = 'vehicle-land'
    dset.name_to_cat['Truck_Bus']['supercategory'] = 'vehicle-land'
    dset.name_to_cat['SUVPickupTruck']['supercategory'] = 'vehicle-land'
    dset.name_to_cat['MotorcycleScooter']['supercategory'] = 'vehicle-land'

    dset.name_to_cat['VegetationMisc']['supercategory'] = 'life-inanimate'
    dset.name_to_cat['Tree']['supercategory'] = 'life-inanimate'

    dset.name_to_cat['Column_Pole']['supercategory'] = 'civil-structure'
    dset.name_to_cat['Fence']['supercategory'] = 'civil-structure'
    dset.name_to_cat['Wall']['supercategory'] = 'civil-structure'
    dset.name_to_cat['Building']['supercategory'] = 'civil-structure'
    dset.name_to_cat['Archway']['supercategory'] = 'civil-structure'
    dset.name_to_cat['Bridge']['supercategory'] = 'civil-structure'
    dset.name_to_cat['Tunnel']['supercategory'] = 'civil-structure'

    dset.name_to_cat['TrafficCone']['supercategory'] = 'civil-notice'
    dset.name_to_cat['TrafficLight']['supercategory'] = 'civil-notice'
    dset.name_to_cat['LaneMkgsDriv']['supercategory'] = 'civil-notice'
    dset.name_to_cat['LaneMkgsNonDriv']['supercategory'] = 'civil-notice'
    dset.name_to_cat['SignSymbol']['supercategory'] = 'civil-notice'
    dset.name_to_cat['ParkingBlock']['supercategory'] = 'civil-notice'
    dset.name_to_cat['Misc_Text']['supercategory'] = 'civil-notice'

    dset.name_to_cat['Road']['supercategory'] = 'drive-way'
    dset.name_to_cat['RoadShoulder']['supercategory'] = 'drive-way'
    dset.name_to_cat['Sidewalk']['supercategory'] = 'walk-way'

    for cat in list(dset.cats.values()):
        parent = cat.get('supercategory', None)
        if parent is not None:
            if parent not in dset.name_to_cat:
                print('Missing parent = {!r}'.format(parent))
                dset.add_category(name=parent, supercategory=parent)

    if 0:
        graph = dset.category_graph()
        import graphid
        graphid.util.show_nx(graph)

    # Add in some hierarchy information
    if 0:
        for x in dset.name_to_cat:
            print(
                "dset.name_to_cat[{!r}]['supercategory'] = 'object'".format(x))

    if 0:
        example_cat_aids = []
        for cat in dset.cats.values():
            cname = cat['name']
            aids = dset.index.cid_to_aids[dset.name_to_cat[cname]['id']]
            if len(aids):
                aid = ub.peek(aids)
                example_cat_aids.append(aid)
            else:
                print('No examples of cat = {!r}'.format(cat))

        import xdev
        import kwplot
        kwplot.autompl()
        for aid in xdev.InteractiveIter(example_cat_aids):
            print('aid = {!r}'.format(aid))
            ann = dset.anns[aid]
            cat = dset.cats[ann['category_id']]
            print('cat = {!r}'.format(cat))
            dset.show_image(aid=aid)
            xdev.InteractiveIter.draw()

        if 0:
            cname = 'CartLuggagePram'
            cname = 'ParkingBlock'
            cname = 'LaneMkgsDriv'
            aids = dset.index.cid_to_aids[dset.name_to_cat[cname]['id']]
            if len(aids):
                aid = ub.peek(aids)
                print('aid = {!r}'.format(aid))
                ann = dset.anns[aid]
                cat = dset.cats[ann['category_id']]
                print('cat = {!r}'.format(cat))
                dset.show_image(aid=aid)