def test_multispectral_sql():
    """
    Check that the SQL view of the multispectral demo dataset agrees with the
    in-memory dataset for image-name lookup and single-band image loading.

    The test is skipped when sqlalchemy cannot be imported.
    """
    try:
        import sqlalchemy  # NOQA
    except Exception:
        import pytest
        pytest.skip()

    import numpy as np
    import kwcoco
    import ubelt as ub

    mem_dset = kwcoco.CocoDataset.demo('vidshapes1-multispectral')
    sql_dset = mem_dset.view_sql(force_rewrite=True)
    sql_dset.basic_stats()

    # Looking up an image by name must work through the SQL-backed index
    first_name = ub.peek(mem_dset.index.name_to_img)
    sql_img = sql_dset.index.name_to_img[first_name]
    assert sql_img['name'] == first_name

    # Both backends must load identical pixel data for the same band
    band_mem = mem_dset.load_image(1, channels='B1')
    band_sql = sql_dset.load_image(1, channels='B1')
    assert np.all(band_mem == band_sql)
def motherboard_info():
    """
    Print the slot report produced by ``sudo dmidecode -t 9``.

    The command output is split into blank-line separated chunks, each chunk
    is parsed into a ``key: value`` dictionary whose keys are passed through
    ``slugify_key``, and finally the unmodified command output is printed.

    REQUIRES SUDO

    xdoctest -m ~/misc/notes/buildapc.py motherboard_info

    Raises:
        KeyError: if a single chunk contains the same key twice.
    """
    import re
    info = ub.cmd('sudo dmidecode -t 9')

    def _parse_chunk(chunk):
        # NOTE: doesn't get all data correctly (e.g. characteristics)
        record = {}
        for line in chunk.split('\n'):
            parts = re.split('\t*:', line, maxsplit=1)
            if len(parts) != 2:
                continue
            key, val = (part.strip() for part in parts)
            if key in record:
                raise KeyError(f'key={key} already exists')
            record[key] = val
        return record

    pcie_slots = []
    for chunk in info['out'].split('\n\n'):
        record = _parse_chunk(chunk)
        if record:
            pcie_slots.append(ub.map_keys(slugify_key, record))

    # The summaries below are computed but not reported.
    pcie_usage = ub.dict_hist(slot['current_usage'] for slot in pcie_slots)  # NOQA
    _varied = ub.map_keys(
        slugify_key, varied_values(pcie_slots, min_variations=0))
    unvaried = {k: ub.peek(v) for k, v in _varied.items() if len(v) == 1}  # NOQA
    varied = {k: v for k, v in _varied.items() if len(v) > 1}  # NOQA
    print(info['out'])
def difference(self, other):
    """
    Set difference

    Removes the channels of the single stream in ``other`` from every stream
    of ``self`` and returns the result as a new spec of the same class.

    Args:
        other: a spec with exactly one stream

    Returns:
        a new instance of ``self.__class__``

    Example:
        >>> self = ChannelSpec('rgb|disparity,flowx|flowy')
        >>> other = ChannelSpec('rgb')
        >>> self.difference(other)
        >>> other = ChannelSpec('flowx')
        >>> self.difference(other)
    """
    num_other_streams = len(list(other.keys()))
    assert num_other_streams == 1, 'can take diff with one stream'
    removed = ub.oset(ub.peek(other.normalize().values()))

    def _collapse_aliases(codes):
        # shrink the representation of a complex r|g|b to an alias if
        # possible.
        # TODO: make this more efficient
        for alias, alias_spec in self._known.items():
            alias_codes = ub.oset(alias_spec.split('|'))
            found = subsequence_index(codes, alias_codes)
            if found is not None:
                oset_delitem(codes, found)
                oset_insert(codes, found.start, alias)
        return codes

    stream_specs = []
    for _stream_key, stream_codes in self.normalize().items():
        remaining = ub.oset(stream_codes) - ub.oset(removed)
        stream_specs.append('|'.join(_collapse_aliases(remaining)))
    return self.__class__(','.join(stream_specs))
def forward(self, inputs):
    """
    Normalize the input and run it through the wrapped model.

    Args:
        inputs (Tensor | dict): Either the input images (as a regular
            pytorch BxCxHxW Tensor) or a dictionary mapping input
            modalities to the input images. A dictionary must contain
            exactly one entry.

    Returns:
        Dict[str, Tensor]: model output wrapped in a dictionary so it is
            clear what the return type is. In this case "class_energy" is
            class probabilities **before** softmax / normalization is
            applied.
    """
    if not isinstance(inputs, dict):
        images = inputs
    else:
        # TODO: handle channel modalities later
        assert len(inputs) == 1, (
            'only support one fused stream: e.g. rgb for now ')
        images = ub.peek(inputs.values())

    normed = self.input_norm(images)
    return {
        'class_energy': self.model(normed),
    }
def parse_mscoco():
    """
    Test that our implementation can handle the real mscoco data.

    Loads the val2014 instance annotations from
    ``~/data/standard_datasets/mscoco/``, then for every image: downloads it
    (via ``ub.grabdata``), shows it, and reads the pixels with PIL.

    Note:
        To test the 2017 "stuff" annotations instead, point ``fpath`` at
        ``annotations/stuff_val2017.json`` and ``img_root`` at
        ``images/val2017``.
    """
    import ujson
    import ndsampler
    from PIL import Image

    root = ub.expandpath('~/data/standard_datasets/mscoco/')
    fpath = join(root, 'annotations/instances_val2014.json')
    img_root = normpath(ub.ensuredir((root, 'images', 'val2014')))

    # Use a context manager so the annotation file is closed after parsing
    with open(fpath, 'rb') as file:
        dataset = ujson.load(file)

    dset = ndsampler.CocoDataset(dataset)
    dset.img_root = img_root

    # Iterate over every image id. (Peeking at a shared iterator before the
    # loop would consume, and therefore skip, the first image.)
    for gid in ub.ProgIter(list(dset.imgs.keys())):
        img = dset.imgs[gid]
        ub.grabdata(img['coco_url'], dpath=img_root, verbose=0)
        anns = [dset.anns[aid] for aid in dset.gid_to_aids[gid]]
        dset.show_image(gid=gid)

        if anns:
            # Images may have no annotations; only inspect the first
            # annotation when there is one.
            segmentation = anns[0]['segmentation']  # NOQA

        gpath = join(dset.img_root, img['file_name'])
        with Image.open(gpath) as pil_img:
            np_img = np.array(pil_img)  # NOQA
def _resolve(_types):
    """
    Collapse a collection of type names into one type string.

    A single candidate is returned as-is. Otherwise the result is the sorted
    candidates joined with ``' | '`` when ``unions`` (from the enclosing
    scope) is truthy, and ``'Any'`` when it is not.
    """
    if len(_types) != 1:
        return ' | '.join(sorted(_types)) if unions else 'Any'
    return ub.peek(_types)
def rank_inventory(inventory):
    """
    Print league rankings for every family member of each item in
    ``inventory``, then print the sorted ranks achieved by each IV triple
    over all forms and leagues.

    Args:
        inventory: iterable of objects providing ``family``, ``name``,
            ``cp`` and ``leage_rankings_for``.
    """
    family_members = list(ub.flatten(
        list(pkmn.family(ancestors=False, node=True)) for pkmn in inventory))
    name_to_members = ub.group_items(family_members, key=lambda p: p.name)

    # Maximum CP allowed in each league
    league_caps = {
        'master': float('inf'),
        'ultra': 2500,
        'great': 1500,
        'little': 500,
    }
    max_level = 45  # for XL candy (40 would be the normal cap)

    ranking_tables = []
    for name, members in name_to_members.items():
        print('\n\n------------\n\n')
        print('name = {!r}'.format(name))
        for league_name, max_cp in league_caps.items():
            print('')
            print(' ========== ')
            print(' --- {} in {} --- '.format(name, league_name))
            not_eligible = [
                p for p in members if p.cp is not None and p.cp > max_cp]
            eligible = [
                p for p in members if p.cp is None or p.cp <= max_cp]
            print('not_eligible = {!r}'.format(not_eligible))
            if not eligible:
                print('none eligable')
                continue
            table = eligible[0].leage_rankings_for(
                eligible, max_cp=max_cp, max_level=max_level)
            ranking_tables.append(table)

    # Print out the best ranks for each set of IVS over all possible forms
    # (lets you know which ones can be transfered safely)
    iv_to_rank = ub.ddict(list)
    for table in ranking_tables:
        if table is None:
            continue
        indexed = table.set_index(['iva', 'ivd', 'ivs'])
        for iv, rank in zip(indexed.index, indexed['rank']):
            iv_to_rank[iv].append(rank)

    iv_to_best_rank = ub.sorted_vals(ub.map_vals(sorted, iv_to_rank))
    print('iv_to_best_rank = {}'.format(
        ub.repr2(iv_to_best_rank, nl=1, align=':')))
def parse_cpu_info(percore=False):
    """
    Get a nice summary of CPU information parsed from ``/proc/cpuinfo``.

    Args:
        percore (bool, default=False): if True, also include the parsed
            dictionary of every core under the ``'cores'`` key.

    Returns:
        dict: with keys ``'varied'`` (fields that take more than one value
            over the cores) and ``'unvaried'`` (fields with a single value),
            plus ``'cores'`` when ``percore`` is True.

    Raises:
        KeyError: if one core entry contains the same key twice.

    Requirements:
        pip install python-slugify

    Ignore:
        cpu_info = parse_cpu_info()
        print(cpu_info['varied']['cpu_mhz'])
        print('cpu_info = {}'.format(ub.repr2(cpu_info, nl=3)))

    Notes:
        * lscpu
        * The ``py-cpuinfo`` package (``cpuinfo.get_cpu_info()``) offers
          similar information, but it is not needed here.
    """
    import re
    info = ub.cmd('cat /proc/cpuinfo')

    # Fields look like "key<tabs>: value"; split on the first colon only
    splitter = re.compile(r'\t*:')

    cores = []
    for core_text in info['out'].split('\n\n'):
        core = {}
        for line in core_text.split('\n'):
            parts = splitter.split(line, maxsplit=1)
            if len(parts) != 2:
                continue
            key, val = (part.strip() for part in parts)
            if key in core:
                raise KeyError(f'key={key} already exists')
            core[key] = val
        if core:
            cores.append(ub.map_keys(slugify_key, core))

    _varied = varied_values(cores, min_variations=0)
    unvaried = {k: ub.peek(v) for k, v in _varied.items() if len(v) == 1}
    varied = {k: v for k, v in _varied.items() if len(v) > 1}

    cpu_info = {
        'varied': varied,
        'unvaried': unvaried,
    }
    if percore:
        cpu_info['cores'] = cores
    return cpu_info
def _broadcast_colors(color, num, img, colorspace):
    """
    Determine if color applies a single color to all ``num`` items, or if it
    is a list of colors for each item. Return as a list of colors for each
    item.

    Args:
        color: a single color, or an iterable with one color per item
        num (int): number of items that need a color
        img: image passed to ``kwimage.Color._forimage``
        colorspace: colorspace passed to ``kwimage.Color._forimage``

    Returns:
        list: ``num`` colors (when a single color is broadcast), otherwise
            one converted color per entry of ``color``.

    TODO:
        - [ ] add as classmethod of kwimage.Color

    Example:
        >>> img = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)
        >>> colorspace = 'rgb'
        >>> color = color_str_list = ['red', 'green', 'blue']
        >>> color_str = 'red'
        >>> num = 3
        >>> print(_broadcast_colors(color_str_list, num, img, colorspace))
        >>> print(_broadcast_colors(color_str, num, img, colorspace))
        >>> colors_tuple_list = _broadcast_colors(color_str_list, num, img, colorspace)
        >>> print(_broadcast_colors(colors_tuple_list, num, img, colorspace))
        >>> #
        >>> # FIXME: This case seems broken
        >>> colors_ndarray_list = np.array(_broadcast_colors(color_str_list, num, img, colorspace))
        >>> print(_broadcast_colors(colors_ndarray_list, num, img, colorspace))
    """
    # Note there is an ambiguity when num=3 and color=[int, int, int]
    # that must be resolved by checking num channels in the image
    import kwimage
    import ubelt as ub
    import numbers

    # Only treat the input as "one color per item" when it is an iterable of
    # exactly ``num`` entries that does not look like a single numeric color.
    per_item = False
    if ub.iterable(color):
        first = ub.peek(color)
        if len(color) == num:
            # ambiguous case, interpret as a single broadcastable color
            looks_like_one_color = (
                len(color) <= 4 and isinstance(first, numbers.Number))
            per_item = not looks_like_one_color

    if per_item:
        return [kwimage.Color(c)._forimage(img, colorspace) for c in color]

    single = kwimage.Color(color)._forimage(img, colorspace)
    return [single] * num
def __init__(repo, **kwargs):
    """
    Configure a repository description from keyword arguments.

    Keyword Args:
        name (str): repo name; defaults to the last component of the url
            with the ``.git`` suffix removed.
        dpath (str): checkout directory; defaults to ``code_dpath/name``.
        code_dpath (str): directory containing checkouts.
        remotes (dict): mapping from remote name to url.
        remote (str): name of the remote to use. If ``remotes`` is not
            given this is interpreted as the url of a remote named
            ``'origin'``.
        branch (str): defaults to ``'master'``.
        verbose (int): defaults to 3.

    Raises:
        ValueError: if no remote can be determined, if the remotes are
            ambiguous, if neither ``dpath`` nor ``code_dpath`` is given, or
            if unknown keyword arguments are passed.
    """
    repo.name = kwargs.pop('name', None)
    repo.dpath = kwargs.pop('dpath', None)
    repo.code_dpath = kwargs.pop('code_dpath', None)
    repo.remotes = kwargs.pop('remotes', None)
    repo.remote = kwargs.pop('remote', None)
    repo.branch = kwargs.pop('branch', 'master')

    repo._logged_lines = []
    repo._logged_cmds = []

    if repo.remote is None:
        # An empty mapping is treated like a missing one instead of leaking
        # a StopIteration from peeking at nothing.
        if not repo.remotes:
            raise ValueError('must specify some remote')
        if len(repo.remotes) > 1:
            raise ValueError('remotes are ambiguous, specify one')
        repo.remote = ub.peek(repo.remotes)
    elif repo.remotes is None:
        # Only a url was given; register it under the default remote name
        _default_remote = 'origin'
        repo.remotes = {_default_remote: repo.remote}
        repo.remote = _default_remote

    repo.url = repo.remotes[repo.remote]

    if repo.name is None:
        suffix = repo.url.split('/')[-1]
        repo.name = suffix.split('.git')[0]

    # Expand user / environment variables BEFORE deriving dependent paths,
    # so dpath and pkg_dpath always agree with each other.
    if repo.code_dpath is not None:
        repo.code_dpath = ub.expandpath(repo.code_dpath)
    if repo.dpath is None:
        if repo.code_dpath is None:
            raise ValueError('must specify either dpath or code_dpath')
        repo.dpath = join(repo.code_dpath, repo.name)
    repo.dpath = ub.expandpath(repo.dpath)
    repo.pkg_dpath = join(repo.dpath, repo.name)

    repo.verbose = kwargs.pop('verbose', 3)
    if kwargs:
        raise ValueError('unknown kwargs = {}'.format(kwargs.keys()))

    repo._pygit = None
def from_coco(cls, data, dims=None):
    """
    Accepts either new-style or old-style coco polygons

    Args:
        data (list | dict): either a flat list of numbers, which is
            reshaped into rows of two values and used as the exterior, or a
            dictionary of constructor keyword arguments containing an
            ``'exterior'`` key.
        dims: unused

    Returns:
        an instance of ``cls``

    Raises:
        TypeError: if ``data`` is neither a list nor a dict.
    """
    if isinstance(data, list):
        if len(data) == 0:
            return cls(exterior=[])
        assert isinstance(ub.peek(data), numbers.Number)
        exterior = np.array(data).reshape(-1, 2)
        return cls(exterior=exterior)

    if isinstance(data, dict):
        assert 'exterior' in data
        return cls(**data)

    raise TypeError(type(data))
def __init__(self, arch='resnet50', classes=1000, channels='rgb',
             input_stats=None):
    """
    Build a classifier with a custom number of input channels / classes.

    Args:
        arch (str, default='resnet50'): backbone name; only ``'resnet50'``
            is handled.
        classes: anything accepted by ``ndsampler.CategoryTree.coerce``.
        channels: anything accepted by ``ChannelSpec.coerce``; must
            normalize to exactly one stream.
        input_stats (dict | None): keyword arguments for
            ``nh.layers.InputNorm``.

    Raises:
        KeyError: if ``arch`` is unknown.
    """
    super(ClfModel, self).__init__()

    import ndsampler
    if input_stats is None:
        input_stats = {}
    input_norm = nh.layers.InputNorm(**input_stats)

    self.classes = ndsampler.CategoryTree.coerce(classes)
    self.channels = ChannelSpec.coerce(channels)

    chann_norm = self.channels.normalize()
    assert len(chann_norm) == 1
    in_channels = len(ub.peek(chann_norm.values()))
    num_classes = len(self.classes)

    if arch == 'resnet50':
        from torchvision import models
        model = models.resnet50()
        # NOTE(review): the stock resnet stem uses stride=2; stride=3 is
        # kept as written here - confirm it is intentional.
        new_conv1 = torch.nn.Conv2d(in_channels, 64, kernel_size=7,
                                    stride=3, padding=3, bias=False)
        new_fc = torch.nn.Linear(2048, num_classes, bias=True)

        # Copy as many reference weights as fit. Clamping to the size of
        # the reference layer avoids a shape mismatch when there are more
        # input channels or more classes than the reference model has.
        num_in = min(in_channels, model.conv1.weight.shape[1])
        new_conv1.weight.data[:, 0:num_in, :, :] = (
            model.conv1.weight.data[:, 0:num_in, :, :])

        num_out = min(num_classes, model.fc.weight.shape[0])
        new_fc.weight.data[0:num_out, :] = model.fc.weight.data[0:num_out, :]
        new_fc.bias.data[0:num_out] = model.fc.bias.data[0:num_out]

        model.fc = new_fc
        model.conv1 = new_conv1
    else:
        raise KeyError(arch)

    self.input_norm = input_norm
    self.model = model

    self.coder = ClfCoder(self.classes)
def module_version_infos():
    """
    Collect version information about key modules.

    For each module the version is looked up in two ways: the
    ``__version__`` attribute of the already-imported module and the
    installed package metadata. If exactly one distinct version is found it
    is stored under ``'version'``, otherwise all candidates are stored under
    ``'possible_versions'``.

    Returns:
        List[dict]: one info dictionary per module. The last entry also
            carries the collected environment info under ``'__env__'``.

    References:
        https://packaging.python.org/guides/single-sourcing-package-version/
    """
    try:
        from importlib import metadata
    except ImportError:
        # Running on pre-3.8 Python; use importlib-metadata package
        import importlib_metadata as metadata
    import sys

    modnames = ['torch', 'cv2', 'netharn', 'PIL', 'numpy']
    infos = []
    for modname in modnames:
        info = {'name': modname}

        try:
            # Only inspects modules that were already imported
            module = sys.modules[modname]
            version_0 = getattr(module, '__version__', None)
        except Exception:
            version_0 = None

        try:
            version_1 = metadata.version(modname)
        except Exception:
            version_1 = None

        possible_versions = {version_1, version_0} - {None}
        if len(possible_versions) == 1:
            info['version'] = ub.peek(possible_versions)
        else:
            info['possible_versions'] = possible_versions

        if modname == 'torch':
            info['torch.version.cuda'] = torch.version.cuda
            info['torch.cuda.is_available()'] = torch.cuda.is_available()
        infos.append(info)

    # The conda info step is too slow (3 seconds)
    from netharn.util.collect_env import get_env_info
    env_info = get_env_info()._asdict()
    # The environment info is attached to the last module entry
    infos[-1]['__env__'] = env_info

    # Hand the collected information back to the caller
    return infos
def _kwiver_to_kwimage_detections(detected_objects):
    """
    Convert vital detected object sets to kwimage.Detections

    The class list is taken from the first detected object, and each
    detection is assigned the index of its most likely class in that list.

    Args:
        detected_objects (kwiver.vital.types.DetectedObjectSet)

    Returns:
        kwimage.Detections
    """
    import ubelt as ub
    import kwimage

    classes = []
    if len(detected_objects) > 0:
        classes = ub.peek(detected_objects).type().all_class_names()

    tlbr_rows = []
    confidences = []
    class_idxs = []
    for det in detected_objects:
        bbox = det.bounding_box()
        row = [bbox.min_x(), bbox.min_y(), bbox.max_x(), bbox.max_y()]
        confidence = det.confidence()
        best_class = det.type().get_most_likely_class()
        tlbr_rows.append(row)
        confidences.append(confidence)
        class_idxs.append(classes.index(best_class))

    return kwimage.Detections(
        boxes=kwimage.Boxes(np.array(tlbr_rows), 'tlbr'),
        scores=np.array(confidences),
        class_idxs=np.array(class_idxs),
        classes=classes,
    )
def _main_device_id_from_data(item): """ Get device ids of a model Example: >>> device_ids = _main_device_id_from_data(torch.randn(3)) >>> print('device_ids = {!r}'.format(device_ids)) >>> if torch.cuda.is_available(): >>> device_ids = _main_device_id_from_data(torch.randn(3).to('cuda')) >>> print('device_ids = {!r}'.format(device_ids)) >>> for i in range(torch.cuda.device_count()): >>> device_ids = _main_device_id_from_data(torch.randn(3).to(i)) >>> print('device_ids = {!r}'.format(device_ids)) """ if hasattr(item, 'device'): return item.device.index if hasattr(item, 'is_cuda'): if item.is_cuda: return item.get_device().index else: return None elif hasattr(item, 'state_dict'): devices = [item.device for item in item.state_dict().values()] _device_ids = set() for device in devices: if device.type == 'cuda': index = device.index or 0 _device_ids.add(index) else: _device_ids.add(None) try: _device_ids = sorted(_device_ids) except TypeError: raise Exception('cannot currently mix CPU and GPU') _device_id = ub.peek(_device_ids) return _device_id else: raise TypeError(type(item))
def show_sample(self):
    """
    Draw one shuffled batch of 8 items as an image grid.

    The first two entries of the batch are concatenated along the batch
    dimension before the grid is built.

    CommandLine:
        python ~/code/netharn/netharn/examples/ggr_matching.py RandomBalancedIBEISSample.show_sample --show

    Example:
        >>> import sys
        >>> sys.path.append('/home/joncrall/code/netharn/examples')
        >>> from ggr_matching import *
        >>> self = RandomBalancedIBEISSample.from_dbname('PZ_MTEST')
        >>> nh.util.autompl()
        >>> self.show_sample()
        >>> nh.util.show_if_requested()
    """
    loader = torch.utils.data.DataLoader(self, shuffle=True, batch_size=8)
    batch = ub.peek(loader)
    stacked = torch.cat((batch[0], batch[1]), 0)
    grid = torchvision.utils.make_grid(stacked)
    # Convert channels-first to channels-last for display
    nh.util.imshow(grid.numpy().transpose(1, 2, 0))
def benchmark_hash_file():
    """
    Benchmark ``ub.hash_file`` with several hashers over growing file sizes.

    For each scale a random file is written into ``$HOME/tmp`` and hashed
    with every hasher; the mean time per hasher is printed together with its
    ratio relative to the fastest hasher for that size. Summary tables are
    printed at the end, and a plot is shown when ``--show`` is given.

    CommandLine:
        python ~/code/ubelt/dev/bench_hash.py --show
        python ~/code/ubelt/dev/bench_hash.py --show
    """
    import ubelt as ub
    import random

    # dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp'))
    dpath = ub.ensuredir(ub.expandpath('$HOME/tmp'))

    # Fixed seed so repeated runs generate the same data
    rng = random.Random(0)
    # Create a pool of random chunks of data
    chunksize = int(2 ** 20)
    pool_size = 8
    part_pool = [_random_data(rng, chunksize) for _ in range(pool_size)]

    #ITEM = 'JUST A STRING' * 100
    HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3']

    scales = list(range(5, 10))
    import os

    # Maps hasher -> {N: mean seconds}
    results = ub.AutoDict()
    # Use json is faster or at least as fast it most cases
    # xxhash is also significantly faster than sha512
    ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms')
    for s in ub.ProgIter(scales, desc='benchmark', verbose=3):
        N = 2 ** s
        print(' --- s={s}, N={N} --- '.format(s=s, N=N))
        # Write a big file
        size_pool = [N]
        fpath = _write_random_file(dpath, part_pool, size_pool, rng)

        megabytes = os.stat(fpath).st_size / (2 ** 20)
        print('megabytes = {!r}'.format(megabytes))

        for hasher in HASHERS:
            # NOTE(review): the timer context is not entered explicitly;
            # presumably Timerit then times the whole loop body - confirm.
            for timer in ti.reset(hasher):
                ub.hash_file(fpath, hasher=hasher)
            results[hasher].update({N: ti.mean()})
        # Rank the hashers for this file size (fastest first)
        col = {h: results[h][N] for h in HASHERS}
        sortx = ub.argsort(col)
        ranking = ub.dict_subset(col, sortx)
        print('walltime: ' + ub.repr2(ranking, precision=9, nl=0))
        best = next(iter(ranking))
        #pairs = list(ub.iter_window( 2))
        # Compare every hasher against the fastest one
        pairs = [(k, best) for k in ranking]
        ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs]
        nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs]
        relratios = ub.odict(zip(nicekeys, ratios))
        print('speedup: ' + ub.repr2(relratios, precision=4, nl=0))
    # xdoc +REQUIRES(--show)
    # import pytest
    # pytest.skip()
    import pandas as pd
    df = pd.DataFrame.from_dict(results)
    df.columns.name = 'hasher'
    df.index.name = 'N'
    # Start from an empty frame that shares the index of ``df``
    ratios = df.copy().drop(columns=df.columns)
    for k1, k2 in [('sha512', 'xxh64'), ('sha1', 'xxh64'), ('xxh32', 'xxh64'), ('blake3', 'xxh64')]:
        ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2]
    print()
    print('Seconds per iteration')
    print(df.to_string(float_format='%.9f'))
    print()
    print('Ratios of seconds')
    print(ratios.to_string(float_format='%.2f'))
    print()
    print('Average Ratio (over all N)')
    print(ratios.mean().sort_values())
    if ub.argflag('--show'):
        import kwplot
        kwplot.autompl()
        # All hashers were measured at the same sizes, so any entry provides
        # the x-axis values
        xdata = sorted(ub.peek(results.values()).keys())
        ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results)
        kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds')
        kwplot.show_if_requested()
def get_pokemon_info(api, name, form=None):
    """
    Look up the merged stats / evolutions / type / moves entry of a pokemon.

    The name and form are normalized, the entries matching the requested
    form are selected from each lookup table, and they are merged into one
    dictionary. As a side effect the learnable fast / charge moves are
    stored in ``api.learnable`` and ``api.LEVEL_CAP`` is set to 51.

    Args:
        api: object providing ``normalize_name_and_form``, the
            ``name_to_*`` tables and ``learnable``.
        name (str): name of the pokemon
        form (str | None): optional form

    Returns:
        dict: the merged info entries

    Raises:
        Exception: if the name / form cannot be normalized, if the name is
            missing from a lookup table, or if a table does not contain
            exactly one entry for the selected form.

    Example:
        >>> from pypogo.pogo_api import *  # NOQA
        >>> api = PogoAPI()
        >>> name = 'stunfisk_galarian'
        >>> print(ub.repr2(api.get_pokemon_info(name)))
        >>> name = 'stunfisk'
        >>> print(ub.repr2(api.get_pokemon_info(name)))
        >>> name = 'umbreon'
        >>> print(ub.repr2(api.get_pokemon_info(name)))
        >>> name = 'eevee'
        >>> print(ub.repr2(api.get_pokemon_info(name)))
        >>> name = 'castform_snowy'
        >>> print(ub.repr2(api.get_pokemon_info(name)))
        >>> name = 'smeargle'
        >>> print(ub.repr2(api.get_pokemon_info(name)))
        >>> name = 'wormadam'
        >>> print(ub.repr2(api.get_pokemon_info(name)))
    """
    try:
        name_, form_ = api.normalize_name_and_form(name, form)
        form_ = form_.lower()
    except Exception:
        # The message is filled from local variables by name; renaming the
        # locals would break the format string.
        raise Exception('name={name}, form={form}'.format(**locals()))

    try:
        infos = {
            'stats': api.name_to_stats[name_],
            'evolutions': api.name_to_evolutions[name_],
            'type': api.name_to_type[name_],
            'moves': api.name_to_moves[name_],
        }
    except Exception:
        if True:
            # Help the user by suggesting similarly spelled names
            all_names = list(api.name_to_stats.keys())
            suggest_spelling_correction(name, all_names, top=10)
        raise Exception(
            'name={name}, form={form}, name_={name_}, form_={form_}'.
            format(**locals()))

    info = {}
    for info_type, all_infos in infos.items():
        part = None
        # Group the entries by their lower-cased form
        form_to_info = ub.group_items(all_infos, lambda _info: _info['form'].lower())
        if form_ in form_to_info:
            parts = form_to_info[form_]
        else:
            if info_type != 'evolutions':
                print('info_type = {!r}'.format(info_type))
                print('form_to_info = {}'.format(
                    ub.repr2(form_to_info, nl=1)))
                import warnings
                msg = 'Unable to find name={} form_={} form={}, info_type={}'.format(
                    name, form_, form, info_type)
                print(msg)
                warnings.warn(msg)
                # Fall back to the entries of the first available form
                parts = ub.peek(form_to_info.values())
            else:
                # A missing evolution entry is not an error
                parts = None

        if parts is None:
            part = []
        else:
            if len(parts) != 1:
                print('parts = {!r}'.format(parts))
                raise Exception
            part = parts[0]

        info.update(part)

    if 1:
        # TODO: remove
        # Collect the normalized moves this pokemon can learn
        fast_moves = set()
        charge_moves = set()
        for move in info['fast_moves']:
            fast_moves.add(normalize(move))
        for move in info['elite_fast_moves']:
            fast_moves.add(normalize(move))
        for move in info['charged_moves']:
            charge_moves.add(normalize(move))
        for move in info['elite_charged_moves']:
            charge_moves.add(normalize(move))
        if form_ == 'normal':
            if info['form'] == 'Shadow':
                charge_moves.add('FRUSTRATION')
                charge_moves.add('RETURN')
        if name_ not in api.learnable:
            api.learnable[name_] = {}
        api.learnable[name_]['fast'] = sorted(fast_moves)
        api.learnable[name_]['charge'] = sorted(charge_moves)
        api.LEVEL_CAP = 51
    return info
def _lcs_iter_simple(full_seq1, full_seq2, open_to_close, node_affinity, open_to_tok):
    """
    Converts _lcs_recursive to an iterative algorithm using a fairly
    straightforward method that effectivly simulates callstacks

    Args:
        full_seq1: the first sequence
        full_seq2: the second sequence
        open_to_close: passed to ``generate_all_decompositions``
        node_affinity (callable): called with the tokens of the current
            edge of each sequence; a truthy result is added to the score
            when both edges are kept.
        open_to_tok: passed to ``generate_all_decompositions``

    Returns:
        tuple: ``(best, val)`` where ``best`` is the pair of selected
            subsequences and ``val`` is its score.
    """
    all_decomp1 = generate_all_decompositions(full_seq1, open_to_close, open_to_tok)
    all_decomp2 = generate_all_decompositions(full_seq2, open_to_close, open_to_tok)

    args0 = (full_seq1, full_seq2)
    frame0 = args0
    stack = [frame0]

    # Memoized solutions: maps (seq1, seq2) -> (value, (subseq1, subseq2))
    _results = {}
    # Populate base cases
    # Empty sequences of the same type as the decomposition keys
    empty1 = type(ub.peek(all_decomp1.keys()))()
    empty2 = type(ub.peek(all_decomp2.keys()))()
    best = (empty1, empty2)
    base_result = (0, best)
    # Anything compared against an empty sequence scores zero
    for seq1 in all_decomp1.keys():
        key1 = seq1
        t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[key1]
        _results[(seq1, empty2)] = base_result
        _results[(head1, empty2)] = base_result
        _results[(tail1, empty2)] = base_result
        _results[(head_tail1, empty2)] = base_result

    for seq2 in all_decomp2.keys():
        key2 = seq2
        t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[key2]
        _results[(empty1, seq2)] = base_result
        _results[(empty1, head2)] = base_result
        _results[(empty1, tail2)] = base_result
        _results[(empty1, head_tail2)] = base_result

    # Drop setup-only names so they cannot be used by accident below
    del args0
    del frame0
    del empty1
    del empty2
    del best
    del base_result

    missing_frames = []
    while stack:
        key = stack.pop()
        if key not in _results:
            seq1, seq2 = key
            missing_frames.clear()

            # try:
            t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[seq1]
            # except KeyError:
            #     a1, b1, head1, tail1 = balanced_decomp_unsafe(seq1, open_to_close)
            #     head_tail1 = head1 + tail1
            #     all_decomp1[seq1] = a1, b1, head1, tail1, head_tail1

            # try:
            t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[seq2]
            # except KeyError:
            #     a2, b2, head2, tail2 = balanced_decomp_unsafe(seq2, open_to_close)
            #     head_tail2 = head2 + tail2
            #     all_decomp2[seq2] = a2, b2, head2, tail2, head_tail2

            # NOTE: when a lookup below fails, the corresponding candidate
            # keeps whatever value an earlier frame left in it. Candidates
            # are only consumed when no frame is missing.

            # Case 2: The current edge in sequence1 is deleted
            try:
                try_key = (head_tail1, seq2)
                cand1 = _results[try_key]
            except KeyError:
                missing_frames.append(try_key)

            # Case 3: The current edge in sequence2 is deleted
            try:
                try_key = (seq1, head_tail2)
                cand2 = _results[try_key]
            except KeyError:
                missing_frames.append(try_key)

            # Case 1: The LCS involves this edge
            affinity = node_affinity(t1, t2)
            if affinity:
                try:
                    try_key = (head1, head2)
                    pval_h, new_heads = _results[try_key]
                except KeyError:
                    missing_frames.append(try_key)

                try:
                    try_key = (tail1, tail2)
                    pval_t, new_tails = _results[try_key]
                except KeyError:
                    missing_frames.append(try_key)

                if not missing_frames:
                    # Keep both edges and combine the sub-solutions
                    new_head1, new_head2 = new_heads
                    new_tail1, new_tail2 = new_tails

                    subseq1 = a1 + new_head1 + b1 + new_tail1
                    subseq2 = a2 + new_head2 + b2 + new_tail2

                    res3 = (subseq1, subseq2)
                    val3 = pval_h + pval_t + affinity
                    cand3 = (val3, res3)
            else:
                # The edges do not match; make sure this case never wins
                cand3 = (-1, None)

            if missing_frames:
                # We did not solve this frame yet
                # Revisit this frame after its dependencies are solved
                stack.append(key)
                stack.extend(missing_frames)
                # stack.extend(missing_frames[::-1])
            else:
                # We solved the frame
                _results[key] = max(cand1, cand2, cand3)

    # The last frame taken off the stack is the solution for the full input
    val, best = _results[key]
    found = (best, val)
    return found
def likely_overlaps(cls, pfiles1, pfiles2, thresh=0.2, verbose=1):
    """
    This is similar to finding duplicates, but between two sets of files

    Files are grouped via ``ProgressiveFile.group_pfiles``. Groups that mix
    files from both inputs are refined until they have been hashed far
    enough to satisfy the threshold.

    Args:
        pfiles1 (list): first list of progressive files
        pfiles2 (list): second list of progressive files
        thresh (float | dict, default=0.2): if a dict, the optional keys
            ``'frac'`` and ``'byte'`` give the fraction / byte thresholds.
            Otherwise the same value is used for both thresholds.
        verbose (int): passed to ``ProgressiveFile.parallel_refine``

    Returns:
        Tuple[dict, dict, dict]: ``(overlap, only1, only2)``, each mapping a
            group key to a group of files. ``overlap`` holds the groups
            with files from both inputs, ``only1`` / ``only2`` the groups
            with files from only one input.

    Example:
        >>> fpaths = _demodata_files(num_files=100, rng=0)
        >>> fpaths1 = fpaths[0::2]
        >>> fpaths2 = fpaths[1::2]
        >>> pfiles1 = [ProgressiveFile(f) for f in fpaths1]
        >>> pfiles2 = [ProgressiveFile(f) for f in fpaths2]
        >>> overlap, only1, only2 = ProgressiveFile.likely_overlaps(pfiles1, pfiles2)
        >>> print(len(overlap))
        >>> print(len(only1))
        >>> print(len(only2))
    """
    final_groups = {}

    # Mark each set of files, so we only refine if a duplicate group
    # contains elements from multiple sets
    set1 = {id(p) for p in pfiles1}
    set2 = {id(p) for p in pfiles2}

    def _membership(p):
        # Returns which of the two inputs (1 and / or 2) contain ``p``
        partof = []
        pid = id(p)
        if pid in set1:
            partof.append(1)
        if pid in set2:
            partof.append(2)
        return partof

    pfiles = pfiles1 + pfiles2
    active_groups = [pfiles]

    mode = 'thread'
    max_workers = 6

    if isinstance(thresh, dict):
        frac_thresh = thresh.get('frac', None)
        byte_thresh = thresh.get('byte', None)
    else:
        # NOTE(review): a scalar threshold is also used as the byte
        # threshold, so the default of 0.2 looks like it is satisfied as
        # soon as any bytes are hashed - confirm this is intended.
        frac_thresh = thresh
        byte_thresh = thresh

    while active_groups:
        group_sizes = list(map(len, active_groups))
        total_active = sum(group_sizes)
        print('Checking {} active groups with {} items'.format(
            len(active_groups), total_active))

        groups = ub.dict_union(
            *[ProgressiveFile.group_pfiles(g) for g in active_groups])

        # Mark all groups that need refinement
        refine_items = []
        next_groups = []
        for key, group in groups.items():
            membership = {m for p in group for m in _membership(p)}
            group_frac = key[3]
            group_byte = key[1]

            # Check if we have hashed enough of the file by fraction or
            # number of bytes.
            terms = []
            if frac_thresh is not None:
                terms.append(group_frac >= frac_thresh)
            if byte_thresh is not None:
                terms.append(group_byte >= byte_thresh)
            # Without any threshold every group is accepted immediately
            good_enough = any(terms) or len(terms) == 0

            if not good_enough and len(membership) > 1 and len(group) > 1:
                next_groups.append(group)
                needs_refine = [
                    item for item in group
                    if not item.complete_enough(frac_thresh=frac_thresh,
                                                byte_thresh=byte_thresh)
                ]
                refine_items.extend(needs_refine)
            else:
                # Any group that doesnt need refinment is added to the
                # solution and will not appear in the next active group
                final_groups[key] = group

        # Refine any item that needs it
        if len(refine_items):
            # TODO: if there are few enough items, just refine to the
            # threshold?
            ProgressiveFile.parallel_refine(refine_items, mode=mode,
                                            step_idx='next',
                                            max_workers=max_workers,
                                            verbose=verbose)

        # Continue refinement as long as there are active groups
        active_groups = next_groups

    # Split the final groups by which inputs contributed to them
    only1 = {}
    only2 = {}
    overlap = {}
    for key, group in final_groups.items():
        membership = {m for p in group for m in _membership(p)}
        if len(membership) == 1:
            if ub.peek(membership) == 1:
                only1[key] = group
            else:
                only2[key] = group
        else:
            overlap[key] = group
    return overlap, only1, only2
def encode(self, item, axis=0, mode=1):
    """
    Given a dictionary containing preloaded components of the network
    inputs, build a concatenated (fused) network representations of each
    input stream.

    Args:
        item (Dict[str, Tensor]): a batch item containing unfused parts.
            each key should be a single-stream (optionally early fused)
            channel key.
        axis (int, default=0): concatenation dimension
        mode (int, default=1): implementation to use. ``1`` looks up a
            cached mapping and concatenates only when a stream has more
            than one part; ``0`` decodes every item into its components and
            concatenates them for each stream.

    Returns:
        Dict[str, Tensor]: mapping between input stream and its early fused
            tensor input.

    Raises:
        ValueError: if ``item`` is empty
        KeyError: if ``mode`` is not 0 or 1

    Example:
        >>> from kwcoco.channel_spec import *  # NOQA
        >>> import numpy as np
        >>> dims = (4, 4)
        >>> item = {
        >>>     'rgb': np.random.rand(3, *dims),
        >>>     'disparity': np.random.rand(1, *dims),
        >>>     'flowx': np.random.rand(1, *dims),
        >>>     'flowy': np.random.rand(1, *dims),
        >>> }
        >>> # Complex Case
        >>> self = ChannelSpec('rgb,disparity,rgb|disparity|flowx|flowy,flowx|flowy')
        >>> fused = self.encode(item)
        >>> input_shapes = ub.map_vals(lambda x: x.shape, fused)
        >>> print('input_shapes = {}'.format(ub.repr2(input_shapes, nl=1)))
        >>> # Simpler case
        >>> self = ChannelSpec('rgb|disparity')
        >>> fused = self.encode(item)
        >>> input_shapes = ub.map_vals(lambda x: x.shape, fused)
        >>> print('input_shapes = {}'.format(ub.repr2(input_shapes, nl=1)))

    Example:
        >>> # Case where we have to break up early fused data
        >>> import numpy as np
        >>> dims = (40, 40)
        >>> item = {
        >>>     'rgb|disparity': np.random.rand(4, *dims),
        >>>     'flowx': np.random.rand(1, *dims),
        >>>     'flowy': np.random.rand(1, *dims),
        >>> }
        >>> # Complex Case
        >>> self = ChannelSpec('rgb,disparity,rgb|disparity,rgb|disparity|flowx|flowy,flowx|flowy,flowx,disparity')
        >>> inputs = self.encode(item)
        >>> input_shapes = ub.map_vals(lambda x: x.shape, inputs)
        >>> print('input_shapes = {}'.format(ub.repr2(input_shapes, nl=1)))

        >>> # xdoctest: +REQUIRES(--bench)
        >>> #self = ChannelSpec('rgb|disparity,flowx|flowy')
        >>> import timerit
        >>> ti = timerit.Timerit(100, bestof=10, verbose=2)
        >>> for timer in ti.reset('mode=simple'):
        >>>     with timer:
        >>>         inputs = self.encode(item, mode=0)
        >>> for timer in ti.reset('mode=minimize-concat'):
        >>>     with timer:
        >>>         inputs = self.encode(item, mode=1)

    Ignore:
        # Profiling snippet (not executed as a doctest)
        import xdev
        _ = xdev.profile_now(self.encode)(item, mode=1)
    """
    import kwarray
    if len(item) == 0:
        raise ValueError('Cannot encode empty item')
    # Pick the array backend based on the type of the first value
    _impl = kwarray.ArrayAPI.coerce(ub.peek(item.values()))

    parsed = self.parse()
    # unique = self.unique()

    # TODO: This can be made much more efficient by determining if the
    # channels item can be directly translated to the result inputs. We
    # probably don't need to do the full decoding each and every time.

    if mode == 1:
        # Slightly more complex implementation that attempts to minimize
        # concat operations.
        # The arguments are converted to sorted tuples before they are
        # passed to the cached mapping function
        item_keys = tuple(sorted(item.keys()))
        parsed_items = tuple(
            sorted([(k, tuple(v)) for k, v in parsed.items()]))
        new_fused_indices = _cached_single_fused_mapping(item_keys,
                                                         parsed_items,
                                                         axis=axis)

        fused = {}
        for key, idx_list in new_fused_indices.items():
            parts = [
                item[item_key][item_sl] for item_key, item_sl in idx_list
            ]
            if len(parts) == 1:
                # A single part needs no concatenation
                fused[key] = parts[0]
            else:
                fused[key] = _impl.cat(parts, axis=axis)
    elif mode == 0:
        # Simple implementation that always does the full break down of
        # item components.
        components = {}
        # Determine the layout of the channels in the input item
        key_specs = {key: ChannelSpec(key) for key in item.keys()}
        for key, spec in key_specs.items():
            decoded = spec.decode({key: item[key]}, axis=axis)
            for subkey, subval in decoded.items():
                components[subkey] = subval

        fused = {}
        for key, parts in parsed.items():
            fused[key] = _impl.cat([components[k] for k in parts],
                                   axis=axis)
    else:
        raise KeyError(mode)

    return fused
def code_list(self):
    """
    Return the parsed value of the only stream in this spec.

    Raises:
        Exception: if the spec contains more than one stream.
    """
    streams = self.parse()
    if len(streams) <= 1:
        return ub.peek(streams.values())
    raise Exception('Can only work on single-streams. '
                    'TODO make class for single streams')
def _lcs_iter_prehash(full_seq1, full_seq2, open_to_close, node_affinity, open_to_tok):
    """
    Version of the lcs iterative algorithm where we precompute hash values

    This is actually slower than the simple version

    Args:
        full_seq1: the first sequence
        full_seq2: the second sequence
        open_to_close: passed to ``balanced_decomp_unsafe``
        node_affinity (callable): called with the tokens of the current
            edge of each sequence; a truthy result is added to the score
            when both edges are kept.
        open_to_tok: maps the first opening item of a sequence to its token

    Returns:
        tuple: ``(best, val)`` where ``best`` is the pair of selected
            subsequences and ``val`` is its score.
    """
    def decomp_info(seq, open_to_close):
        # Decompose a sequence and precompute the hashes of its parts
        pop_open, pop_close, head, tail = balanced_decomp_unsafe(
            seq, open_to_close)
        head_tail = head + tail
        head_key = hash(head)
        tail_key = hash(tail)
        head_tail_key = hash(head_tail)
        tok = open_to_tok[pop_open[0]]
        a = pop_open
        b = pop_close
        info = (tok, seq, head, tail, head_tail, head_key, tail_key, head_tail_key, a, b)
        return info

    def gen_decomp_v2(seq, open_to_close):
        # Maps every sequence reachable by decomposition to its info tuple
        _genmemo = {}
        def _gen(seq):
            if seq:
                key = hash(seq)
                # Each distinct hash is expanded only once
                if key not in _genmemo:
                    info = decomp_info(seq, open_to_close)
                    head, tail, head_tail = info[2:5]
                    _genmemo[key] = info
                    yield (seq, _genmemo[key])
                    yield from _gen(head_tail)
                    yield from _gen(head)
                    yield from _gen(tail)
        all_decomp = dict(_gen(seq))
        return all_decomp

    all_decomp1 = gen_decomp_v2(full_seq1, open_to_close)
    all_decomp2 = gen_decomp_v2(full_seq2, open_to_close)

    # Decomposition info indexed by the hash of the sequence
    key_decomp1 = {}
    key_decomp2 = {}

    # Memoized solutions: maps (hash1, hash2) -> (value, (subseq1, subseq2))
    # NOTE(review): results are keyed by hash values only, so distinct
    # sequences with equal hashes would share an entry.
    _results = {}
    # Populate base cases
    empty1 = type(ub.peek(all_decomp1.keys()))()
    empty2 = type(ub.peek(all_decomp2.keys()))()
    empty1_key = hash(empty1)
    empty2_key = hash(empty2)
    best = (empty1, empty2)
    base_result = (0, best)
    # Anything compared against an empty sequence scores zero
    for seq1, info1 in all_decomp1.items():
        seq1_key = hash(seq1)
        head1_key, tail1_key, head_tail1_key = all_decomp1[seq1][5:8]
        _results[(seq1_key, empty2_key)] = base_result
        _results[(head1_key, empty2_key)] = base_result
        _results[(tail1_key, empty2_key)] = base_result
        _results[(head_tail1_key, empty2_key)] = base_result
        key_decomp1[seq1_key] = info1

    for seq2, info2 in all_decomp2.items():
        seq2_key = hash(seq2)
        head2_key, tail2_key, head_tail2_key = all_decomp2[seq2][5:8]
        _results[(empty1_key, seq2_key)] = base_result
        _results[(empty1_key, head2_key)] = base_result
        _results[(empty1_key, tail2_key)] = base_result
        _results[(empty1_key, head_tail2_key)] = base_result
        key_decomp2[seq2_key] = info2

    full_seq1_key = hash(full_seq1)
    full_seq2_key = hash(full_seq2)
    key0 = (full_seq1_key, full_seq2_key)
    # A frame carries the hash key together with the actual sequences
    frame0 = key0, full_seq1, full_seq2
    stack = [frame0]
    missing_frames = []
    while stack:
        frame = stack.pop()
        key, seq1, seq2 = frame
        seq1_key, seq2_key = key
        if key not in _results:
            missing_frames.clear()

            try:
                info1 = key_decomp1[seq1_key]
            except KeyError:
                info1 = decomp_info(seq1, open_to_close)
                key_decomp1[seq1_key] = info1
            tok1, seq1, head1, tail1, head_tail1, head1_key, tail1_key, head_tail1_key, a1, b1 = info1

            try:
                info2 = key_decomp2[seq2_key]
            except KeyError:
                info2 = decomp_info(seq2, open_to_close)
                key_decomp2[seq2_key] = info2
            tok2, seq2, head2, tail2, head_tail2, head2_key, tail2_key, head_tail2_key, a2, b2 = info2

            affinity = node_affinity(tok1, tok2)

            # NOTE: when a lookup below fails, the corresponding candidate
            # keeps whatever value an earlier frame left in it. Candidates
            # are only consumed when no frame is missing.

            # Case 2: The current edge in sequence1 is deleted
            try:
                try_key = (head_tail1_key, seq2_key)
                cand1 = _results[try_key]
            except KeyError:
                miss_frame = try_key, head_tail1, seq2
                missing_frames.append(miss_frame)

            # Case 3: The current edge in sequence2 is deleted
            try:
                try_key = (seq1_key, head_tail2_key)
                cand2 = _results[try_key]
            except KeyError:
                miss_frame = try_key, seq1, head_tail2
                missing_frames.append(miss_frame)

            # Case 1: The LCS involves this edge
            if affinity:
                try:
                    try_key = (head1_key, head2_key)
                    pval_h, new_heads = _results[try_key]
                except KeyError:
                    miss_frame = try_key, head1, head2
                    missing_frames.append(miss_frame)

                try:
                    try_key = (tail1_key, tail2_key)
                    pval_t, new_tails = _results[try_key]
                except KeyError:
                    miss_frame = try_key, tail1, tail2
                    missing_frames.append(miss_frame)

                if not missing_frames:
                    # Keep both edges and combine the sub-solutions
                    new_head1, new_head2 = new_heads
                    new_tail1, new_tail2 = new_tails

                    subseq1 = a1 + new_head1 + b1 + new_tail1
                    subseq2 = a2 + new_head2 + b2 + new_tail2

                    res3 = (subseq1, subseq2)
                    val3 = pval_h + pval_t + affinity
                    cand3 = (val3, res3)
            else:
                # The edges do not match; make sure this case never wins
                cand3 = (-1, None)

            if missing_frames:
                # We did not solve this frame yet
                # Revisit this frame after its dependencies are solved
                stack.append(frame)
                stack.extend(missing_frames[::-1])
            else:
                # We solved the frame
                _results[key] = max(cand1, cand2, cand3)

    # The stack pop is our solution
    (val, best) = _results[key]
    found = (best, val)
    return found
def _lcs_iter_simple_alt2(full_seq1, full_seq2, open_to_close, node_affinity, open_to_tok):
    """
    Depth first stack trajectory and replace try except statements with ifs

    An explicit stack of ``(seq1, seq2)`` keys is used instead of recursion.
    A key stays on top of the stack until every sub-problem it needs has an
    entry in ``_results``; whenever one is missing, that sub-problem is
    pushed and the loop restarts so it gets solved first.

    Args:
        full_seq1: first sequence; must be a key of the decomposition table
            built by ``generate_all_decompositions`` for it.
        full_seq2: second sequence, handled the same way as ``full_seq1``.
        open_to_close: forwarded unchanged to ``generate_all_decompositions``.
        node_affinity: callable invoked as ``node_affinity(t1, t2)``. A falsy
            result disables the "match" case for that pair; a truthy result
            is added to the score of the match.
        open_to_tok: forwarded unchanged to ``generate_all_decompositions``.

    Returns:
        tuple: ``(best, val)`` where ``val`` is the score stored for the full
        problem and ``best`` is the ``(subseq1, subseq2)`` pair stored with it.
    """
    all_decomp1 = generate_all_decompositions(full_seq1, open_to_close, open_to_tok)
    all_decomp2 = generate_all_decompositions(full_seq2, open_to_close, open_to_tok)

    # The top-level problem; its entry in ``_results`` is the final answer.
    key0 = (full_seq1, full_seq2)
    frame0 = key0
    stack = [frame0]

    # Memo table: (seq1, seq2) -> (value, (subseq1, subseq2))
    _results = {}
    # Populate base cases
    # An "empty" sequence is made by calling the type of an existing key with
    # no arguments, so it has the same type as the real sequences.
    empty1 = type(ub.peek(all_decomp1.keys()))()
    empty2 = type(ub.peek(all_decomp2.keys()))()
    best = (empty1, empty2)
    base_result = (0, best)
    # Anything paired with an empty sequence scores 0 with an empty result.
    for seq1 in all_decomp1.keys():
        key1 = seq1
        t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[key1]
        _results[(seq1, empty2)] = base_result
        _results[(head1, empty2)] = base_result
        _results[(tail1, empty2)] = base_result
        _results[(head_tail1, empty2)] = base_result

    for seq2 in all_decomp2.keys():
        key2 = seq2
        t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[key2]
        _results[(empty1, seq2)] = base_result
        _results[(empty1, head2)] = base_result
        _results[(empty1, tail2)] = base_result
        _results[(empty1, head_tail2)] = base_result

    # Drop the setup-only names so they cannot be confused with the loop
    # variables below.
    del frame0
    del empty1
    del empty2
    del best
    del base_result

    while stack:
        # Peek (do not pop): the key is only removed once it is solved.
        key = stack[-1]
        if key not in _results:
            seq1, seq2 = key
            # NOTE(review): a/b are presumably the opening and closing
            # tokens that wrap ``head`` -- confirm against
            # generate_all_decompositions.
            t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[seq1]
            t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[seq2]

            # Case 2: The current edge in sequence1 is deleted
            try_key = (head_tail1, seq2)
            if try_key in _results:
                cand1 = _results[try_key]
            else:
                # stack.append(key)
                # Dependency not solved yet: solve it first, then revisit
                # ``key`` (which is still on the stack beneath it).
                stack.append(try_key)
                continue

            # Case 3: The current edge in sequence2 is deleted
            try_key = (seq1, head_tail2)
            if try_key in _results:
                cand2 = _results[try_key]
            else:
                # stack.append(key)
                stack.append(try_key)
                continue

            # Case 1: The LCS involves this edge
            affinity = node_affinity(t1, t2)
            if affinity:
                try_key = (head1, head2)
                if try_key in _results:
                    pval_h, new_heads = _results[try_key]
                else:
                    # stack.append(key)
                    stack.append(try_key)
                    continue

                try_key = (tail1, tail2)
                if try_key in _results:
                    pval_t, new_tails = _results[try_key]
                else:
                    # stack.append(key)
                    stack.append(try_key)
                    continue

                # Stitch the solved head and tail sub-results back together
                # around this pair of tokens.
                new_head1, new_head2 = new_heads
                new_tail1, new_tail2 = new_tails

                subseq1 = a1 + new_head1 + b1 + new_tail1
                subseq2 = a2 + new_head2 + b2 + new_tail2

                res3 = (subseq1, subseq2)
                val3 = pval_h + pval_t + affinity
                cand3 = (val3, res3)
            else:
                # Sentinel that always loses the ``max`` below because the
                # other candidates come from base results that start at 0.
                cand3 = (-1, None)

            # We solved the frame
            _results[key] = max(cand1, cand2, cand3)
        # Reached both for freshly solved keys and for keys that were pushed
        # more than once and are already in ``_results``.
        stack.pop()

    val, best = _results[key0]
    found = (best, val)
    return found
def benchmark_hash_data():
    """
    Benchmark ``ub.hash_data`` for several hashers over growing inputs.

    For every scale ``s`` in ``range(5, 13)`` a list of ``2 ** s`` copies of
    a small nested item is hashed with each hasher, timed with
    ``ub.Timerit``. Per-size rankings are printed, followed by summary
    tables, and optionally a plot when ``--show`` is given.

    The ``--convert`` command line value (default ``'True'``) is compared
    case-insensitively against ``'true'`` and passed to ``ub.hash_data`` as
    its ``convert`` argument.

    CommandLine:
        python ~/code/ubelt/dev/bench_hash.py --convert=True --show
        python ~/code/ubelt/dev/bench_hash.py --convert=False --show
    """
    import ubelt as ub
    #ITEM = 'JUST A STRING' * 100
    ITEM = [0, 1, 'a', 'b', ['JUST A STRING'] * 4]
    HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3']
    scales = list(range(5, 13))
    results = ub.AutoDict()
    # Using json is faster, or at least as fast, in most cases
    # xxhash is also significantly faster than sha512
    # The value is lower-cased, so it must be compared with the lower-case
    # literal; comparing against 'True' could never succeed.
    convert = ub.argval('--convert', default='True').lower() == 'true'
    print('convert = {!r}'.format(convert))
    ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms')
    for s in ub.ProgIter(scales, desc='benchmark', verbose=3):
        N = 2**s
        print(' --- s={s}, N={N} --- '.format(s=s, N=N))
        data = [ITEM] * N
        for hasher in HASHERS:
            # Timerit drives the repetitions; the loop variable is unused.
            for timer in ti.reset(hasher):
                ub.hash_data(data, hasher=hasher, convert=convert)
            results[hasher].update({N: ti.mean()})
        # Rank the hashers for this input size, fastest first.
        col = {h: results[h][N] for h in HASHERS}
        sortx = ub.argsort(col)
        ranking = ub.dict_subset(col, sortx)
        print('walltime: ' + ub.repr2(ranking, precision=9, nl=0))
        best = next(iter(ranking))
        #pairs = list(ub.iter_window( 2))
        # Express every hasher's time relative to the fastest one.
        pairs = [(k, best) for k in ranking]
        ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs]
        nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs]
        relratios = ub.odict(zip(nicekeys, ratios))
        print('speedup: ' + ub.repr2(relratios, precision=4, nl=0))
    # xdoc +REQUIRES(--show)
    # import pytest
    # pytest.skip()
    import pandas as pd
    df = pd.DataFrame.from_dict(results)
    df.columns.name = 'hasher'
    df.index.name = 'N'
    # Start from an empty frame that shares the index of ``df``.
    ratios = df.copy().drop(columns=df.columns)
    for k1, k2 in [('sha512', 'xxh32'), ('sha1', 'xxh32'), ('xxh64', 'xxh32')]:
        ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2]
    print()
    print('Seconds per iteration')
    print(df.to_string(float_format='%.9f'))
    print()
    print('Ratios of seconds')
    print(ratios.to_string(float_format='%.2f'))
    print()
    print('Average Ratio (over all N)')
    print('convert = {!r}'.format(convert))
    print(ratios.mean().sort_values())
    if ub.argflag('--show'):
        import kwplot
        kwplot.autompl()
        xdata = sorted(ub.peek(results.values()).keys())
        ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results)
        kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds',
                          title='convert = {}'.format(convert))
        kwplot.show_if_requested()
def 数组_弹出(iterable):
    """
    Return whatever ``ub.peek`` yields for ``iterable``.

    The name reads roughly as "array pop", but nothing is explicitly removed
    here; the call is a plain delegate to ``ub.peek``.
    """
    peeked = ub.peek(iterable)
    return peeked
def hard_drive_failure_analysis():
    """
    Explore Backblaze drive-stat spreadsheets and Poisson failure odds.

    The function tries to download the quarterly chart-data archives for
    2017 through 2020, reads the last-row failure rate out of a fixed list
    of cached archives, and then prints Poisson probabilities of one / two /
    at-least-one / at-least-two failures for several time horizons using an
    annual failure rate of 1.45%.

    Nothing is returned; all results are printed.

    Raises:
        Exception: if a spreadsheet has no cell whose normalized text is one
            of ``afr``, ``annualizedfailurerate`` or ``failurerate``.

    References:
        https://www.backblaze.com/blog/backblaze-hard-drive-stats-q2-2020/

        https://f001.backblazeb2.com/file/Backblaze_Blog/Q2_2020_Drive_Stats_Chart_Data.zip
        https://f001.backblazeb2.com/file/Backblaze_Blog/Q2_2019_Drive_Stats_Chart_Data.zip
    """
    import ubelt as ub
    import random
    import time

    url_template = 'https://f001.backblazeb2.com/file/Backblaze_Blog/{}_{}_Drive_Stats_Chart_Data.zip'
    success_urls = []
    failed_urls = []
    got_fpaths = []
    for year in range(2017, 2021):
        for q in [1, 2, 3, 4]:
            url = url_template.format('Q' + str(q), year)
            print('url = {!r}'.format(url))
            try:
                # Play nice, don't crash their servers
                fpath = ub.grabdata(url)
                print('Got fpath = {!r}'.format(fpath))
                success_urls.append(url)
                got_fpaths.append(fpath)
                if 0:
                    # only need to do this the first time
                    time.sleep(1 + random.random())
            except Exception:
                # Best effort: not every quarter has an archive at this URL.
                print('Failed to grab url = {!r}'.format(url))
                failed_urls.append(url)

    # NOTE(review): this replaces the downloaded list with machine-specific
    # cache paths -- presumably the subset of archives known to parse.
    # Confirm before running on another machine.
    got_fpaths = [
        '/home/joncrall/.cache/ubelt/Q3_2017_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q1_2018_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q2_2018_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q3_2018_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q1_2019_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q2_2019_Drive_Stats_Chart_Data.zip',
        '/home/joncrall/.cache/ubelt/Q2_2020_Drive_Stats_Chart_Data.zip'
    ]

    from torch_liberator.util.util_zip import zopen, split_archive
    # NOTE(review): the result is discarded; this looks like leftover
    # debugging -- confirm before removing.
    split_archive(fpath)

    import zipfile
    import pandas as pd

    rate_headers = {'afr', 'annualizedfailurerate', 'failurerate'}

    def _find_rate_cell(table):
        # Return (row label, column position) of the first failure-rate
        # header cell, or None when the table has none.
        for rx, row in table.iterrows():
            for cx, col in enumerate(row):
                if isinstance(col, str):
                    col = col.replace('\n', '').replace(' ', '').lower()
                    print('col = {!r}'.format(col))
                    if col in rate_headers:
                        return (rx, cx)
        return None

    rates = []
    for fpath in got_fpaths:
        # Only the member listing is needed, so close the archive right away.
        with zipfile.ZipFile(fpath, 'r') as myzip:
            name = ub.peek(
                [name for name in myzip.namelist() if not name.startswith('_')])
        internal_fpath = fpath + '/' + name

        internal_file = zopen(internal_fpath, mode='rb')
        table = pd.read_excel(internal_file)

        found = _find_rate_cell(table)
        if found is None:
            raise Exception(
                'no failure-rate column found in {!r}'.format(fpath))
        rx, cx = found
        print('table = {!r}'.format(table))

        # The last row is used as the overall rate for the spreadsheet.
        final_rate = table.iloc[-1].iloc[cx]
        rates.append(final_rate)

        drive_fails = table.iloc[-1].iloc[-2]
        drive_days = table.iloc[-1].iloc[-3]
        drive_count = table.iloc[-1].iloc[-4]
        print('final_rate = {!r}'.format(final_rate))

    # Lets say just overall every year your HDD has a 1.45% chance of failing
    annualize_fail_rate = 0.0145

    # rate = expected # events in 1 time period
    #
    # P(k events in t timesteps) = exp(- rate * t) * ((rate * t) ** k) / k!
    #
    # The probability we wait more than t for an event is
    #
    # P(T > t) = exp(-rate * t)
    #
    # The probability that the event will happen before time t is:
    #
    # P(T <= t) = 1 - exp(-rate * t)

    import scipy.stats
    import scipy.special  # used below for factorial
    import numpy as np
    # According to [1] There is a ~1.45% chance of a drive failing each year
    # .. [1] https://www.backblaze.com/blog/backblaze-hard-drive-stats-q2-2020/

    # We can model a Poisson distribution to ask some questions
    λ = 1.45 / 100   # probability of failure within a year
    y = 1            # number of years
    k = 1            # number of events (failures)

    def probabilities_for_y_years(y):
        ##
        ##
        # The PMF is the probability that exactly k failures occur in y years
        print('\nIn y={} years we can expect'.format(y))

        rv = scipy.stats.poisson(mu=λ * y)

        k = 1
        p_one_fail = rv.pmf(k)
        print('p_one_fail = {:.4f}%'.format(p_one_fail * 100))
        k = 2
        p_two_fail = rv.pmf(k)
        print('p_two_fail = {:.4f}%'.format(p_two_fail * 100))

        # The CDF(k) is the probability the k or fewer failures occur in y years.
        # So, the probability k or more events occur is 1 - CDF(k - 1)
        # k or fewer, so 1 - CDF is the probability more than k events occur
        k = 1
        p_atleast_one_fail = 1 - rv.cdf(k - 1)
        print('p_atleast_one_fail = {:.4f}%'.format(p_atleast_one_fail * 100))

        k = 2
        p_atleast_two_fail = 1 - rv.cdf(k - 1)
        print('p_atleast_two_fail = {:.4f}%'.format(p_atleast_two_fail * 100))

    probabilities_for_y_years(y=1)
    probabilities_for_y_years(y=5)
    probabilities_for_y_years(y=10)
    probabilities_for_y_years(y=15)

    ##
    ##
    # The distribution object inside ``probabilities_for_y_years`` is local
    # to that function, so build one here for the outer ``y``.
    rv = scipy.stats.poisson(mu=λ * y)

    # The PMF is the probability that exactly k failures occur in y years
    k = 1
    p_one_fail = rv.pmf(k)
    print('p_one_fail = {:.4f}%'.format(p_one_fail * 100))
    k = 2
    p_two_fail = rv.pmf(k)
    print('p_two_fail = {:.4f}%'.format(p_two_fail * 100))

    # The CDF(k) is the probability the k or fewer failures occur in y years.
    # So, the probability k or more events occur is 1 - CDF(k - 1)
    # k or fewer, so 1 - CDF is the probability more than k events occur
    k = 1
    p_atleast_one_fail = 1 - rv.cdf(k - 1)
    print('p_atleast_one_fail = {:.4f}%'.format(p_atleast_one_fail * 100))

    k = 2
    p_atleast_two_fail = 1 - rv.cdf(k - 1)
    print('p_atleast_two_fail = {:.4f}%'.format(p_atleast_two_fail * 100))

    # Probability k disks fail after y years
    # (the same PMF written out in closed form)
    k = 1
    p_one_fail = ((λ * y)**k) * np.exp(-λ * y) / (scipy.special.factorial(k))
    print('p_one_fail = {:.4f}%'.format(p_one_fail * 100))

    k = 2
    p_two_fail = ((λ * y)**k) * np.exp(-λ * y) / (scipy.special.factorial(k))
    print('p_two_fail = {:.4f}%'.format(p_two_fail * 100))
def convert_camvid_raw_to_coco(camvid_raw_info):
    """
    Converts the raw camvid format to an MSCOCO based format, (which lets us
    use kwcoco's COCO backend).

    Categories are read from the label file (one tab-separated
    ``"R G B<TAB>name"`` entry per line) and receive the packed color
    ``(r << 16) + (g << 8) + b`` as their id. Every image then gets one
    annotation per category id found in its label mask.

    Args:
        camvid_raw_info: mapping providing ``dset_root``, ``img_paths``,
            ``label_path`` and ``mask_paths``.

    Returns:
        the populated ``kwcoco.CocoDataset``.

    Raises:
        Exception: when a mask contains a color that is not a known
            category (a handful of pixels in image 618 are tolerated).

    Example:
        >>> # xdoctest: +REQUIRES(--download)
        >>> camvid_raw_info = grab_raw_camvid()
        >>> # test with a reduced set of data
        >>> del camvid_raw_info['img_paths'][2:]
        >>> del camvid_raw_info['mask_paths'][2:]
        >>> dset = convert_camvid_raw_to_coco(camvid_raw_info)
        >>> # xdoctest: +REQUIRES(--show)
        >>> import kwplot
        >>> plt = kwplot.autoplt()
        >>> kwplot.figure(fnum=1, pnum=(1, 2, 1))
        >>> dset.show_image(gid=1)
        >>> kwplot.figure(fnum=1, pnum=(1, 2, 2))
        >>> dset.show_image(gid=2)
    """
    import re
    import kwimage
    import kwcoco
    print('Converting CamVid to MS-COCO format')

    wanted = ['dset_root', 'img_paths', 'label_path', 'mask_paths']
    dset_root, img_paths, label_path, mask_paths = ub.take(
        camvid_raw_info, wanted)

    # One category per non-empty line of the label file.
    categories = []
    for entry in ub.readfrom(label_path).split('\n'):
        if not entry:
            continue
        color_text, name = re.split('\t+', entry)
        red, green, blue = map(int, color_text.split(' '))
        categories.append({
            # Parse the special camvid format
            'id': (red << 16) + (green << 8) + (blue << 0),
            'name': name,
            'color': (red, green, blue),
        })

    # Image ids are assigned in path order, starting at 1.
    images = [
        {
            'id': gid,
            'file_name': img_fname,
            # nonstandard image field
            'segmentation': mask_fname,
        }
        for gid, (img_fname, mask_fname) in enumerate(
            zip(img_paths, mask_paths), start=1)
    ]

    dataset = {
        'img_root': dset_root,
        'images': images,
        'categories': categories,
        'annotations': [],
    }

    dset = kwcoco.CocoDataset(dataset)
    dset.rename_categories({'Void': 'background'})

    background = dset.name_to_cat['background']
    assert background['id'] == 0
    dset.name_to_cat['background'].setdefault('alias', []).append('Void')

    if False:
        _define_camvid_class_hierarcy(dset)

    # TODO: Binarize CCs (and efficiently encode if possible)
    import numpy as np

    bad_info = []
    once = False

    # Add images
    dset.remove_annotations(list(dset.index.anns.keys()))
    for gid, img in ub.ProgIter(dset.imgs.items(), desc='parse label masks'):
        mask_fpath = join(dset_root, img['segmentation'])

        rgb_mask = kwimage.imread(mask_fpath, space='rgb')
        r, g, b = rgb_mask.T.astype(np.int64)
        cid_mask = np.ascontiguousarray(rgb_to_cid(r, g, b).T)

        # Category id 0 (background) never becomes an annotation.
        for cid in set(np.unique(cid_mask)) - {0}:
            if cid in dset.cats:
                c_mask = (cid == cid_mask).astype(np.uint8)
                mask = kwimage.Mask(c_mask, 'c_mask')
                box = kwimage.Boxes([mask.get_xywh()], 'xywh')
                dset.add_annotation(
                    category_id=cid,
                    image_id=gid,
                    bbox=ub.peek(box.to_coco()),
                    segmentation=mask.to_coco(),
                )
                continue

            if gid != 618:
                raise Exception(
                    'UNKNOWN cid = {!r} in gid={!r}'.format(cid, gid))

            # Handle a known issue with image 618
            c_mask = (cid == cid_mask).astype(np.uint8)
            total_bad = c_mask.sum()
            if total_bad >= 32:
                raise Exception('more bad pixels than expected')
            if not once:
                print(
                    'gid 618 has a few known bad pixels, ignoring them'
                )
                once = True

    if 0:
        # Disabled debugging aid for inspecting unknown colors in image 618
        bad_cids = [i['cid'] for i in bad_info]
        print(sorted([c['color'] for c in dataset['categories']]))
        print(sorted(set([cid_to_rgb(i['cid']) for i in bad_info])))

        gid = 618
        img = dset.imgs[gid]
        mask_fpath = join(dset_root, img['segmentation'])
        rgb_mask = kwimage.imread(mask_fpath, space='rgb')
        r, g, b = rgb_mask.T.astype(np.int64)
        cid_mask = np.ascontiguousarray(rgb_to_cid(r, g, b).T)
        cid_hist = ub.dict_hist(cid_mask.ravel())

        bad_cid_hist = {}
        for cid in bad_cids:
            bad_cid_hist[cid] = cid_hist.pop(cid)

        import kwplot
        kwplot.autompl()
        kwplot.imshow(rgb_mask)

    if 0:
        # Disabled interactive viewer
        import kwplot
        plt = kwplot.autoplt()
        plt.clf()
        dset.show_image(1)

        import xdev
        gid_list = list(dset.imgs)
        for gid in xdev.InteractiveIter(gid_list):
            dset.show_image(gid)
            xdev.InteractiveIter.draw()

    dset._build_index()
    dset._build_hashid()
    return dset
def _coerce_datasets(config):
    """
    Build torch datasets and loaders (plus optional input stats) from config.

    Args:
        config: mapping passed to ``nh.api.Datasets.coerce``; this function
            additionally reads the keys ``workdir``, ``sampler_backend``,
            ``workers``, ``input_dims``, ``normalize_inputs``, ``channels``,
            ``batch_size``, ``num_batches`` and ``balance``.

    Returns:
        dict: with keys ``torch_datasets`` (tag -> ``SamplerDataset``),
        ``torch_loaders`` (tag -> loader from ``make_loader``) and
        ``input_stats``. ``input_stats`` is ``{}`` when
        ``config['normalize_inputs']`` is falsy; otherwise it holds the
        per-channel ``'mean'`` and ``'std'`` estimated from (up to) 1000
        training items, or a previously cached result.
    """
    import netharn as nh
    import ndsampler
    import numpy as np
    from torchvision import transforms
    coco_datasets = nh.api.Datasets.coerce(config)
    print('coco_datasets = {}'.format(ub.repr2(coco_datasets, nl=1)))
    for tag, dset in coco_datasets.items():
        dset._build_hashid(hash_pixels=False)

    workdir = ub.ensuredir(ub.expandpath(config['workdir']))
    samplers = {
        tag: ndsampler.CocoSampler(dset, workdir=workdir,
                                   backend=config['sampler_backend'])
        for tag, dset in coco_datasets.items()
    }

    for tag, sampler in ub.ProgIter(list(samplers.items()), desc='prepare frames'):
        sampler.frames.prepare(workers=config['workers'])

    # TODO: basic ndsampler torch dataset, likely has to support the transforms
    # API, bleh.

    transform = transforms.Compose([
        transforms.Resize(config['input_dims']),
        transforms.CenterCrop(config['input_dims']),
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.mul(255))
    ])

    torch_datasets = {
        key: SamplerDataset(
            sampler, transform=transform,
            # input_dims=config['input_dims'],
            # augmenter=config['augmenter'] if key == 'train' else None,
        )
        for key, sampler in samplers.items()
    }
    # self = torch_dset = torch_datasets['train']

    if config['normalize_inputs']:
        # Get stats on the dataset (todo: turn off augmentation for this)
        import kwarray
        _dset = torch_datasets['train']
        # Fixed seed so the same subset is used on every run.
        stats_idxs = kwarray.shuffle(np.arange(len(_dset)), rng=0)[0:min(1000, len(_dset))]
        stats_subset = torch.utils.data.Subset(_dset, stats_idxs)

        # Version bumped from 'v3': entries cached by the earlier code have
        # 'mean' and 'std' swapped and must not be reused.
        cacher = ub.Cacher('dset_mean', cfgstr=_dset.input_id + 'v4')
        input_stats = cacher.tryload()

        from netharn.data.channel_spec import ChannelSpec
        channels = ChannelSpec.coerce(config['channels'])

        if input_stats is None:
            # Use parallel workers to load data faster
            from netharn.data.data_containers import container_collate
            from functools import partial
            collate_fn = partial(container_collate, num_devices=1)

            loader = torch.utils.data.DataLoader(
                stats_subset,
                collate_fn=collate_fn,
                num_workers=config['workers'],
                shuffle=True,
                batch_size=config['batch_size'])

            # Track moving average of each fused channel stream
            channel_stats = {key: nh.util.RunningStats()
                             for key in channels.keys()}
            assert len(channel_stats) == 1, (
                'only support one fused stream for now')
            for batch in ub.ProgIter(loader, desc='estimate mean/std'):
                if isinstance(batch, (tuple, list)):
                    inputs = {'rgb': batch[0]}  # make assumption
                else:
                    inputs = batch['inputs']

                for key, val in inputs.items():
                    try:
                        for part in val.numpy():
                            channel_stats[key].update(part)
                    except ValueError:  # final batch broadcast error
                        pass

            perchan_input_stats = {}
            for key, running in channel_stats.items():
                perchan_stats = running.simple(axis=(1, 2))
                # Each statistic is stored under its own name.
                perchan_input_stats[key] = {
                    'std': perchan_stats['std'].round(3),
                    'mean': perchan_stats['mean'].round(3),
                }

            # Only one stream is supported (asserted above), so its stats
            # are the stats for the whole input.
            input_stats = ub.peek(perchan_input_stats.values())
            cacher.save(input_stats)
    else:
        input_stats = {}

    torch_loaders = {
        tag: dset.make_loader(
            batch_size=config['batch_size'],
            num_batches=config['num_batches'],
            num_workers=config['workers'],
            shuffle=(tag == 'train'),
            balance=(config['balance'] if tag == 'train' else None),
            pin_memory=True)
        for tag, dset in torch_datasets.items()
    }

    dataset_info = {
        'torch_datasets': torch_datasets,
        'torch_loaders': torch_loaders,
        'input_stats': input_stats
    }
    return dataset_info
def _define_camvid_class_hierarcy(dset):
    """
    Attach a ``supercategory`` to every CamVid category in ``dset``.

    Abstract grouping categories are created when they do not exist yet,
    each concrete CamVid class is then pointed at its group, and finally any
    supercategory that is still not a category itself is added.

    Args:
        dset: dataset exposing ``name_to_cat``, ``cats`` and
            ``add_category``; it is modified in place.

    Raises:
        KeyError: if one of the expected CamVid class names is not present
            in ``dset.name_to_cat``.
    """
    # add extra supercategories
    # NOTE: life-conscious, and life-inanimate are disjoint in this
    # formulation because we are restricted to a tree structure. If
    # this changes, then we can try re-encoding with multiple parents.
    group_parents = (
        # Break down the image into things that are part of the system, and
        # things that aren't
        ('background', 'root'),
        ('system', 'root'),

        # The system is made up of environmental components and actor
        # components.
        ('environment', 'system'),
        ('actor', 'system'),

        # Break actors (things with complex movement) into subtypes
        ('life-conscious', 'actor'),
        ('vehicle-land', 'actor'),
        ('actor-other', 'actor'),

        # Break the environment (things with simple movement) into subtypes
        ('life-inanimate', 'environment'),
        ('civil-structure', 'environment'),
        ('civil-notice', 'environment'),
        ('transport-way', 'environment'),

        # Subclassify transport mediums
        ('drive-way', 'transport-way'),
        ('walk-way', 'transport-way'),
    )
    for group, group_parent in group_parents:
        if group not in dset.name_to_cat:
            dset.add_category(name=group, supercategory=group_parent)
            continue
        dset.name_to_cat[group]['supercategory'] = group_parent

    # Concrete classes grouped by parent, applied in this exact order.
    members_of = (
        ('root', ['background']),
        ('environment', ['Sky']),
        ('life-conscious', ['Animal', 'Bicyclist', 'Pedestrian', 'Child']),
        ('actor-other', ['OtherMoving', 'CartLuggagePram']),
        ('vehicle-land', ['Car', 'Train', 'Truck_Bus', 'SUVPickupTruck',
                          'MotorcycleScooter']),
        ('life-inanimate', ['VegetationMisc', 'Tree']),
        ('civil-structure', ['Column_Pole', 'Fence', 'Wall', 'Building',
                             'Archway', 'Bridge', 'Tunnel']),
        ('civil-notice', ['TrafficCone', 'TrafficLight', 'LaneMkgsDriv',
                          'LaneMkgsNonDriv', 'SignSymbol', 'ParkingBlock',
                          'Misc_Text']),
        ('drive-way', ['Road', 'RoadShoulder']),
        ('walk-way', ['Sidewalk']),
    )
    for member_parent, members in members_of:
        for member in members:
            dset.name_to_cat[member]['supercategory'] = member_parent

    # Make sure every referenced parent is itself a category. Iterate over
    # a snapshot because categories may be added along the way.
    for cat in list(dset.cats.values()):
        parent = cat.get('supercategory', None)
        if parent is None or parent in dset.name_to_cat:
            continue
        print('Missing parent = {!r}'.format(parent))
        dset.add_category(name=parent, supercategory=parent)

    if 0:
        # Disabled: visualize the category graph
        graph = dset.category_graph()
        import graphid
        graphid.util.show_nx(graph)

    # Add in some hierarchy information
    if 0:
        for x in dset.name_to_cat:
            print(
                "dset.name_to_cat[{!r}]['supercategory'] = 'object'".format(x))

    if 0:
        # Disabled: interactively show one example annotation per category
        example_cat_aids = []
        for cat in dset.cats.values():
            cname = cat['name']
            aids = dset.index.cid_to_aids[dset.name_to_cat[cname]['id']]
            if len(aids):
                aid = ub.peek(aids)
                example_cat_aids.append(aid)
            else:
                print('No examples of cat = {!r}'.format(cat))

        import xdev
        import kwplot
        kwplot.autompl()
        for aid in xdev.InteractiveIter(example_cat_aids):
            print('aid = {!r}'.format(aid))
            ann = dset.anns[aid]
            cat = dset.cats[ann['category_id']]
            print('cat = {!r}'.format(cat))
            dset.show_image(aid=aid)
            xdev.InteractiveIter.draw()

    if 0:
        # Disabled: show a single example of one hand-picked category
        cname = 'CartLuggagePram'
        cname = 'ParkingBlock'
        cname = 'LaneMkgsDriv'
        aids = dset.index.cid_to_aids[dset.name_to_cat[cname]['id']]
        if len(aids):
            aid = ub.peek(aids)
            print('aid = {!r}'.format(aid))
            ann = dset.anns[aid]
            cat = dset.cats[ann['category_id']]
            print('cat = {!r}'.format(cat))
            dset.show_image(aid=aid)