def test_group_items_sorted():
    pairs = [
        ('ham', 'protein'),
        ('jam', 'fruit'),
        ('spam', 'protein'),
        ('eggs', 'protein'),
        ('cheese', 'dairy'),
        ('banana', 'fruit'),
    ]
    item_list, groupid_list = zip(*pairs)
    result1 = ub.group_items(item_list, groupid_list, sorted_=False)
    result2 = ub.group_items(item_list, groupid_list, sorted_=True)
    result1 = ub.map_vals(set, result1)
    result2 = ub.map_vals(set, result2)
    assert result1 == result2
def setup_coco_datasets():
    """
    TODO:
        - [ ] Read arbitrary coco datasets here
        - [ ] Do proper train / validation split
        - [ ] Allow custom train / validation split
    """
    from netharn.data.grab_camvid import grab_coco_camvid, grab_camvid_train_test_val_splits
    coco_dset = grab_coco_camvid()

    # Use the same train/test/vali splits used in segnet
    gid_subsets = grab_camvid_train_test_val_splits(coco_dset, mode='segnet')
    print(ub.map_vals(len, gid_subsets))

    # gid_subsets.pop('test')
    # all_gids = list(coco_dset.imgs.keys())
    # gid_subsets = {
    #     'train': all_gids[0:-100],
    #     'vali': all_gids[-100:],
    # }

    coco_datasets = {
        tag: coco_dset.subset(gids)
        for tag, gids in gid_subsets.items()
    }
    print('coco_datasets = {}'.format(ub.repr2(coco_datasets)))
    for tag, dset in coco_datasets.items():
        dset._build_hashid(hash_pixels=False)
    return coco_datasets
def fix_conference_places(bibman):
    pubman = constants_tex_fixes.PubManager()
    needed = set()
    for entry in bibman.cleaned.values():
        if entry['pub_type'] == 'conference':
            accro, year = (entry['pub_accro'], entry['year'])
            pub = pubman.find(accro)
            if pub.places is None or int(year) not in pub.places:
                needed.add((accro, year))
            else:
                place = pub.places[int(year)]
                print('place = {!r}'.format(place))
                entry['address'] = place
    if needed:
        needed = list(needed)
        used_years = ub.group_items(needed, ut.take_column(needed, 0))
        for k, v in list(used_years.items()):
            used_years[k] = sorted(v)
        sortby = ub.map_vals(lambda vs: (len(vs), max(e[1] for e in vs)),
                             used_years)
        used_years = ut.order_dict_by(used_years, ub.argsort(sortby))
        print('NEED CONFERENCE LOCATIONS')
        print(ub.repr2(used_years, nl=2))
def __init__(self, index_to_label, batch_size=1, num_batches='auto',
             quantile=0.5, shuffle=False, rng=None):
    import kwarray
    rng = kwarray.ensure_rng(rng, api='python')
    label_to_indices = kwarray.group_items(
        np.arange(len(index_to_label)), index_to_label)
    label_to_freq = ub.map_vals(len, label_to_indices)

    label_to_subsampler = {
        label: RingSampler(indices, shuffle=shuffle, rng=rng)
        for label, indices in label_to_indices.items()
    }

    self.label_to_freq = label_to_freq
    self.index_to_label = index_to_label
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.rng = rng
    self.label_to_indices = label_to_indices
    self.label_to_subsampler = label_to_subsampler

    if num_batches == 'auto':
        self.num_batches = self._auto_num_batches(quantile)
    else:
        self.num_batches = num_batches

    self.labels = list(self.label_to_indices.keys())
def test_group_items_callable():
    pairs = [
        ('ham', 'protein'),
        ('jam', 'fruit'),
        ('spam', 'protein'),
        ('eggs', 'protein'),
        ('cheese', 'dairy'),
        ('banana', 'fruit'),
    ]
    items, groupids = zip(*pairs)
    lut = dict(zip(items, groupids))
    result1 = ub.group_items(items, groupids)
    result2 = ub.group_items(items, lut.__getitem__)
    result1 = ub.map_vals(set, result1)
    result2 = ub.map_vals(set, result2)
    assert result1 == result2
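# A minimal usage sketch of the behavior exercised by the test above: ub.group_items
# accepts either a parallel sequence of group ids or a callable that maps an item to
# its group id. The helper name below is hypothetical and only for illustration.
def _demo_group_items_callable():
    import ubelt as ub
    items = ['ham', 'jam', 'spam']
    lut = {'ham': 'protein', 'jam': 'fruit', 'spam': 'protein'}
    by_sequence = ub.group_items(items, [lut[i] for i in items])
    by_callable = ub.group_items(items, lut.__getitem__)
    # Both produce the same grouping, e.g. {'protein': ['ham', 'spam'], 'fruit': ['jam']}
    assert ub.map_vals(set, by_sequence) == ub.map_vals(set, by_callable)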
def demo_mwe_issue_dict_parallel():
    config = DemoConfig()
    parser = config.argparse()
    config.update(parser.parse_known_args()[0].__dict__)

    self = DemoModel()
    inputs = {
        'rgb': torch.rand(2, 3, 3, 5),
        'aux': torch.rand(2, 1, 3, 5),
    }

    xpu = nh.XPU.coerce(config['xpu'])
    inputs = xpu.move(inputs)
    model = xpu.mount(self)

    print('xpu = {!r}'.format(xpu))
    print('model = {!r}'.format(model))
    print('inputs.Tshape = ' + ub.repr2(ub.map_vals(lambda x: x.shape, inputs), nl=1))
    outputs = model(inputs)
    print('outputs.Tshape = ' + ub.repr2(ub.map_vals(lambda x: x.shape, outputs), nl=1))
def test_group_items_sorted_mixed_types():
    import random
    groupid_list = [
        1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
        '1', '2', '3', '1', '2', '3', '1', '2', '3', '1', '2', '3',
    ]
    item_list = list(range(len(groupid_list)))

    # Randomize the order
    random.Random(947043).shuffle(groupid_list)
    random.Random(947043).shuffle(item_list)

    result1 = ub.group_items(item_list, groupid_list, sorted_=True)
    result2 = ub.group_items(item_list, groupid_list, sorted_=False)
    result1 = ub.map_vals(set, result1)
    result2 = ub.map_vals(set, result2)
    assert result1 == result2
    assert '1' in result1
    assert 1 in result1
def bench_closures():
    """
    Is it faster to use a closure or pass in the variables explicitly?
    """
    import ubelt as ub
    import timerit
    import numpy as np

    # Test a nested func with vs without a closure
    def rand_complex(*shape):
        real = np.random.rand(*shape).astype(complex)
        imag = np.random.rand(*shape).astype(complex) * 1j
        mat = real + imag
        return mat

    s = int(ub.argval('--s', default='1'))
    mat1 = rand_complex(s, s)
    mat2 = rand_complex(s, s)
    N = 1000
    offset = 100

    def nested_closure():
        mat3 = mat1 @ mat2
        for i in range(N):
            mat3 += i + offset

    def nested_explicit(mat1, mat2, N, offset):
        mat3 = mat1 @ mat2
        for i in range(N):
            mat3 += i + offset

    ti = timerit.Timerit(int(2 ** 11), bestof=int(2 ** 8),
                         verbose=int(ub.argval('--verbose', default='1')))

    for timer in ti.reset('nested_explicit'):
        with timer:
            nested_explicit(mat1, mat2, N, offset)

    for timer in ti.reset('nested_closure'):
        with timer:
            nested_closure()

    print('rankings = {}'.format(ub.repr2(ti.rankings, precision=9, nl=2)))
    print('consistency = {}'.format(ub.repr2(ti.consistency, precision=9, nl=2)))

    positions = ub.ddict(list)
    for m1, v1 in ti.rankings.items():
        for pos, label in enumerate(ub.argsort(v1), start=0):
            positions[label].append(pos)
    average_position = ub.map_vals(lambda x: sum(x) / len(x), positions)
    print('average_position = {}'.format(ub.repr2(average_position)))
def __init__(verif, infr):
    verif.rng = np.random.RandomState(4033913)
    verif.dummy_params = {
        NEGTV: {'mean': .2, 'std': .25},
        POSTV: {'mean': .85, 'std': .2},
        INCMP: {'mean': .15, 'std': .1},
    }
    verif.infr = infr
    verif.orig_nodes = set(infr.aids)
    verif.orig_labels = infr.get_node_attrs('orig_name_label')
    verif.orig_groups = ub.invert_dict(verif.orig_labels, False)
    verif.orig_groups = ub.map_vals(set, verif.orig_groups)
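# A minimal sketch (separate from the class above) of the label-inversion idiom used in
# __init__: ub.invert_dict with unique_vals=False (passed positionally, as above) groups
# keys that share a value, and ub.map_vals(set, ...) normalizes each group to a set.
def _demo_invert_labels():
    import ubelt as ub
    node_to_label = {1: 'a', 2: 'a', 3: 'b'}
    label_to_nodes = ub.invert_dict(node_to_label, False)
    label_to_nodes = ub.map_vals(set, label_to_nodes)
    # label_to_nodes is now {'a': {1, 2}, 'b': {3}}
    return label_to_nodes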
def color_nodes(graph, labelattr='label', brightness=.878,
                outof=None, sat_adjust=None):
    """ Colors edges and nodes by nid """
    node_to_lbl = nx.get_node_attributes(graph, labelattr)
    unique_lbls = sorted(set(node_to_lbl.values()))
    ncolors = len(unique_lbls)
    if outof is None:
        if (ncolors) == 1:
            unique_colors = [util.Color('lightblue').as01()]
        elif (ncolors) == 2:
            # https://matplotlib.org/examples/color/named_colors.html
            unique_colors = ['royalblue', 'orange']
            unique_colors = [util.Color(c).as01('bgr') for c in unique_colors]
        else:
            unique_colors = util.distinct_colors(ncolors, brightness=brightness)
    else:
        unique_colors = util.distinct_colors(outof, brightness=brightness)

    if sat_adjust:
        unique_colors = [
            util.Color(c).adjust_hsv(0.0, sat_adjust, 0.0)
            for c in unique_colors
        ]

    # Find edges and aids strictly between two nids
    if outof is None:
        lbl_to_color = ub.dzip(unique_lbls, unique_colors)
    else:
        gray = util.Color('lightgray').as01('bgr')
        unique_colors = [gray] + unique_colors
        offset = max(1, min(unique_lbls)) - 1
        node_to_lbl = ub.map_vals(lambda nid: max(0, nid - offset), node_to_lbl)
        lbl_to_color = ub.dzip(range(outof + 1), unique_colors)

    node_to_color = ub.map_vals(lbl_to_color, node_to_lbl)
    nx.set_node_attributes(graph, name='color', values=node_to_color)
    nx_ensure_agraph_color(graph)
def rank_inventory(inventory):
    candidates = list(ub.flatten(list(pkmn.family(ancestors=False, node=True))
                                 for pkmn in inventory))

    groups = ub.group_items(candidates, key=lambda p: p.name)

    leages = {
        'master': {'max_cp': float('inf')},
        'ultra': {'max_cp': 2500},
        'great': {'max_cp': 1500},
        'little': {'max_cp': 500},
    }

    max_level = 45  # for XL candy
    # max_level = 40  # normal

    all_dfs = []

    for name, group in groups.items():
        print('\n\n------------\n\n')
        print('name = {!r}'.format(name))
        for leage_name, leage_filters in leages.items():
            max_cp = leage_filters['max_cp']
            print('')
            print(' ========== ')
            print(' --- {} in {} --- '.format(name, leage_name))
            not_eligible = [p for p in group if p.cp is not None and p.cp > max_cp]
            eligible = [p for p in group if p.cp is None or p.cp <= max_cp]
            print('not_eligible = {!r}'.format(not_eligible))
            if len(eligible) > 0:
                first = ub.peek(eligible)
                have_ivs = eligible
                df = first.leage_rankings_for(have_ivs, max_cp=max_cp,
                                              max_level=max_level)
                all_dfs.append(df)
            else:
                print('none eligible')

    # Print out the best ranks for each set of IVs over all possible forms
    # (lets you know which ones can be transferred safely)
    iv_to_rank = ub.ddict(list)
    for df in all_dfs:
        if df is not None:
            df = df.set_index(['iva', 'ivd', 'ivs'])
            for iv, rank in zip(df.index, df['rank']):
                iv_to_rank[iv].append(rank)
    iv_to_best_rank = ub.map_vals(sorted, iv_to_rank)
    iv_to_best_rank = ub.sorted_vals(iv_to_best_rank)
    print('iv_to_best_rank = {}'.format(ub.repr2(iv_to_best_rank, nl=1, align=':')))
def images_with_keypoints():
    keypoint_gids = set()
    for aid, ann in merged.anns.items():
        if ann['roi_shape'] == 'keypoints':
            keypoint_gids.add(ann['image_id'])

    relevant = ub.dict_subset(merged.gid_to_aids, keypoint_gids)
    relevant = {
        gid: [a for a in aids if merged.anns[a]['roi_shape'] == 'keypoints']
        for gid, aids in relevant.items()
    }
    gid_list = ub.argsort(ub.map_vals(len, relevant))[::-1]
    return gid_list
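# A minimal sketch of the sorting idiom used above, assuming ubelt's argsort accepts a
# dict and returns its keys ordered by value: mapping len over the dict and reversing
# the argsort yields image ids ordered from most to fewest keypoint annotations.
def _demo_sort_gids_by_count():
    import ubelt as ub
    gid_to_aids = {10: [1], 11: [2, 3, 4], 12: [5, 6]}
    gids_most_first = ub.argsort(ub.map_vals(len, gid_to_aids))[::-1]
    # gids_most_first == [11, 12, 10]
    return gids_most_first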
def main():
    import ubelt as ub
    from ubelt import util_list
    from ubelt.util_list import take
    import random
    from math import e

    # # Data
    N = 100
    array = [random.random() for _ in range(N)]
    indices = [random.randint(0, N - 1) for _ in range(int(N // e))]

    ti = ub.Timerit(2 ** 11, bestof=2 ** 8, verbose=1)

    for timer in ti.reset('take'):
        with timer:
            list(take(array, indices))

    for timer in ti.reset('util_list.take'):
        with timer:
            list(util_list.take(array, indices))

    for timer in ti.reset('ub.take'):
        with timer:
            list(ub.take(array, indices))

    print('---')
    # import pandas as pd
    # df = pd.DataFrame(rankings)
    # print('df =\n{}'.format(df))

    print('rankings = {}'.format(ub.repr2(ti.rankings, precision=9, nl=2)))
    print('consistency = {}'.format(ub.repr2(ti.consistency, precision=9, nl=2)))

    positions = ub.ddict(list)
    for m1, v1 in ti.rankings.items():
        for pos, label in enumerate(ub.argsort(v1), start=0):
            positions[label].append(pos)
    average_position = ub.map_vals(lambda x: sum(x) / len(x), positions)
    print('average_position = {}'.format(ub.repr2(average_position)))
def category_annotation_frequency(self):
    """
    Reports the number of annotations of each category

    Example:
        >>> dataset = demo_coco_data()
        >>> self = CocoDataset(dataset, tag='demo')
        >>> hist = self.category_annotation_frequency()
        >>> print(ub.repr2(hist))
        {
            'astroturf': 0,
            'astronaut': 1,
            'astronomer': 1,
            'helmet': 1,
            'rocket': 1,
            'mouth': 2,
            'star': 5,
        }
    """
    catname_to_nannots = ub.map_keys(
        lambda x: self.cats[x]['name'], ub.map_vals(len, self.cid_to_aids))
    catname_to_nannots = ub.odict(sorted(catname_to_nannots.items(),
                                         key=lambda kv: (kv[1], kv[0])))
    return catname_to_nannots
def postprocess(self, output, inp_size, orig_sizes, conf_thresh=0.24, nms_thresh=0.5, max_per_image=300): """ Postprocess the raw network output into usable bounding boxes Args: aoff_pred (ndarray): [B, HxW, A, 4] anchor offsets in the format (sig(x), sig(y), exp(w), exp(h)) note: in the aoff format x and y are centers of the box and wh represenets multiples of the anchor w/h iou_pred (ndarray): [B, HxW, A, 1] predicted iou (is this the objectness score?) prob_pred (ndarray): [B, HxW, A, C] predicted class probability inp_size (tuple): size (W, H) of input to network orig_sizes (list): [B, 2] size (W, H) of each in image before rescale conf_thresh (float): threshold for filtering bboxes. Keep only the detections above this confidence value. nms_thresh (float): nonmax supression iou threshold Notes: Let B = batch_size Let A = num_anchors Let C = num_classes Let (H, W) = shape of the output grid Original params for nms_thresh (iou_thresh) and conf_thresh (thresh) are here: https://github.com/pjreddie/darknet/blob/master/examples/yolo.c#L213 On parameter settings: Remove the bounding boxes which have no object. Remove the bounding boxes that predict a confidence score less than a threshold of 0.24 https://towardsdatascience.com/training-object-detection-yolov2-from-scratch-using-cyclic-learning-rates-b3364f7e4755 Network Visualization: http://ethereon.github.io/netscope/#/gist/d08a41711e48cf111e330827b1279c31 CommandLine: python -m clab.models.yolo2.darknet Darknet19.postprocess --show Example: >>> from clab.models.yolo2.darknet import * >>> inp_size = (288, 288) >>> self = Darknet19(num_classes=20) >>> state_dict = torch.load(demo_weights())['model_state_dict'] >>> self.load_state_dict(state_dict) >>> im_data, rgb255 = demo_image(inp_size) >>> im_data = torch.cat([im_data, im_data]) # make a batch size of 2 >>> output = self(im_data) >>> # Define remaining params >>> orig_sizes = torch.LongTensor([rgb255.shape[0:2][::-1]] * len(im_data)) >>> conf_thresh = 0.01 >>> nms_thresh = 0.5 >>> postout = self.postprocess(output, inp_size, orig_sizes, conf_thresh, nms_thresh) >>> out_boxes, out_scores, out_cxs = postout >>> # xdoc: +REQUIRES(--show) >>> from clab.util import mplutil >>> mplutil.qtensure() # xdoc: +SKIP >>> label_names = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', >>> 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', >>> 'dog', 'horse', 'motorbike', 'person', >>> 'pottedplant', 'sheep', 'sofa', 'train', >>> 'tvmonitor') >>> import pandas as pd >>> cls_names = list(ub.take(label_names, out_cxs[0])) >>> print(pd.DataFrame({'name': cls_names, 'score': out_scores[0]})) >>> mplutil.figure(fnum=1, doclf=True) >>> mplutil.imshow(rgb255, colorspace='rgb') >>> mplutil.draw_boxes(out_boxes[0]) >>> mplutil.show_if_requested() """ aoff_pred_, iou_pred_, prob_pred_ = output # convert to numpy aoff_pred = aoff_pred_.data.cpu().numpy() iou_pred = iou_pred_.data.cpu().numpy() prob_pred = prob_pred_.data.cpu().numpy() orig_sizes = orig_sizes.data.cpu().numpy() # num_classes, num_anchors = cfg.num_classes, cfg.num_anchors num_classes = self.num_classes anchors = self.anchors out_size = np.array(inp_size) // 32 # hacked we know the factor is 32 W, H = out_size out_boxes = [] out_scores = [] out_cxs = [] # For each image in the batch, postprocess the predicted boxes for bx in range(aoff_pred.shape[0]): aoffs = aoff_pred[bx][None, :] ious = iou_pred[bx] probs = prob_pred[bx] orig_w, orig_h = orig_sizes[bx] # Convert anchored predictions to absolute tlbr bounding boxes in # normalized space aoffs = 
np.ascontiguousarray(aoffs, dtype=np.float) norm_boxes = yolo_utils.yolo_to_bbox(aoffs, anchors, H, W)[0] # Scale the bounding boxes to the size of the original image. # and convert to integer representation. boxes = norm_boxes.copy() boxes[..., 0::2] *= float(orig_w) boxes[..., 1::2] *= float(orig_h) boxes = boxes.astype(np.int) # converts [1, W * H, A, 4] -> [W * H * A, 4] boxes = np.reshape(boxes, [-1, 4]) ious = np.reshape(ious, [-1]) probs = np.reshape(probs, [-1, num_classes]) # Predict the class with maximum probability cls_inds = np.argmax(probs, axis=1) cls_probs = probs[(np.arange(probs.shape[0]), cls_inds)] """ Reference: arXiv:1506.02640 [cs.CV] (Yolo 1): Formally we define confidence as $Pr(Object) ∗ IOU^truth_pred$. If no object exists in that cell, the confidence scores should be zero. Otherwise we want the confidence score to equal the intersection over union (IOU) between the predicted box and the ground truth """ # Compute the final probabilities for the predicted class scores = ious * cls_probs # filter boxes based on confidence threshold keep_conf = np.where(scores >= conf_thresh) boxes = boxes[keep_conf] scores = scores[keep_conf] cls_inds = cls_inds[keep_conf] # nonmax supression (per-class) keep_flags = np.zeros(len(boxes), dtype=np.uint8) cx_to_inds = ub.group_items(range(len(cls_inds)), cls_inds) cx_to_inds = ub.map_vals(np.array, cx_to_inds) for cx, inds in cx_to_inds.items(): # get predictions for each class c_bboxes = boxes[inds] c_scores = scores[inds] c_keep = yolo_utils.nms_detections(c_bboxes, c_scores, nms_thresh) keep_flags[inds[c_keep]] = 1 keep_nms = np.where(keep_flags > 0) boxes = boxes[keep_nms] scores = scores[keep_nms] cls_inds = cls_inds[keep_nms] # clip boxes = yolo_utils.clip_boxes(boxes, im_shape=(orig_h, orig_w)) # sort boxes by descending score sortx = scores.argsort()[::-1] boxes = boxes[sortx] scores = scores[sortx] cls_inds = cls_inds[sortx] if max_per_image > 0 and len(boxes) > max_per_image: boxes = boxes[:max_per_image] scores = scores[:max_per_image] cls_inds = cls_inds[:max_per_image] out_boxes.append(boxes) out_scores.append(scores) out_cxs.append(cls_inds) postout = (out_boxes, out_scores, out_cxs) return postout
def 字典_根据值重建(func, dict_):
    # "Rebuild a dict from its values": apply ``func`` to every value while
    # keeping the keys unchanged (a thin wrapper around ub.map_vals).
    data = ub.map_vals(func, dict_)
    return data
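# A minimal usage sketch of the helper above: it rebuilds a dict by applying a function
# to every value, which is exactly what ub.map_vals does.
def _demo_字典_根据值重建():
    import ubelt as ub
    lengths = 字典_根据值重建(len, {'a': [1, 2], 'b': [3]})
    # lengths == {'a': 2, 'b': 1}
    assert lengths == ub.map_vals(len, {'a': [1, 2], 'b': [3]})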
def benchmark_hash_data(): """ CommandLine: python ~/code/ubelt/dev/bench_hash.py --convert=True --show python ~/code/ubelt/dev/bench_hash.py --convert=False --show """ import ubelt as ub #ITEM = 'JUST A STRING' * 100 ITEM = [0, 1, 'a', 'b', ['JUST A STRING'] * 4] HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3'] scales = list(range(5, 13)) results = ub.AutoDict() # Use json is faster or at least as fast it most cases # xxhash is also significantly faster than sha512 convert = ub.argval('--convert', default='True').lower() == 'True' print('convert = {!r}'.format(convert)) ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms') for s in ub.ProgIter(scales, desc='benchmark', verbose=3): N = 2**s print(' --- s={s}, N={N} --- '.format(s=s, N=N)) data = [ITEM] * N for hasher in HASHERS: for timer in ti.reset(hasher): ub.hash_data(data, hasher=hasher, convert=convert) results[hasher].update({N: ti.mean()}) col = {h: results[h][N] for h in HASHERS} sortx = ub.argsort(col) ranking = ub.dict_subset(col, sortx) print('walltime: ' + ub.repr2(ranking, precision=9, nl=0)) best = next(iter(ranking)) #pairs = list(ub.iter_window( 2)) pairs = [(k, best) for k in ranking] ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs] nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs] relratios = ub.odict(zip(nicekeys, ratios)) print('speedup: ' + ub.repr2(relratios, precision=4, nl=0)) # xdoc +REQUIRES(--show) # import pytest # pytest.skip() import pandas as pd df = pd.DataFrame.from_dict(results) df.columns.name = 'hasher' df.index.name = 'N' ratios = df.copy().drop(columns=df.columns) for k1, k2 in [('sha512', 'xxh32'), ('sha1', 'xxh32'), ('xxh64', 'xxh32')]: ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2] print() print('Seconds per iteration') print(df.to_string(float_format='%.9f')) print() print('Ratios of seconds') print(ratios.to_string(float_format='%.2f')) print() print('Average Ratio (over all N)') print('convert = {!r}'.format(convert)) print(ratios.mean().sort_values()) if ub.argflag('--show'): import kwplot kwplot.autompl() xdata = sorted(ub.peek(results.values()).keys()) ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results) kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds', title='convert = {}'.format(convert)) kwplot.show_if_requested()
def naive_password_strategy(required_len=14, required_caps=1, required_special=1, required_digits=1): """ Simulate a "bad" password that meets typical password requirements Get a naive version of the N char min special char password One common strategy for getting a 14 char pass is using 2 words or a word and a date with misspellings, shuffled case, and a special char, which is probably _, -, ., !, or @ Example: scheme = naive_password_strategy() print(f'scheme={scheme}') """ # When people are forced to include a special character, this is the # liklihood they choose one of the following: # https://www.reddit.com/r/dataisbeautiful/comments/2vfgvh/most_frequentlyused_special_characters_in_10/ special_char_freq = { '_': 0.332, '.': 0.304, '-': 0.086, '!': 0.065, '@': 0.052, '*': 0.032, '$': 0.019, '&': 0.009, '%': 0.007, } _total = sum(special_char_freq.values()) special_char_prob = ub.map_vals(lambda x: x / _total, special_char_freq) # Only seach the most likely special chars naive_special_chars = { k: v for k, v in special_char_prob.items() if v > 0.05 } if 0: import diceware wlpath = diceware.wordlist.get_wordlist_path('en') wlpath = diceware.wordlist.get_wordlist_path('en_securedrop') wordlist = list(diceware.wordlist.WordList(wlpath)) word_lengths = sorted(map(len, wordlist)) word_length_hist = ub.dict_hist(word_lengths) else: # Number of common password words with a specific length word_length_hist = { 1: 10, 2: 90, 3: 582, 4: 2279, 5: 3350, 6: 1313, 7: 539, 8: 22, 9: 5, 10: 2 } # Also needs a number and special char required_word_len = required_len - 2 # How many permutations of N words are there that get over the char limit? total_passwords = 0 import itertools as it import functools import operator as op possible_num_word = [1, 2, 3] for num_words in possible_num_word: for ts in it.product(*[word_length_hist.items()] * num_words): ks = [k for k, v in ts] vs = [v for k, v in ts] # If the lengths are above, we can take any of these permutations # (with replacement) if sum(ks) > required_word_len: # Compute the number of phrases, then augment this with the # special properties. num_phrases = functools.reduce(op.mul, vs) # People might insert a special character at the start, middle, # or end, or predictably replace a letter. predictability_factor = 2 num_special_locs = (num_words + 1) * predictability_factor special_factor = required_special * len( naive_special_chars) * num_special_locs # People might insert a digit at start, middle, or end, or maybe # inside of a word replacing a common letter. num_digit_locs = num_words + 1 num_digits = 10 + 100 # usually a 1 or 2 digit number digit_factor = required_digits * num_digits * num_digit_locs # People might only shuffle the case of 1 or 2 letters. # usually at the beginning of words caps_factor = required_caps * num_words total = (num_phrases * (1 + special_factor) * (1 + caps_factor) * (1 + digit_factor)) total_passwords += total name_parts = ['naive', str(required_len)] if required_caps: name_parts.append('caps') if required_digits: name_parts.append('digit') if required_special: name_parts.append('special') name = '-'.join(name_parts) scheme = { 'name': name, 'num': 1, 'base': total_passwords, } return scheme
def load_partial_state(model, model_state_dict, leftover=None, ignore_unset=False, verbose=2, mangle=True, association=None, initializer=None): """ CommandLine: python -m netharn.initializers.nninit_base load_partial_state Args: model (torch.nn.Module): module to initialize model_state_dict (dict): state dict we wish to transfer leftover (callable): fallback method for initializing incompatible areas, if none then those areas are left as-is. association (str): controls how we search for the association between the two model states. Can be strict, module-hack, prefix-hack, or embedding. Default is: prefix-hack. mangle (bool, default=True): If True, mangles tensors that have the same key, but different shapes forcing them to fit. This might destroy information when forcing a a larger tensor into a smaller tensor, or leave extra uninitialized room when a small tensor is placed in a larger one. Note be careful when mangling a classification layer if class indexes are not aligned. verbose (int): verbosity level Returns: Dict: info - summary of actions taken TODO: - [ ] Allow user to specify how incompatible layers are handled. Notes: Have you ever had the scenario where Has anyone ever had a problem where you had a torch model with a state dict with keys that looked like: `mymodel.detector.layer1.conv.weight`, but you had a pretrained weight file with keys that looked like: `module.layer1.conv.weight`? The latest version of `netharn.initializers.functional.load_patial_state` can handle this by solving a maximum-common-subtree-isomorphism problem. This computes the largest possible mapping between the two state dictionaries that share consistent suffixes. >>> # This means you can load an off-the-shelf unmodified pretrained resnet50 >>> # where the keys might look something like this: >>> resnet_keys = { >>> 'conv1.weight', >>> 'layer1.0.conv1.weight', >>> 'layer1.0.conv2.weight', >>> 'layer1.0.conv3.weight', >>> 'layer1.0.downsample.0.weight', >>> 'layer2.0.conv1.weight', >>> 'layer2.0.conv2.weight', >>> 'layer2.0.conv3.weight', >>> 'layer3.0.conv1.weight', >>> 'layer4.0.conv1.weight', >>> 'fc.weight', >>> 'fc.bias', >>> } >>> # >>> # And perhaps you have a model that has a state dict where keys >>> # look like this: >>> model_keys = { >>> 'preproc.conv1.weight' >>> 'backbone.layer1.0.conv1.weight', >>> 'backbone.layer1.0.conv2.weight', >>> 'backbone.layer1.0.conv3.weight', >>> 'backbone.layer1.0.downsample.0.weight', >>> 'backbone.layer2.0.conv1.weight', >>> 'backbone.layer2.0.conv2.weight', >>> 'backbone.layer2.0.conv3.weight', >>> 'backbone.layer3.0.conv1.weight', >>> 'backbone.layer4.0.conv1.weight', >>> 'head.conv1' >>> 'head.conv2' >>> 'head.fc.weight' >>> 'head.fc.bias' >>> } >>> # >>> # We can compute a partial mapping between them >>> subpaths1, subpaths2 = maximum_common_ordered_subpaths(resnet_keys, model_keys) >>> print(ub.repr2(ub.dzip(subpaths1, subpaths2))) { 'layer1.0.conv2.weight': 'backbone.layer1.0.conv2.weight', 'layer1.0.conv3.weight': 'backbone.layer1.0.conv3.weight', 'layer1.0.downsample.0.weight': 'backbone.layer1.0.downsample.0.weight', 'layer2.0.conv1.weight': 'backbone.layer2.0.conv1.weight', 'layer2.0.conv2.weight': 'backbone.layer2.0.conv2.weight', 'layer2.0.conv3.weight': 'backbone.layer2.0.conv3.weight', 'layer3.0.conv1.weight': 'backbone.layer3.0.conv1.weight', 'layer4.0.conv1.weight': 'backbone.layer4.0.conv1.weight', } Also, if the sizes of the tensor don't quite fit, they will be mangled, i.e. "shoved-in" as best as possible. 
Example: >>> import netharn as nh >>> # --- >>> model_other = nh.models.ToyNet2d(input_channels=1, num_classes=10) >>> model_other.hack_param1 = torch.nn.Parameter(torch.rand(1)) >>> model_other.hack_param3 = torch.nn.Parameter(torch.rand(3)) >>> model_other.hack_param5 = torch.nn.Parameter(torch.rand(3)) >>> # --- >>> model_self = nh.models.ToyNet2d(input_channels=3, num_classes=2) >>> model_self.hack_param1 = torch.nn.Parameter(torch.rand(3)) >>> model_self.hack_param2 = torch.nn.Parameter(torch.rand(3)) >>> model_self.hack_param4 = torch.nn.Parameter(torch.rand(3)) >>> # --- >>> model_state_dict = model_other.state_dict() >>> load_partial_state(model_self, model_state_dict) >>> load_partial_state(model_self, model_state_dict, leftover=torch.nn.init.kaiming_normal_) >>> _ = load_partial_state(model_self, model_state_dict, leftover=torch.nn.init.kaiming_normal_, association='embedding') Example: >>> from netharn.initializers.functional import * # NOQA >>> import netharn as nh >>> xpu = nh.XPU(None) >>> self1 = nh.models.ToyNet2d() >>> self2 = xpu.mount(self1) >>> load_partial_state(self2, self1.state_dict()) >>> load_partial_state(self1, self2.state_dict()) >>> # Add extra nonsense to state-dict >>> extra_state_dict = {'extra.' + k: v for k, v in self1.state_dict().items()} >>> extra_state_dict['stats'] = ub.peek(extra_state_dict.values()).clone() >>> model = self2 >>> model_state_dict = extra_state_dict >>> load_partial_state(self2, extra_state_dict, association='embedding') Example: >>> # xdoctest: +REQUIRES(--slow) >>> from netharn.initializers.functional import * # NOQA >>> import torchvision >>> import torch >>> resnet50 = torchvision.models.resnet50() >>> class CustomModel(torch.nn.Module): >>> def __init__(self): >>> super().__init__() >>> self.module = resnet50 >>> self.extra = torch.nn.Linear(1, 1) >>> model = CustomModel() >>> model_state_dict = resnet50.state_dict() >>> model_state_dict2 = {'prefix.' 
+ k: v for k, v in model_state_dict.items()} >>> import ubelt as ub >>> with ub.Timer(verbose=2, label='strict'): >>> load_partial_state(model, model_state_dict, association='strict', verbose=0) >>> with ub.Timer(verbose=2, label='prefix-hack'): >>> load_partial_state(model, model_state_dict, association='prefix-hack', verbose=0) >>> with ub.Timer(verbose=2, label='module-hack'): >>> load_partial_state(model, model_state_dict, association='module-hack', verbose=0) >>> with ub.Timer(verbose=2, label='embedding'): >>> load_partial_state(model, model_state_dict, association='embedding', verbose=0) >>> load_partial_state(model, model_state_dict, association='prefix-hack', verbose=1) >>> load_partial_state(model, model_state_dict, association='module-hack', verbose=1) Ignore: >>> from bioharn.models.new_models_v1 import * # NOQA >>> channels = ChannelSpec.coerce('rgb') >>> input_stats = None >>> self = MM_HRNetV2_w18_MaskRCNN(classes=3, channels=channels) >>> filename = self.pretrained_url >>> self._init_backbone_from_pretrained(self.pretrained_url) >>> from bioharn.models.mm_models import _load_mmcv_weights >>> model_state = _load_mmcv_weights(filename) >>> self.detector.backbone.chan_backbones.rgb >>> model = self >>> model_state_dict = model_state from netharn.initializers.functional import * # NOQA import xdev globals().update(**xdev.get_func_kwargs(load_partial_state)) CommandLine: xdoctest -m /home/joncrall/code/netharn/netharn/initializers/functional.py load_partial_state:2 --slow """ if association is None: association = 'module-hack' # old default # association = 'prefix-hack' # new default if initializer is not None: warnings.warn('initializer is deprecated use leftover') leftover = initializer self_state = model.state_dict() def _fix_keys(model_state_dict): """ Hack around DataParallel wrapper. If there is nothing in common between the two models check to see if prepending 'module.' to other keys fixes it. """ other_keys = set(model_state_dict) self_keys = set(self_state) if 0: # Automatic way to reduce nodes in the trees? # If node b always follows node a, can we contract it? nodes1 = [n for p in other_keys for n in p.split('.')] nodes2 = [n for p in self_keys for n in p.split('.')] tups1 = list(tup for key in other_keys for tup in ub.iter_window(key.split('.'), 2)) tups2 = list(tup for key in self_keys for tup in ub.iter_window(key.split('.'), 2)) x = ub.ddict(list) for a, b in tups1: x[a].append(b) for a, b in tups2: x[a].append(b) nodehist = ub.dict_hist(nodes1 + nodes2) for k, v in x.items(): print('----') print(k) print(nodehist[k]) follow_hist = ub.dict_hist(v) print(follow_hist) total = sum(follow_hist.values()) if ub.allsame(follow_hist.values()) and total == nodehist[k]: print('CONTRACT') # pair_freq = ub.dict_hist(ub.flatten([tups1, tups2])) # print(forest_str(paths_to_otree(other_keys, '.'))) # common_keys = other_keys.intersection(self_keys) # if not common_keys: if not other_keys.issubset(self_keys): if association == 'strict': pass elif association == 'module-hack': # If there are no common keys try a hack prefix = 'module.' 
def smap(f, ss): return set(map(f, ss)) def fix1(k): return prefix + k def fix2(k): if k.startswith(prefix): return k[len(prefix):] if smap(fix1, other_keys).intersection(self_keys): model_state_dict = ub.map_keys(fix1, model_state_dict) elif smap(fix2, other_keys).intersection(self_keys): model_state_dict = ub.map_keys(fix2, model_state_dict) elif association == 'prefix-hack': import functools def add_prefix(k, prefix): return prefix + k def remove_prefix(k, prefix): if k.startswith(prefix): return k[len(prefix):] # set1 = other_keys # target_set2 = self_keys found = _best_prefix_transform(other_keys, self_keys) if found is not None: for action, prefix in found['transform']: if action == 'add': func = functools.partial(add_prefix, prefix=prefix) elif action == 'remove': func = functools.partial(remove_prefix, prefix=prefix) else: raise AssertionError model_state_dict = ub.map_keys(func, model_state_dict) elif association in {'embedding', 'isomorphism'}: if verbose > 1: print('Using subpath {} association, may take some time'. format(association)) # I believe this is the correct way to solve the problem paths1 = sorted(other_keys) paths2 = sorted(self_state) if 1: # hack to filter to reduce tree size in embedding problem def shrink_paths(paths): new_paths = [] for p in paths: p = p.replace('.0', ':0') p = p.replace('.1', ':1') p = p.replace('.2', ':2') p = p.replace('.3', ':3') p = p.replace('.4', ':4') p = p.replace('.5', ':5') p = p.replace('.6', ':6') p = p.replace('.7', ':7') p = p.replace('.8', ':8') p = p.replace('.9', ':9') p = p.replace('.weight', ':weight') p = p.replace('.bias', ':bias') p = p.replace('.num_batches_tracked', ':num_batches_tracked') p = p.replace('.running_mean', ':running_mean') p = p.replace('.running_var', ':running_var') # p = p.replace('.conv1', ':conv1') # p = p.replace('.conv2', ':conv2') # p = p.replace('.conv3', ':conv3') # p = p.replace('.bn1', ':bn1') # p = p.replace('.bn2', ':bn2') # p = p.replace('.bn3', ':bn3') new_paths.append(p) return new_paths # Reducing the depth saves a lot of time paths1_ = shrink_paths(paths1) paths2_ = shrink_paths(paths2) subpaths1, subpaths2 = maximum_common_ordered_subpaths( paths1_, paths2_, sep='.', mode=association) subpaths1 = [p.replace(':', '.') for p in subpaths1] subpaths2 = [p.replace(':', '.') for p in subpaths2] mapping = ub.dzip(subpaths1, subpaths2) if verbose > 1: other_unmapped = sorted(other_keys - set(mapping.keys())) self_unmapped = sorted(self_keys - set(mapping.values())) print('-- embed association (other -> self) --') print('mapping = {}'.format(ub.repr2(mapping, nl=1))) print('self_unmapped = {}'.format( ub.repr2(self_unmapped, nl=1))) print('other_unmapped = {}'.format( ub.repr2(other_unmapped, nl=1))) print('len(mapping) = {}'.format( ub.repr2(len(mapping), nl=1))) print('len(self_unmapped) = {}'.format( ub.repr2(len(self_unmapped), nl=1))) print('len(other_unmapped) = {}'.format( ub.repr2(len(other_unmapped), nl=1))) print('-- end embed association --') # HACK: something might be wrong, there was an instance with # HRNet_w32 where multiple keys mapped to the same key # bad keys were incre_modules.3.0.conv1.weight and conv1.weight # # This will not error, but may produce bad output try: model_state_dict = ub.map_keys(lambda k: mapping.get(k, k), model_state_dict) except Exception as ex: HACK = 1 if HACK: new_state_dict_ = {} for k, v in model_state_dict.items(): new_state_dict_[mapping.get(k, k)] = v model_state_dict = new_state_dict_ warnings.warn('ex = {!r}'.format(ex)) else: raise else: raise 
KeyError(association) return model_state_dict other_state = _fix_keys(model_state_dict) self_unset_keys = set( self_state.keys()) # will end up as keys in our that were not set other_unused_keys = set(other_state.keys( )) # will end up as keys in the other model that were not used seen_keys = ub.ddict(set) for key, other_value in other_state.items(): if key not in self_state: if verbose > 0: print('Skipping {} because it does not exist'.format(key)) seen_keys['skipped'].add(key) else: self_value = self_state[key] if other_value.size() == self_value.size(): self_state[key] = other_value self_unset_keys.remove(key) other_unused_keys.remove(key) seen_keys['full_add'].add(key) elif len(other_value.size()) == len(self_value.size()): if not mangle: if verbose > 0: print( 'Skipping {} due to incompatable size and mangle=False' .format(key)) print(' * self = {!r}'.format(self_value.size())) print(' * other = {!r}'.format(other_value.size())) seen_keys['skipped'].add(key) elif key.endswith('bias'): if verbose > 0: print( 'Skipping {} due to incompatable size'.format(key)) print(' * self = {!r}'.format(self_value.size())) print(' * other = {!r}'.format(other_value.size())) seen_keys['skipped'].add(key) else: if leftover is None: if verbose > 0: print( 'Skipping {} due to incompatable size and no default initializer' .format(key)) print(' * self = {!r}'.format(self_value.size())) print(' * other = {!r}'.format(other_value.size())) seen_keys['skipped'].add(key) else: if verbose > 0: print('Partially add {} with incompatable size'. format(key)) print(' * self = {!r}'.format(self_value.size())) print(' * other = {!r}'.format(other_value.size())) # Initialize all weights in case any are unspecified if leftover is None: try: leftover(self_state[key]) except Exception: if verbose > 0: print('Unable to init {} with {}'.format( key, leftover)) # Transfer as much as possible min_size = np.minimum(self_state[key].shape, other_value.shape) sl = tuple([slice(0, s) for s in min_size]) self_state[key][sl] = other_value[sl] # if shock_partial: # # Shock weights because we are doing something weird # # might help the network recover in case this is # # not a good idea # shock(self_state[key], func=leftover) self_unset_keys.remove(key) other_unused_keys.remove(key) if self_state[key].numel() < other_value.numel(): seen_keys['partial_add_some'].add(key) else: seen_keys['partial_add_all'].add(key) else: if verbose > 0: print('Skipping {} due to incompatable size'.format(key)) print(' * self = {!r}'.format(self_value.size())) print(' * other = {!r}'.format(other_value.size())) seen_keys['skipped'].add(key) if ignore_unset is True: self_unset_keys = [] elif ignore_unset: self_unset_keys = list(ub.oset(self_unset_keys) - set(ignore_unset)) if (self_unset_keys or other_unused_keys or seen_keys['partial_add_some'] or seen_keys['partial_add_all']): if verbose > 0: if seen_keys: print('Pretrained weights are a partial fit') else: print('Pretrained weights do not fit!') if verbose > 1: print('Seen Keys: {}'.format(ub.repr2(seen_keys, nl=2))) print('Self Unset Keys: {}'.format(ub.repr2(self_unset_keys, nl=1))) print('Other Unused keys: {}'.format( ub.repr2(other_unused_keys, nl=1))) print('summary:') seen_sum = ub.map_vals(len, seen_keys) print('Seen Num: {}'.format(ub.repr2(seen_sum, nl=2))) print('Self Unset Num: {}'.format( ub.repr2(len(self_unset_keys), nl=1))) print('Other Unused Num: {}'.format( ub.repr2(len(other_unused_keys), nl=1))) if leftover: if verbose > 0: print('Initializing unused keys using 
{}'.format(leftover)) for key in self_unset_keys: if key.endswith('.num_batches_tracked'): pass # ignore num_batches_tracked elif key.endswith('.bias'): self_state[key].fill_(0) else: try: leftover(self_state[key]) except Exception: if verbose > 0: print('Unable to init {} with {}'.format( key, leftover)) else: if verbose > 0: print('Pretrained weights are a perfect fit') model.load_state_dict(self_state) info = { 'seen': seen_keys, 'self_unset': self_unset_keys, 'other_unused': other_unused_keys } return info
def benchmark_hash_file(): """ CommandLine: python ~/code/ubelt/dev/bench_hash.py --show python ~/code/ubelt/dev/bench_hash.py --show """ import ubelt as ub import random # dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp')) dpath = ub.ensuredir(ub.expandpath('$HOME/tmp')) rng = random.Random(0) # Create a pool of random chunks of data chunksize = int(2 ** 20) pool_size = 8 part_pool = [_random_data(rng, chunksize) for _ in range(pool_size)] #ITEM = 'JUST A STRING' * 100 HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3'] scales = list(range(5, 10)) import os results = ub.AutoDict() # Use json is faster or at least as fast it most cases # xxhash is also significantly faster than sha512 ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms') for s in ub.ProgIter(scales, desc='benchmark', verbose=3): N = 2 ** s print(' --- s={s}, N={N} --- '.format(s=s, N=N)) # Write a big file size_pool = [N] fpath = _write_random_file(dpath, part_pool, size_pool, rng) megabytes = os.stat(fpath).st_size / (2 ** 20) print('megabytes = {!r}'.format(megabytes)) for hasher in HASHERS: for timer in ti.reset(hasher): ub.hash_file(fpath, hasher=hasher) results[hasher].update({N: ti.mean()}) col = {h: results[h][N] for h in HASHERS} sortx = ub.argsort(col) ranking = ub.dict_subset(col, sortx) print('walltime: ' + ub.repr2(ranking, precision=9, nl=0)) best = next(iter(ranking)) #pairs = list(ub.iter_window( 2)) pairs = [(k, best) for k in ranking] ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs] nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs] relratios = ub.odict(zip(nicekeys, ratios)) print('speedup: ' + ub.repr2(relratios, precision=4, nl=0)) # xdoc +REQUIRES(--show) # import pytest # pytest.skip() import pandas as pd df = pd.DataFrame.from_dict(results) df.columns.name = 'hasher' df.index.name = 'N' ratios = df.copy().drop(columns=df.columns) for k1, k2 in [('sha512', 'xxh64'), ('sha1', 'xxh64'), ('xxh32', 'xxh64'), ('blake3', 'xxh64')]: ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2] print() print('Seconds per iteration') print(df.to_string(float_format='%.9f')) print() print('Ratios of seconds') print(ratios.to_string(float_format='%.2f')) print() print('Average Ratio (over all N)') print(ratios.mean().sort_values()) if ub.argflag('--show'): import kwplot kwplot.autompl() xdata = sorted(ub.peek(results.values()).keys()) ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results) kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds') kwplot.show_if_requested()
def randomized_ibeis_dset(dbname, dim=416): """ CommandLine: xdoctest ~/code/netharn/netharn/examples/siam_ibeis.py randomized_ibeis_dset --show Example: >>> dbname = 'PZ_MTEST' >>> datasets = randomized_ibeis_dset(dbname) >>> # xdoctest: +REQUIRES(--show) >>> nh.util.qtensure() >>> self = datasets['train'] >>> self.show_sample() >>> nh.util.show_if_requested() """ from ibeis.algo.verif import vsone pblm = vsone.OneVsOneProblem.from_empty(dbname) # Simpler very randomized sample strategy pcc_sets = { 'train': set(), 'vali': set(), 'test': set(), } vali_frac = .0 test_frac = .1 train_frac = 1 - (vali_frac + test_frac) category_probs = ub.odict([ ('train', train_frac), ('test', test_frac), ('vali', vali_frac), ]) rng = nh.util.ensure_rng(989540621) # Gather all PCCs pccs = sorted(map(frozenset, pblm.infr.positive_components())) # Each PCC in this group has a probability of going into the # either test / train / or vali split choices = rng.choice(list(category_probs.keys()), p=list(category_probs.values()), size=len(pccs)) for key, pcc in zip(choices, pccs): pcc_sets[key].add(pcc) if __debug__: # Ensure sets of PCCs are disjoint! intersections = {} for key1, key2 in it.combinations(pcc_sets.keys(), 2): isect = pcc_sets[key1].intersection(pcc_sets[key2]) intersections[(key1, key2)] = isect num_isects = ub.map_vals(len, intersections) if any(num_isects.values()): msg = 'Splits are not disjoint: {}'.format( ub.repr2(num_isects, sk=1)) print(msg) raise AssertionError(msg) if True: num_pccs = ub.map_vals(len, pcc_sets) total = sum(num_pccs.values()) fracs = {k: v / total for k, v in num_pccs.items()} print('Splits use the following fractions of data: {}'.format( ub.repr2(fracs, precision=4))) for key, want in category_probs.items(): got = fracs[key] absdiff = abs(want - got) if absdiff > 0.1: raise AssertionError( 'Sampled fraction of {} for {!r} is significantly ' 'different than what was requested: {}'.format( got, key, want)) test_dataset = RandomBalancedIBEISSample(pblm, pcc_sets['test'], dim=dim) train_dataset = RandomBalancedIBEISSample(pblm, pcc_sets['train'], dim=dim, augment=False) vali_dataset = RandomBalancedIBEISSample(pblm, pcc_sets['vali'], dim=dim, augment=False) datasets = { 'train': train_dataset, 'vali': vali_dataset, 'test': test_dataset, } # datasets.pop('test', None) # dont test for now (speed consideration) return datasets
def freq_group(items, groupids):
    groups = ub.group_items(items, groupids)
    hist = ub.map_vals(len, groups)
    for k in ub.argsort(hist):
        yield groups[k]
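# A minimal usage sketch of freq_group: groups are yielded in order of increasing size,
# because ub.argsort on the histogram orders group keys by their counts.
def _demo_freq_group():
    items = ['a', 'b', 'c', 'd', 'e', 'f']
    groupids = [0, 0, 0, 1, 1, 2]
    ordered = list(freq_group(items, groupids))
    # ordered == [['f'], ['d', 'e'], ['a', 'b', 'c']]  (smallest group first)
    return ordered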
def time_ondisk_crop(size=512, dim=3, region='small_random', num=24): """ Ignore: >>> from bench_subregion_imread import * # NOQA >>> import xdev >>> globals().update(xdev.get_func_kwargs(time_ondisk_crop)) """ enabled = { 'in_memory': False, 'memmap': True, 'PIL': False, 'OpenCV': True, 'VIPS': True, 'GDAL': True, 'HDF5': True, } print('\n---') # DATA = 'custom' DATA = 'random' if DATA == 'random': # data = (np.random.rand(500, 500, 3) * 255).astype(np.uint8) print('Generate random data, size={}, dim={}, mode={}'.format( size, dim, region)) x, y = np.meshgrid(np.arange(0, size), np.arange(0, size)) data = np.ascontiguousarray( (np.dstack([x] * dim) % 256).astype(np.uint8)) data = data.squeeze() elif DATA == 'custom': print('USE CUSTOM data, size={}, dim={}, mode={}'.format( size, dim, region)) assert False custom_fpath = '.ptif' import kwimage data = kwimage.imread(custom_fpath) else: raise KeyError(DATA) print('Make temp directory to prepare data') dpath = tempfile.mkdtemp() lossy_ext = ('.jpg', '.ptif', '.cog') img_fpaths = { # 'png': join(dpath, 'foo.png'), # 'jpg': join(dpath, 'foo.jpg'), # 'tif': join(dpath, 'foo.tif'), } pil_img = Image.fromarray(data) for k, v in img_fpaths.items(): print('DUMP v = {!r}'.format(v)) pil_img.save(v) img_fpaths['cog'] = join(dpath, 'foo.cog') # kwimage.imwrite(img_fpaths['cog'], data, backend='gdal', compress='LZW') kwimage.imwrite(img_fpaths['cog'], data, backend='gdal', compress='ZSTD') # imwrite_cloud_optimized_geotiff(img_fpaths['cog'], data) if DATA == 'custom': from os.path import splitext import shutil ext = splitext(custom_fpath)[1][1:] tmp_fpath = join(dpath, 'foo' + ext) shutil.copy2(custom_fpath, tmp_fpath) img_fpaths.update({ 'custom_' + ext: tmp_fpath, }) mem_fpaths = {} if enabled['memmap']: mem_fpaths = { 'npy': join(dpath, 'foo.npy'), } for key, fpath in mem_fpaths.items(): print('DUMP fpath = {!r}'.format(fpath)) np.save(fpath, data) h5_fpaths = {} if enabled['HDF5']: import h5py h5_params = { # 'basic': {}, # 'chunks': {'chunks': (32, 32, 1)}, # 'lzf': {'compression': 'lzf'}, # 'lzf_chunk32': {'compression': 'lzf', 'chunks': (32, 32, 1)}, 'lzf_chunk128': { 'compression': 'lzf', 'chunks': (128, 128, 1) }, } for key, kw in h5_params.items(): print('Dump h5 ' + key) fpath = h5_fpaths[key] = join(dpath, key + '.h5') with h5py.File(fpath, 'w') as h5_file: dset = h5_file.create_dataset('DATA', data.shape, data.dtype, **kw) dset[...] 
= data import netharn as nh bytes_on_disk = {} for k, v in mem_fpaths.items(): bytes_on_disk['mem_' + k] = nh.util.get_file_info(v)['filesize'] for k, v in img_fpaths.items(): bytes_on_disk['img_' + k] = nh.util.get_file_info(v)['filesize'] for k, v in h5_fpaths.items(): bytes_on_disk['hdf5_' + k] = nh.util.get_file_info(v)['filesize'] mb_on_disk = ub.map_vals(lambda x: str(round(x * 1e-6, 2)) + ' MB', bytes_on_disk) print('on-disk memory usage: ' + ub.repr2(mb_on_disk, nl=1)) result = {} def record_result(timer): ti = timer.parent val = ti.min(), ti.mean(), ti.std() result[ti.label] = val rng = np.random.RandomState() def get_index(): """ Get a subregion to load """ if region == 'small_random': # Small window size, but random location size = (172, 172) h, w = size a = rng.randint(0, data.shape[0] - h) b = rng.randint(0, data.shape[1] - w) index = tuple([slice(a, a + h), slice(b, b + w)]) elif region == 'random': a, b = sorted(rng.randint(0, data.shape[0], size=2)) c, d = sorted(rng.randint(0, data.shape[1], size=2)) index = tuple([slice(a, b + 1), slice(c, d + 1)]) elif region == 'corner': index = tuple([slice(0, 8), slice(0, 8)]) else: raise KeyError(index) # index = region if len(data.shape) > 2: index = index + tuple([slice(0, 3)]) area = (index[1].start, index[0].start, index[1].stop, index[0].stop) shape = tuple([s.stop - s.start for s in index]) return index, area, shape def TIMERIT(label): # Ensure each timer run uses the same random numbers rng.seed(0) return timerit.Timerit( num=num, bestof=1, label=label, # unit='us', unit='ms', ) print('Begin benchmarks\n') if enabled['in_memory']: for timer in TIMERIT('in-memory slice'): index, area, shape = get_index() want = data[index] with timer: got = data[index] record_result(timer) if enabled['memmap']: for key, fpath in mem_fpaths.items(): for timer in TIMERIT('np.memmap load+slice ' + key): index, area, shape = get_index() want = data[index] with timer: file1 = np.memmap(fpath, dtype=data.dtype.name, shape=data.shape, offset=128, mode='r') got = file1[index] assert np.all(got == want) record_result(timer) if enabled['memmap']: for key, fpath in mem_fpaths.items(): for timer in TIMERIT('np.load load+slice ' + key): index, area, shape = get_index() want = data[index] with timer: file2 = np.load(fpath, mmap_mode='r') got = file2[index] assert np.all(got == want) record_result(timer) if enabled['PIL']: for key, fpath in img_fpaths.items(): for timer in TIMERIT('PIL open+crop (minimal) ' + key): index, area, shape = get_index() want = data[index] with timer: core = Image.open(fpath).crop(area) record_result(timer) if enabled['PIL']: for key, fpath in img_fpaths.items(): for timer in TIMERIT('PIL open+crop+getdata+asarray ' + key): index, area, shape = get_index() want = data[index] with timer: core = Image.open(fpath).crop(area) got = np.asarray(core.getdata(), dtype=np.uint8) got.shape = shape assert fpath.endswith(lossy_ext) or np.all(got == want) record_result(timer) if enabled['PIL']: for key, fpath in img_fpaths.items(): for timer in TIMERIT('PIL open+asarray+slice ' + key): index, area, shape = get_index() want = data[index] with timer: got = np.asarray(Image.open(fpath))[index] assert fpath.endswith(lossy_ext) or np.all(got == want) record_result(timer) if enabled['OpenCV']: for key, fpath in img_fpaths.items(): for timer in TIMERIT('OpenCV imread+slice ' + key): index, area, shape = get_index() want = data[index] with timer: got = cv2.imread(fpath, flags=cv2.IMREAD_UNCHANGED)[index] if len(index) > 2: got = got[:, :, ::-1] assert 
fpath.endswith(lossy_ext) or np.all(got == want) record_result(timer) if enabled['GDAL']: for key, fpath in img_fpaths.items(): for timer in TIMERIT('GDAL subregion ' + key): index, area, shape = get_index() want = data[index] with timer: got = gdal_subregion_imread(fpath, index) assert fpath.endswith(lossy_ext) or np.all(got == want) record_result(timer) # pip install pyvips if enabled['VIPS']: import pyvips for key, fpath in img_fpaths.items(): for timer in TIMERIT('VIPS ' + key): index, area, shape = get_index() want = data[index] left, top = area[0:2] width, height = shape[0:2][::-1] vips_img = pyvips.Image.new_from_file( fpath, # access='sequential', access='random', # memory=False, # fail=True ) with timer: vips_sub = vips_img.crop(left, top, width, height) got = np.ndarray(buffer=vips_sub.write_to_memory(), dtype=np.uint8, shape=[ vips_sub.height, vips_sub.width, vips_sub.bands ]) assert fpath.endswith(lossy_ext) or np.all(got == want) record_result(timer) if enabled['HDF5']: for key, fpath in h5_fpaths.items(): for timer in TIMERIT('HDF5 ' + key): with h5py.File(fpath, 'r') as file: dset = file['DATA'] index, area, shape = get_index() want = data[index] with timer: got = dset[index] assert fpath.endswith(lossy_ext) or np.all(got == want) record_result(timer) return result
def mapping_stats(xid_to_yids):
    n_yids = list(ub.map_vals(len, xid_to_yids).values())
    return util.stats_dict(n_yids, n_extreme=True)
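# A minimal usage sketch of mapping_stats, assuming util.stats_dict summarizes a list of
# numbers (mean, std, min/max, and so on); the exact keys depend on that helper.
def _demo_mapping_stats():
    gid_to_aids = {1: [1, 2], 2: [3], 3: [4, 5, 6]}
    stats = mapping_stats(gid_to_aids)
    # summarizes the per-image annotation counts [2, 1, 3]
    return stats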
def _build_index(self): """ build reverse indexes Notation: aid - Annotation ID gid - imaGe ID cid - Category ID """ # create index anns, cats, imgs = {}, {}, {} gid_to_aids = ub.ddict(set) cid_to_gids = ub.ddict(set) cid_to_aids = ub.ddict(set) # Build one-to-one self-lookup maps for cat in self.dataset.get('categories', []): cid = cat['id'] if cid in cat: warnings.warn( 'Categories have the same id in {}:\n{} and\n{}'.format( self, cats[cid], cat)) cats[cid] = cat for img in self.dataset.get('images', []): gid = img['id'] if gid in imgs: warnings.warn( 'Images have the same id in {}:\n{} and\n{}'.format( self, imgs[gid], img)) imgs[gid] = img for ann in self.dataset.get('annotations', []): aid = ann['id'] if aid in anns: warnings.warn( 'Annotations have the same id in {}:\n{} and\n{}'.format( self, anns[aid], ann)) anns[aid] = ann # Build one-to-many lookup maps for ann in anns.values(): try: aid = ann['id'] gid = ann['image_id'] cid = ann['category_id'] except KeyError: raise KeyError('Annotation does not have ids {}'.format(ann)) if not isinstance(aid, int): raise TypeError('bad aid={} type={}'.format(aid, type(aid))) if not isinstance(gid, int): raise TypeError('bad gid={} type={}'.format(gid, type(gid))) if not isinstance(cid, int): raise TypeError('bad cid={} type={}'.format(cid, type(cid))) gid_to_aids[gid].add(aid) cid_to_gids[cid].add(gid) cid_to_aids[cid].add(aid) if gid not in imgs: warnings.warn('Annotation {} in {} references ' 'unknown image_id'.format(ann, self)) if cid not in cats: warnings.warn('Annotation {} in {} references ' 'unknown category_id'.format(ann, self)) # Fix one-to-zero cases for cid in cats.keys(): if cid not in cid_to_aids: cid_to_aids[cid] = set() if cid not in cid_to_gids: cid_to_gids[cid] = set() for gid in imgs.keys(): if gid not in gid_to_aids: gid_to_aids[gid] = set() # create class members self.anns = anns self.imgs = imgs self.cats = cats self.gid_to_aids = ub.map_vals(sorted, gid_to_aids) self.cid_to_gids = ub.map_vals(sorted, cid_to_gids) self.cid_to_aids = ub.map_vals(sorted, cid_to_aids) self.name_to_cat = {cat['name']: cat for cat in self.cats.values()}
def make_baseline_truthfiles(): work_dir = ub.truepath('~/work') data_dir = ub.truepath('~/data') challenge_data_dir = join(data_dir, 'viame-challenge-2018') challenge_work_dir = join(work_dir, 'viame-challenge-2018') ub.ensuredir(challenge_work_dir) img_root = join(challenge_data_dir, 'phase0-imagery') annot_dir = join(challenge_data_dir, 'phase0-annotations') fpaths = list(glob.glob(join(annot_dir, '*.json'))) # ignore the non-bounding box nwfsc and afsc datasets for now # exclude = ('nwfsc', 'afsc', 'mouss', 'habcam') # exclude = ('nwfsc', 'afsc', 'mouss',) # fpaths = [p for p in fpaths if not basename(p).startswith(exclude)] import json dsets = ub.odict() for fpath in fpaths: key = basename(fpath).split('.')[0] dsets[key] = json.load(open(fpath, 'r')) print('Merging') merged = coco_union(dsets) merged_fpath = join(challenge_work_dir, 'phase0-merged.mscoco.json') with open(merged_fpath, 'w') as fp: json.dump(merged, fp, indent=4) import copy self = CocoDataset(copy.deepcopy(merged), img_root=img_root, autobuild=False) self._build_index() self.run_fixes() if True: # remove all point annotations print('Remove point annotations') to_remove = [] for ann in self.dataset['annotations']: if ann['roi_shape'] == 'point': to_remove.append(ann) for ann in to_remove: self.dataset['annotations'].remove(ann) self._build_index() # remove empty images print('Remove empty images') to_remove = [] for gid in self.imgs.keys(): aids = self.gid_to_aids.get(gid, []) if not aids: to_remove.append(self.imgs[gid]) for img in to_remove: self.dataset['images'].remove(img) self._build_index() print('# self.anns = {!r}'.format(len(self.anns))) print('# self.imgs = {!r}'.format(len(self.imgs))) print('# self.cats = {!r}'.format(len(self.cats))) catname_to_nannots = ub.map_keys(lambda x: self.cats[x]['name'], ub.map_vals(len, self.cid_to_aids)) catname_to_nannots = ub.odict(sorted(catname_to_nannots.items(), key=lambda kv: kv[1])) print(ub.repr2(catname_to_nannots)) if False: # aid = list(self.anns.values())[0]['id'] # self.show_annotation(aid) gids = sorted([gid for gid, aids in self.gid_to_aids.items() if aids]) # import utool as ut # for gid in ut.InteractiveIter(gids): for gid in gids: from matplotlib import pyplot as plt fig = plt.figure(1) fig.clf() self.show_annotation(gid=gid) fig.canvas.draw() for ann in self.anns.values(): primary_aid = ann['id'] print('primary_aid = {!r}'.format(primary_aid)) print(len(self.gid_to_aids[ann['image_id']])) if 'roi_shape' not in ann: ann['roi_shape'] = 'bounding_box' if ann['roi_shape'] == 'boundingBox': pass if ann['roi_shape'] == 'point': primary_aid = ann['id'] print('primary_aid = {!r}'.format(primary_aid)) print(len(self.gid_to_aids[ann['image_id']])) break # Split into train / test set print('Splitting') skf = StratifiedGroupKFold(n_splits=2) groups = [ann['image_id'] for ann in self.anns.values()] y = [ann['category_id'] for ann in self.anns.values()] X = [ann['id'] for ann in self.anns.values()] split = list(skf.split(X=X, y=y, groups=groups))[0] train_idx, test_idx = split print('Taking subsets') aid_to_gid = {aid: ann['image_id'] for aid, ann in self.anns.items()} train_aids = list(ub.take(X, train_idx)) test_aids = list(ub.take(X, test_idx)) train_gids = sorted(set(ub.take(aid_to_gid, train_aids))) test_gids = sorted(set(ub.take(aid_to_gid, test_aids))) train_dset = self.subset(train_gids) test_dset = self.subset(test_gids) print('---------') print('# train_dset.anns = {!r}'.format(len(train_dset.anns))) print('# train_dset.imgs = {!r}'.format(len(train_dset.imgs))) 
print('# train_dset.cats = {!r}'.format(len(train_dset.cats))) print('---------') print('# test_dset.anns = {!r}'.format(len(test_dset.anns))) print('# test_dset.imgs = {!r}'.format(len(test_dset.imgs))) print('# test_dset.cats = {!r}'.format(len(test_dset.cats))) train_dset._ensure_imgsize() test_dset._ensure_imgsize() print('Writing') with open(join(challenge_work_dir, 'phase0-merged-train.mscoco.json'), 'w') as fp: json.dump(train_dset.dataset, fp, indent=4) with open(join(challenge_work_dir, 'phase0-merged-test.mscoco.json'), 'w') as fp: json.dump(test_dset.dataset, fp, indent=4) # Make a detectron yaml file config_text = ub.codeblock( """ MODEL: TYPE: generalized_rcnn CONV_BODY: ResNet.add_ResNet50_conv4_body NUM_CLASSES: {num_classes} FASTER_RCNN: True NUM_GPUS: 1 SOLVER: WEIGHT_DECAY: 0.0001 LR_POLICY: steps_with_decay BASE_LR: 0.01 GAMMA: 0.1 # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) MAX_ITER: 180000 STEPS: [0, 120000, 160000] RPN: SIZES: (32, 64, 128, 256, 512) FAST_RCNN: ROI_BOX_HEAD: ResNet.add_ResNet_roi_conv5_head ROI_XFORM_METHOD: RoIAlign TRAIN: WEIGHTS: https://s3-us-west-2.amazonaws.com/detectron/ImageNetPretrained/MSRA/R-50.pkl DATASETS: ('/work/viame-challenge-2018/phase0-merged-train.mscoco.json',) IM_DIR: '/data/viame-challenge-2018/phase0-imagery' SCALES: (800,) MAX_SIZE: 1333 IMS_PER_BATCH: 1 BATCH_SIZE_PER_IM: 512 TEST: DATASETS: ('/work/viame-challenge-2018/phase0-merged-test.mscoco.json',) IM_DIR: '/data/viame-challenge-2018/phase0-imagery' SCALES: (800,) MAX_SIZE: 1333 NMS: 0.5 RPN_PRE_NMS_TOP_N: 6000 RPN_POST_NMS_TOP_N: 1000 OUTPUT_DIR: /work/viame-challenge-2018/output """) config_text = config_text.format( num_classes=len(self.cats), ) ub.writeto(join(challenge_work_dir, 'phase0-faster-rcnn.yaml'), config_text) docker_cmd = ('nvidia-docker run ' '-v {work_dir}:/work -v {data_dir}:/data ' '-it detectron:c2-cuda9-cudnn7 bash').format( work_dir=work_dir, data_dir=data_dir) train_cmd = ('python2 tools/train_net.py ' '--cfg /work/viame-challenge-2018/phase0-faster-rcnn.yaml ' 'OUTPUT_DIR /work/viame-challenge-2018/output') hacks = ub.codeblock( """ git remote add Erotemic https://github.com/Erotemic/Detectron.git git fetch --all git checkout general_dataset # curl https://github.com/Erotemic/Detectron/blob/42d44b2d155c775dc509b6a44518d0c582f8cdf5/tools/train_net.py # wget https://github.com/Erotemic/Detectron/blob/42d44b2d155c775dc509b6a44518d0c582f8cdf5/lib/core/config.py """) print(docker_cmd) print(train_cmd)
def fix_annotmatch_pzmaster1():
    """
    PZ_Master1 had annotmatch rowids that did not agree with the current name
    labeling. Looking at the inconsistencies in the graph interface was too
    cumbersome, because over 3000 annots were incorrectly grouped together.

    This function deletes any annotmatch rowid that is not consistent with
    the current labeling so we can go forward with using the new
    AnnotInference object.
    """
    import wbia

    ibs = wbia.opendb('PZ_Master1')
    infr = wbia.AnnotInference(ibs=ibs, aids=ibs.get_valid_aids(), verbose=5)
    infr.initialize_graph()
    annots = ibs.annots()
    aid_to_nid = ut.dzip(annots.aids, annots.nids)

    if False:
        infr.reset_feedback()
        infr.ensure_mst()
        infr.apply_feedback_edges()
        infr.relabel_using_reviews()
        infr.start_qt_interface()

    # Get annotmatch rowids that agree with current labeling
    if False:
        annotmatch = ibs.db.get_table_as_pandas('annotmatch')
        import pandas as pd

        flags1 = pd.isnull(annotmatch['annotmatch_evidence_decision'])
        flags2 = annotmatch['annotmatch_tag_text'] == ''
        bad_part = annotmatch[flags1 & flags2]
        rowids = bad_part.index.tolist()
        ibs.delete_annotmatch(rowids)

    if False:
        # Delete bidirectional annotmatches
        annotmatch = ibs.db.get_table_as_pandas('annotmatch')
        df = annotmatch.set_index(['annot_rowid1', 'annot_rowid2'])

        # Find entries that have both directions
        pairs1 = annotmatch[['annot_rowid1', 'annot_rowid2']].values
        f_edges = {tuple(p) for p in pairs1}
        b_edges = {tuple(p[::-1]) for p in pairs1}
        isect_edges = {tuple(sorted(p)) for p in b_edges.intersection(f_edges)}
        isect_edges1 = list(isect_edges)
        isect_edges2 = [p[::-1] for p in isect_edges]

        # cols = ['annotmatch_evidence_decision', 'annotmatch_tag_text']
        import pandas as pd

        custom_ = {
            (559, 4909): (False, ['photobomb']),
            (7918, 8041): (False, ['photobomb']),
            (6634, 6754): (False, ['photobomb']),
            (3707, 3727): (False, ['photobomb']),
            (86, 103): (False, ['photobomb']),
        }
        extra_ = {}

        fixme_edges = []

        d1 = df.loc[isect_edges1].reset_index(drop=False)
        d2 = df.loc[isect_edges2].reset_index(drop=False)
        flags = (d1['annotmatch_evidence_decision'] !=
                 d2['annotmatch_evidence_decision'])
        from wbia.tag_funcs import _parse_tags

        for f, r1, r2 in zip(flags, d1.iterrows(), d2.iterrows()):
            v1, v2 = r1[1], r2[1]
            aid1 = v1['annot_rowid1']
            aid2 = v1['annot_rowid2']
            truth_real = (ibs.const.EVIDENCE_DECISION.POSITIVE
                          if aid_to_nid[aid1] == aid_to_nid[aid2]
                          else ibs.const.EVIDENCE_DECISION.NEGATIVE)
            truth1 = v1['annotmatch_evidence_decision']
            truth2 = v2['annotmatch_evidence_decision']
            t1 = _parse_tags(v1['annotmatch_tag_text'])
            t2 = _parse_tags(v2['annotmatch_tag_text'])
            newtag = ut.union_ordered(t1, t2)

            if (aid1, aid2) in custom_:
                continue

            fixme_flag = False
            if not pd.isnull(truth1):
                if truth_real != truth1:
                    fixme_flag = True
            if not pd.isnull(truth2):
                if truth_real != truth2:
                    fixme_flag = True

            if fixme_flag:
                logger.info('newtag = %r' % (newtag,))
                logger.info('truth_real = %r' % (truth_real,))
                logger.info('truth1 = %r' % (truth1,))
                logger.info('truth2 = %r' % (truth2,))
                logger.info('aid1 = %r' % (aid1,))
                logger.info('aid2 = %r' % (aid2,))
                fixme_edges.append((aid1, aid2))
            else:
                extra_[(aid1, aid2)] = (truth_real, newtag)

        extra_.update(custom_)
        new_pairs = extra_.keys()
        new_truths = ut.take_column(ut.dict_take(extra_, new_pairs), 0)
        new_tags = ut.take_column(ut.dict_take(extra_, new_pairs), 1)
        new_tag_texts = [';'.join(t) for t in new_tags]
        aids1, aids2 = ut.listT(new_pairs)

        # Delete the old
        ibs.delete_annotmatch(d1['annotmatch_rowid'].values.tolist() +
                              d2['annotmatch_rowid'].values.tolist())

        # Add the new
        ams = ibs.add_annotmatch_undirected(aids1, aids2)
        ibs.set_annotmatch_evidence_decision(ams, new_truths)
        ibs.set_annotmatch_tag_text(ams, new_tag_texts)

        if False:
            import wbia.guitool as gt

            gt.ensure_qapp()
            ut.qtensure()
            from wbia.gui import inspect_gui

            inspect_gui.show_vsone_tuner(ibs, aid1, aid2)

        # pairs2 = pairs1.T[::-1].T
        # idx1, idx2 = ut.isect_indices(list(map(tuple, pairs1)),
        #                               list(map(tuple, pairs2)))
        # r_edges = list(set(map(tuple, map(sorted, pairs1[idx1]))))
        # unique_pairs = list(set(map(tuple, map(sorted, pairs1[idx1]))))
        # df = annotmatch.set_index(['annot_rowid1', 'annot_rowid2'])

    x = ut.ddict(list)
    annotmatch = ibs.db.get_table_as_pandas('annotmatch')
    import ubelt as ub

    _iter = annotmatch.iterrows()
    prog = ub.ProgIter(_iter, length=len(annotmatch))
    for k, m in prog:
        aid1 = m['annot_rowid1']
        aid2 = m['annot_rowid2']
        if m['annotmatch_evidence_decision'] == ibs.const.EVIDENCE_DECISION.POSITIVE:
            if aid_to_nid[aid1] == aid_to_nid[aid2]:
                x['agree1'].append(k)
            else:
                x['disagree1'].append(k)
        elif m['annotmatch_evidence_decision'] == ibs.const.EVIDENCE_DECISION.NEGATIVE:
            if aid_to_nid[aid1] == aid_to_nid[aid2]:
                x['disagree2'].append(k)
            else:
                x['agree2'].append(k)

    ub.map_vals(len, x)
    ut.dict_hist(annotmatch.loc[x['disagree1']]['annotmatch_tag_text'])
    disagree1 = annotmatch.loc[x['disagree1']]
    pb_disagree1 = disagree1[disagree1['annotmatch_tag_text'] == 'photobomb']
    aids1 = pb_disagree1['annot_rowid1'].values.tolist()
    aids2 = pb_disagree1['annot_rowid2'].values.tolist()
    aid_pairs = list(zip(aids1, aids2))
    infr = wbia.AnnotInference.from_pairs(aid_pairs, ibs=ibs, verbose=5)
    if False:
        feedback = infr.read_wbia_annotmatch_feedback(edges=infr.edges())
        infr.external_feedback = feedback
        infr.apply_feedback_edges()
        infr.start_qt_interface(loop=False)

    # Delete these values
    if False:
        nonpb_disagree1 = disagree1[disagree1['annotmatch_tag_text'] != 'photobomb']
        disagree2 = annotmatch.loc[x['disagree2']]
        ibs.delete_annotmatch(nonpb_disagree1['annotmatch_rowid'])
        ibs.delete_annotmatch(disagree2['annotmatch_rowid'])

    # ut.dict_hist(disagree1['annotmatch_tag_text'])
    import networkx as nx

    graph = nx.Graph()
    graph.add_edges_from(
        zip(pb_disagree1['annot_rowid1'], pb_disagree1['annot_rowid2']))
    list(nx.connected_components(graph))

    set(annotmatch.loc[x['disagree2']]['annotmatch_tag_text'])

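# The bidirectional-annotmatch cleanup above hinges on one small set trick:
# an undirected pair was stored in both directions exactly when it survives
# the intersection of the forward and reversed edge sets. The function below
# is a toy, database-free illustration of just that step; the pair values
# are made up for the example.
def demo_find_bidirectional_pairs():
    pairs1 = [(1, 2), (2, 1), (3, 4), (5, 6), (6, 5), (7, 8)]

    f_edges = {tuple(p) for p in pairs1}        # edges as stored
    b_edges = {tuple(p[::-1]) for p in pairs1}  # the same edges reversed
    # A pair appears in both sets iff it was stored in both directions
    isect_edges = {tuple(sorted(p)) for p in b_edges.intersection(f_edges)}

    print(sorted(isect_edges))  # [(1, 2), (5, 6)]
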
def coerce_datasets(config, build_hashid=False, verbose=1):
    """
    Coerce train / val / test datasets from standard netharn config keys

    TODO:
        * Does this belong in netharn?

    This only looks at the following keys in config:
        * datasets
        * train_dataset
        * vali_dataset
        * test_dataset

    Example:
        >>> import kwcoco
        >>> import ndsampler.coerce_data
        >>> config = {'datasets': 'special:shapes'}
        >>> print('config = {!r}'.format(config))
        >>> dsets = ndsampler.coerce_data.coerce_datasets(config)
        >>> print('dsets = {!r}'.format(dsets))

        >>> config = {'datasets': 'special:shapes256'}
        >>> ndsampler.coerce_data.coerce_datasets(config)

        >>> config = {
        >>>     'datasets': kwcoco.CocoDataset.demo('shapes'),
        >>> }
        >>> coerce_datasets(config)

        >>> coerce_datasets({
        >>>     'datasets': kwcoco.CocoDataset.demo('shapes'),
        >>>     'test_dataset': kwcoco.CocoDataset.demo('photos'),
        >>> })
        >>> coerce_datasets({
        >>>     'datasets': kwcoco.CocoDataset.demo('shapes'),
        >>>     'test_dataset': kwcoco.CocoDataset.demo('photos'),
        >>> })
    """
    # Ideally the user specifies a standard train/vali/test split
    def _rectify_fpath(key):
        fpath = key
        fpath = fpath.lstrip('path:').lstrip('PATH:')
        fpath = ub.expandpath(fpath)
        return fpath

    def _ensure_coco(coco):
        # Map a file path or an in-memory dataset to a CocoDataset
        import kwcoco
        import six
        from os.path import exists
        if coco is None:
            return None
        elif isinstance(coco, six.string_types):
            fpath = _rectify_fpath(coco)
            if exists(fpath):
                with ub.Timer('read kwcoco dataset: fpath = {!r}'.format(fpath)):
                    coco = kwcoco.CocoDataset(fpath, autobuild=False)
                print('building kwcoco index')
                coco._build_index()
            else:
                if not coco.lower().startswith('special:'):
                    import warnings
                    warnings.warn('warning start dataset codes with special:')
                    code = coco
                else:
                    code = coco.lower()[len('special:'):]
                coco = kwcoco.CocoDataset.demo(code)
        else:
            # print('live dataset')
            assert isinstance(coco, kwcoco.CocoDataset)
        return coco

    config = config.copy()
    subsets = {
        'train': config.get('train_dataset', None),
        'vali': config.get('vali_dataset', None),
        'test': config.get('test_dataset', None),
    }

    # specifying any train / vali / test disables datasets
    if any(d is not None for d in subsets.values()):
        config['datasets'] = None

    if verbose:
        print('[ndsampler.coerce_data] Checking for explicit subsets')
    subsets = ub.map_vals(_ensure_coco, subsets)

    # However, sometimes they just specify a single dataset, and we need to
    # make a split for it.
    # print('config = {!r}'.format(config))
    base = _ensure_coco(config.get('datasets', None))
    print('[ndsampler.coerce_data] base = {!r}'.format(base))
    if base is not None:
        if verbose:
            print('Splitting base into train/vali')
        # TODO: the actual split may need to be cached.
        factor = config.get('split_factor', 3)
        split_gids = _split_train_vali_test(base, factor=factor)
        if config.get('no_test', False):
            split_gids['train'] += split_gids.pop('test')
        for tag in split_gids.keys():
            gids = split_gids[tag]
            subset = base.subset(sorted(gids), copy=True)
            subset.tag = base.tag + '-' + tag
            subsets[tag] = subset

    subsets = {k: v for k, v in subsets.items() if v is not None}

    if build_hashid:
        print('Building subset hashids')
        for tag, subset in subsets.items():
            print('Build index for {}'.format(subset.tag))
            subset._build_index()
            print('Build hashid for {}'.format(subset.tag))
            subset._build_hashid(hash_pixels=False, verbose=10)

    # if verbose:
    #     print(_catfreq_columns_str(subsets))
    return subsets

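# To make the precedence rule in coerce_datasets concrete: a lone 'datasets'
# entry is split internally, but any explicit *_dataset key sets 'datasets'
# to None, so only the explicit subsets survive. This is a hedged usage
# sketch, not part of the module: it assumes kwcoco is installed, that the
# demo codes 'shapes32' and 'photos' are available, and the exact keys
# returned by the internal split may differ.
def demo_coerce_datasets_precedence():
    import kwcoco

    # Case 1: a single base dataset is split into subsets automatically
    dsets = coerce_datasets({'datasets': kwcoco.CocoDataset.demo('shapes32')})
    print(sorted(dsets.keys()))  # typically includes 'train' and 'vali'

    # Case 2: an explicit test_dataset disables the 'datasets' key entirely
    dsets = coerce_datasets({
        'datasets': kwcoco.CocoDataset.demo('shapes32'),
        'test_dataset': kwcoco.CocoDataset.demo('photos'),
    })
    print(sorted(dsets.keys()))  # only ['test'] survives
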
def __init__(api):
    api.base = 'https://pogoapi.net/api/v1/'
    api.routes = {
        'pokemon_stats': api.base + 'pokemon_stats.json',
        'current_pokemon_moves': api.base + 'current_pokemon_moves.json',
        'pokemon_evolutions': api.base + 'pokemon_evolutions.json',
        'cp_multiplier': api.base + 'cp_multiplier.json',
        'pokemon_types': api.base + 'pokemon_types.json',
        'charged_moves': api.base + 'charged_moves.json',
        'fast_moves': api.base + 'fast_moves.json',
        'type_effectiveness': api.base + 'type_effectiveness.json',
        'pokemon_powerup_requirements': api.base + 'pokemon_powerup_requirements.json',
        'pokemon_candy_to_evolve': api.base + 'pokemon_candy_to_evolve.json',
        'pokemon_buddy_distances': api.base + 'pokemon_buddy_distances.json',
        'shadow_pokemon': api.base + 'shadow_pokemon.json',
        'pokemon_forms': api.base + 'pokemon_forms.json',
        'pvp_exclusive_pokemon': api.base + 'pvp_exclusive_pokemon.json',
        'galarian_pokemon': api.base + 'galarian_pokemon.json',
        'alolan_pokemon': api.base + 'alolan_pokemon.json',
        'shiny_pokemon': api.base + 'shiny_pokemon.json',
        'mega_pokemon': api.base + 'mega_pokemon.json',
        'baby_pokemon': api.base + 'baby_pokemon.json',
        'nesting_pokemon': api.base + 'nesting_pokemon.json',
        'released_pokemon': api.base + 'released_pokemon.json',
        'pokemon_names': api.base + 'pokemon_names.json',
        'api_hashes': api.base + 'api_hashes.json',
        'pvp_fast_moves': api.base + 'pvp_fast_moves.json',
        'pvp_charged_moves': api.base + 'pvp_charged_moves.json',
    }

    # TODO: determine when to redownload
    api.data = {}
    for key, url in api.routes.items():
        redo = 0
        data_fpath = ub.grabdata(url, verbose=1, redo=redo,
                                 expires=24 * 60 * 60)
        with open(data_fpath, 'r') as file:
            data = json.load(file)
        api.data[key] = data

    # Make the API global for now
    pokemon_stats = api.data['pokemon_stats']
    _name_to_stats = ub.group_items(
        pokemon_stats, lambda item: item['pokemon_name'].lower())
    _name_to_stats = dict(_name_to_stats)
    api.name_to_stats = _name_to_stats

    _name_to_moves = ub.group_items(
        api.data['current_pokemon_moves'],
        lambda item: item['pokemon_name'].lower())
    _name_to_moves.default_factory = None
    _name_to_moves = dict(_name_to_moves)
    # base = 'http://pokeapi.co/api/v2/pokemon/'
    api.name_to_moves = _name_to_moves

    evolutions = api.data['pokemon_evolutions']
    _name_to_evolutions = ub.group_items(
        evolutions, lambda item: item['pokemon_name'].lower())
    _name_to_evolutions = dict(_name_to_evolutions)

    for key, form_stats in api.name_to_stats.items():
        if key not in _name_to_evolutions:
            noevos = []
            for s in form_stats:
                empty = ub.dict_isect(s, {'form', 'pokemon_name', 'pokemon_id'})
                empty['evolutions'] = []
                noevos.append(empty)
            _name_to_evolutions[key] = noevos

    _name_to_types = ub.group_items(
        api.data['pokemon_types'], lambda item: item['pokemon_name'].lower())
    _name_to_types = dict(_name_to_types)
    api.name_to_type = _name_to_types

    evo_graph = nx.DiGraph()
    for name, form_evo_list in _name_to_evolutions.items():
        for form_evo in form_evo_list:
            u = form_evo['pokemon_name'].lower()
            evo_graph.add_node(u)
            for evo in form_evo['evolutions']:
                v = evo['pokemon_name'].lower()
                evo_graph.add_edge(u, v)

    api.name_to_family = {}
    api.name_to_base = {}
    evo_graph.remove_edges_from(nx.selfloop_edges(evo_graph))
    api.evo_graph = evo_graph
    for cc in list(nx.connected_components(api.evo_graph.to_undirected())):
        bases = [n for n in cc if len(evo_graph.pred[n]) == 0]
        base = bases[0]
        for n in cc:
            api.name_to_family[n] = cc
            api.name_to_base[n] = base

    api.name_to_evolutions = _name_to_evolutions

    api.pve_fast_moves = ub.group_items(
        api.data['fast_moves'],
        lambda item: normalize(item['name'].lower()))
    api.pve_fast_moves.default_factory = None

    api.pve_charged_moves = ub.group_items(
        api.data['charged_moves'],
        lambda item: normalize(item['name'].lower()))
    api.pve_charged_moves.default_factory = None

    api.pvp_fast_moves = ub.group_items(
        api.data['pvp_fast_moves'],
        lambda item: normalize(item['name'].lower()))
    api.pvp_fast_moves.default_factory = None

    api.pvp_charged_moves = ub.group_items(
        api.data['pvp_charged_moves'],
        lambda item: normalize(item['name'].lower()))
    api.pvp_charged_moves.default_factory = None

    if 0:
        ub.map_vals(len, api.pve_fast_moves)
        ub.map_vals(len, api.pve_charged_moves)

    api.learnable = {
        # TODO: remove
        'stunfisk_galarian': {
            'fast': [
                'MUD_SHOT',
                'METAL_CLAW',
            ],
            'charge': [
                'EARTHQUAKE',
                'FLASH_CANNON',
                'MUDDY_WATER',
                'ROCK_SLIDE',
            ]
        }
    }

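# The family / base-form bookkeeping at the end of __init__ is a
# connected-components pass over the evolution DAG: each undirected
# component is a family, and the node with no predecessors is its base
# form. The function below is a small hard-coded toy version of that step
# (illustrative names only, no pogoapi download).
def demo_evolution_families():
    import networkx as nx

    evo_graph = nx.DiGraph()
    evo_graph.add_edges_from([
        ('bulbasaur', 'ivysaur'), ('ivysaur', 'venusaur'),
        ('pichu', 'pikachu'), ('pikachu', 'raichu'),
    ])
    evo_graph.add_node('tauros')  # a family with no evolutions

    name_to_base = {}
    name_to_family = {}
    for cc in nx.connected_components(evo_graph.to_undirected()):
        # the base form is the node with no predecessors in the directed graph
        base = [n for n in cc if len(evo_graph.pred[n]) == 0][0]
        for n in cc:
            name_to_family[n] = cc
            name_to_base[n] = base

    print(name_to_base['raichu'])             # pichu
    print(sorted(name_to_family['ivysaur']))  # ['bulbasaur', 'ivysaur', 'venusaur']
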
def instance_fscore(gti, uncertain, dsm, pred, info=False):
    """
    path = '/home/local/KHQ/jon.crall/data/work/urban_mapper/eval/input_4224-rwyxarza/solver_4214-yxalqwdk_unet_vgg_nttxoagf_a=1,n_ch=5,n_cl=3/_epoch_00000236/restiched/pred'

    path = ub.truepath(
        '~/remote/aretha/data/work/urban_mapper2/test/input_4224-exkudlzu/'
        'solver_4214-guwsobde_unet_mmavmuou_eqnoygqy_a=1,c=RGB,n_ch=5,n_cl=4/'
        '_epoch_00000154/restiched/pred')
    mode_paths = sorted(glob.glob(path + '/*.png'))

    def instance_label(pred, k=15, n_iters=1, dist_thresh=5, watershed=False):
        mask = pred
        # noise removal
        if k > 1 and n_iters > 0:
            kernel = np.ones((k, k), np.uint8)
            mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel,
                                    iterations=n_iters)

        if watershed:
            from clab.torch import filters
            mask = filters.watershed_filter(mask, dist_thresh=dist_thresh)
            mask = mask.astype(np.uint8)

        n_ccs, cc_labels = cv2.connectedComponents(mask, connectivity=4)
        return cc_labels

    from clab.tasks.urban_mapper_3d import UrbanMapper3D
    task = UrbanMapper3D('', '')

    fscores = []
    for pred_fpath in ub.ProgIter(mode_paths):
        pass
        gtl_fname = basename(pred_fpath).replace('.png', '_GTL.tif')
        gti_fname = basename(pred_fpath).replace('.png', '_GTI.tif')
        dsm_fname = basename(pred_fpath).replace('.png', '_DSM.tif')
        bgr_fname = basename(pred_fpath).replace('.png', '_RGB.tif')

        gtl_fpath = join(ub.truepath('~/remote/aretha/data/UrbanMapper3D/training/'), gtl_fname)
        gti_fpath = join(ub.truepath('~/remote/aretha/data/UrbanMapper3D/training/'), gti_fname)
        dsm_fpath = join(ub.truepath('~/remote/aretha/data/UrbanMapper3D/training/'), dsm_fname)
        bgr_fpath = join(ub.truepath('~/remote/aretha/data/UrbanMapper3D/training/'), bgr_fname)

        pred_seg = util.imread(pred_fpath)
        pred = instance_label(pred_seg, dist_thresh=d, k=k, watershed=True)
        gti = util.imread(gti_fpath)
        gtl = util.imread(gtl_fpath)
        dsm = util.imread(dsm_fpath)
        bgr = util.imread(bgr_fpath)

        uncertain = (gtl == 65)

        fscore = instance_fscore(gti, uncertain, dsm, pred)
        fscores.append(fscore)

    print('k = {!r}'.format(k))
    print('d = {!r}'.format(d))
    print(np.mean(fscores))

    from clab import profiler
    instance_fscore_ = dynamic_profile(instance_fscore)
    fscore = instance_fscore_(gti, uncertain, dsm, pred)
    instance_fscore_.profile.profile.print_stats()
    """
    def _bbox(arr):
        # r1, c1, r2, c2
        return np.hstack([arr.min(axis=0), arr.max(axis=0)])

    def cc_locs(ccs):
        rc_locs = np.where(ccs > 0)
        rc_ids = ccs[rc_locs]
        rc_arr = np.ascontiguousarray(np.vstack(rc_locs).T)
        unique_labels, groupxs = util.group_indices(rc_ids)
        grouped_arrs = util.apply_grouping(rc_arr, groupxs, axis=0)
        id_to_rc = ub.odict(zip(unique_labels, grouped_arrs))
        return id_to_rc, unique_labels, groupxs, rc_arr

    (true_rcs_arr, group_true_labels,
     true_groupxs, true_rc_arr) = cc_locs(gti)
    (pred_rcs_arr, group_pred_labels,
     pred_groupxs, pred_rc_arr) = cc_locs(pred)

    DSM_NAN = -32767
    MIN_SIZE = 100
    MIN_IOU = 0.45
    # H, W = pred.shape[0:2]

    # --- Find uncertain truth ---
    # any gt-building explicitly labeled in the GTL is uncertain
    uncertain_labels = set(np.unique(gti[uncertain.astype(bool)]))

    # Any gt-building less than 100px or at the boundary is uncertain.
    for label, rc_arr in true_rcs_arr.items():
        if len(rc_arr) < MIN_SIZE:
            rc_arr = np.array(list(rc_arr))
            if (np.any(rc_arr == 0) or np.any(rc_arr == 2047)):
                uncertain_labels.add(label)
        else:
            rc_loc = tuple(rc_arr.T)
            is_invisible = (dsm[rc_loc] == DSM_NAN)
            if np.any(is_invisible):
                invisible_rc = rc_arr.compress(is_invisible, axis=0)
                invisible_rc_set = set(map(tuple, invisible_rc))
                # Remove invisible pixels
                remain_rc_set = list(
                    set(map(tuple, rc_arr)).difference(invisible_rc_set))
                true_rcs_arr[label] = np.array(remain_rc_set)
                uncertain_labels.add(label)

    def make_int_coords(rc_arr, unique_labels, groupxs):
        # using nums instead of tuples gives the intersection a modest speedup
        rc_int = rc_arr.T[0] + pred.shape[0] + rc_arr.T[1]
        id_to_rc_int = ub.odict(zip(
            unique_labels,
            map(set, util.apply_grouping(rc_int, groupxs))))
        return id_to_rc_int

    # Make intersection a bit faster by filtering via bbox first
    true_rcs_bbox = ub.map_vals(_bbox, true_rcs_arr)
    pred_rcs_bbox = ub.map_vals(_bbox, pred_rcs_arr)

    true_bboxes = np.array(list(true_rcs_bbox.values()))
    pred_bboxes = np.array(list(pred_rcs_bbox.values()))

    candidate_matches = {}
    for plabel, pb in zip(group_pred_labels, pred_bboxes):
        irc1 = np.maximum(pb[0:2], true_bboxes[:, 0:2])
        irc2 = np.minimum(pb[2:4], true_bboxes[:, 2:4])
        irc1 = np.minimum(irc1, irc2, out=irc1)
        isect_area = np.prod(np.abs(irc2 - irc1), axis=1)
        tlabels = list(ub.take(group_true_labels, np.where(isect_area)[0]))
        candidate_matches[plabel] = set(tlabels)

    # using nums instead of tuples gives the intersection a modest speedup
    pred_rcs_ = make_int_coords(pred_rc_arr, group_pred_labels, pred_groupxs)
    true_rcs_ = make_int_coords(true_rc_arr, group_true_labels, true_groupxs)

    # Greedy matching
    unused_true_rcs = true_rcs_.copy()
    FP = TP = FN = 0
    unused_true_keys = set(unused_true_rcs.keys())

    assignment = []
    fp_labels = []
    fn_labels = []
    tp_labels = []

    for pred_label, pred_rc_set in pred_rcs_.items():

        best_score = (-np.inf, -np.inf)
        best_label = None

        # Only check unused true labels that intersect with the predicted bbox
        true_cand = candidate_matches[pred_label] & unused_true_keys
        for true_label in true_cand:
            true_rc_set = unused_true_rcs[true_label]
            n_isect = len(pred_rc_set.intersection(true_rc_set))
            iou = n_isect / (len(true_rc_set) + len(pred_rc_set) - n_isect)
            if iou > MIN_IOU:
                score = (iou, -true_label)
                if score > best_score:
                    best_score = score
                    best_label = true_label

        if best_label is not None:
            assignment.append((pred_label, best_label, best_score[0]))
            unused_true_keys.remove(best_label)
            if best_label not in uncertain_labels:
                TP += 1
                tp_labels.append((pred_label, best_label, best_score[0]))
        else:
            FP += 1
            fp_labels.append(pred_label)

    # Had two bugs:
    # * used the wrong variable to count false negs (all true were labeled as
    #   FN), massively increasing FN
    # * certain true buildings were marked as uncertain, but I was checking
    #   against the pred labels instead (possibly decreasing/increasing TP)
    fn_labels = unused_true_keys - uncertain_labels  # NOQA
    FN = len(fn_labels)

    precision = TP / (TP + FP) if TP > 0 else 0
    recall = TP / (TP + FN) if TP > 0 else 0
    if precision > 0 and recall > 0:
        f_score = 2 * precision * recall / (precision + recall)
    else:
        f_score = 0

    # They multiply by 1e6, but let's not do that.
    if info:
        infod = {
            'assign': assignment,
            'tp': tp_labels,
            'fp': fp_labels,
            'fn': fn_labels,
            'uncertain': uncertain_labels,
        }
        return (f_score, precision, recall), infod

    return (f_score, precision, recall)

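# Stripped of the bbox prefilter, the integer-coordinate encoding, and the
# uncertain-label handling, the matching in instance_fscore reduces to a
# greedy IoU assignment between pixel sets. The function below is a toy
# sketch of just that core on made-up component coordinates.
def demo_greedy_instance_matching():
    import numpy as np

    MIN_IOU = 0.45
    true_rcs = {
        1: {(0, 0), (0, 1), (1, 0), (1, 1)},
        2: {(5, 5), (5, 6)},
    }
    pred_rcs = {
        10: {(0, 0), (0, 1), (1, 1)},  # strong overlap with truth 1
        11: {(9, 9)},                  # overlaps nothing -> false positive
    }

    TP = FP = 0
    unused = set(true_rcs)
    for plabel, pset in pred_rcs.items():
        best_iou, best = -np.inf, None
        for tlabel in unused:
            tset = true_rcs[tlabel]
            isect = len(pset & tset)
            iou = isect / (len(pset) + len(tset) - isect)
            if iou > MIN_IOU and iou > best_iou:
                best_iou, best = iou, tlabel
        if best is not None:
            unused.remove(best)
            TP += 1
        else:
            FP += 1

    FN = len(unused)
    precision = TP / (TP + FP) if TP else 0
    recall = TP / (TP + FN) if TP else 0
    f_score = (2 * precision * recall / (precision + recall)
               if precision and recall else 0)
    print(TP, FP, FN, round(f_score, 3))  # 1 1 1 0.5
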