def _cm_breaking(infr, cm_list=None, review_cfg={}):
    """
    >>> review_cfg = {}
    """
    if cm_list is None:
        cm_list = infr.cm_list
    ranks_top = review_cfg.get('ranks_top', None)
    ranks_bot = review_cfg.get('ranks_bot', None)

    # Construct K-broken graph
    edges = []

    if ranks_bot is None:
        ranks_bot = 0

    for count, cm in enumerate(cm_list):
        score_list = cm.annot_score_list
        rank_list = ub.argsort(score_list)[::-1]
        sortx = ub.argsort(rank_list)

        top_sortx = sortx[:ranks_top]
        bot_sortx = sortx[len(sortx) - ranks_bot:]
        short_sortx = list(ub.unique(top_sortx + bot_sortx))

        daid_list = list(ub.take(cm.daid_list, short_sortx))
        for daid in daid_list:
            u, v = (cm.qaid, daid)
            if v < u:
                u, v = v, u
            edges.append((u, v))
    return edges
def suggest_spelling_correction(name, all_names, top=10):
    import xdev
    distances = xdev.edit_distance(name, all_names)
    idxs = ub.argsort(distances)[0:top]
    candidates = list(ub.take(all_names, idxs))
    print('did you mean one of: {}?'.format(ub.repr2(candidates, nl=1)))
    return candidates
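# Hedged usage sketch (not part of the original source); assumes ubelt and
# xdev are installed. The vocabulary below is made up for illustration.
# >>> vocabulary = ['argsort', 'argmax', 'argmin', 'group_items']
# >>> suggest_spelling_correction('argsrt', vocabulary, top=2)
# prints the "did you mean one of" message and returns the closest matches,
# here ['argsort', 'argmax'] (ties keep input order because the sort is stable)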
def _build_index(self):
    """ construct lookup tables """
    # Most of the categories should have been given integer ids
    max_id = max(
        it.chain([0], nx.get_node_attributes(self.graph, 'id').values()))

    # Fill in id-values for any node that doesn't have one
    node_to_id = {}
    for node, attrs in sorted(self.graph.nodes.items()):
        node_to_id[node] = attrs.get('id', max_id + 1)
        max_id = max(max_id, node_to_id[node])
    id_to_node = ub.invert_dict(node_to_id)

    # Compress ids into a flat index space (sorted by node ids)
    idx_to_node = ub.argsort(node_to_id)
    node_to_idx = {node: idx for idx, node in enumerate(idx_to_node)}

    # Find the sets of nodes that need to be softmax-ed together
    node_groups = list(traverse_siblings(self.graph))
    idx_groups = [
        sorted([node_to_idx[n] for n in group]) for group in node_groups
    ]

    # Set instance attributes
    self.id_to_node = id_to_node
    self.node_to_id = node_to_id
    self.idx_to_node = idx_to_node
    self.node_to_idx = node_to_idx
    self.idx_groups = idx_groups
def from_data(xpu, item, **kwargs):
    """
    Creates an XPU to represent the processing device a Tensor or Variable
    is on

    Example:
        >>> xpu = XPU.from_data(torch.randn(3))
        >>> assert not xpu.is_gpu()
        >>> if torch.cuda.is_available():
        >>>     xpu = XPU.from_data(torch.randn(3).cuda())
        >>>     assert xpu.is_gpu()
        >>>     for i in range(torch.cuda.device_count()):
        >>>         xpu = XPU.from_data(torch.randn(3).cuda(i))
        >>>         assert xpu.is_gpu()
        >>>         assert xpu.main_device == i
    """
    if hasattr(item, 'is_cuda'):
        if item.is_cuda:
            return XPU(item.get_device())
        else:
            return XPU(None)
    elif hasattr(item, 'state_dict'):
        state_dict = item.state_dict()
        hist = ub.dict_hist(v.get_device() if v.is_cuda else None
                            for v in state_dict.values())
        device_num = ub.argsort(hist)[-1]
        return XPU(device_num)
    else:
        raise TypeError(type(item))
def get_summary(self, profile_block_list, maxlines=20):
    """
    Args:
        profile_block_list (List[str]):
        maxlines (int):

    Returns:
        str:

    References:
        https://github.com/rkern/line_profiler
    """
    import ubelt as ub
    time_list = [self.get_block_totaltime(block) for block in profile_block_list]
    time_list = [time if time is not None else -1 for time in time_list]

    @ub.memoize
    def readlines(fpath):
        return open(fpath, 'r').readlines()

    blockid_list = [self.get_block_id(block, readlines=readlines)
                    for block in profile_block_list]
    sortx = ub.argsort(time_list)
    sorted_time_list = list(ub.take(time_list, sortx))
    sorted_blockid_list = list(ub.take(blockid_list, sortx))

    aligned_blockid_list = _align_lines(sorted_blockid_list, ':')
    summary_lines = [('%6.2f seconds - ' % time) + line
                     for time, line in zip(sorted_time_list, aligned_blockid_list)]

    summary_text = '\n'.join(summary_lines[-maxlines:])
    return summary_text
def _BROKEN_rank_epochs(monitor):
    """
    FIXME: broken - implement better rank aggregation with custom weights

    Example:
        >>> monitor = demodata_monitor()
        >>> monitor._BROKEN_rank_epochs()
    """
    rankings = {}
    for key, value in monitor.best_epochs(smooth=False).items():
        rankings[key + '_raw'] = value

    for key, value in monitor.best_epochs(smooth=True).items():
        rankings[key + '_smooth'] = value

    # borda-like weighted rank aggregation.
    # probably could do something better.
    epoch_to_weight = ub.ddict(lambda: 0)
    for key, ranking in rankings.items():
        # weights = np.linspace(0, 1, num=len(ranking))[::-1]
        weights = np.logspace(0, 2, num=len(ranking))[::-1] / 100
        for epoch, w in zip(ranking, weights):
            epoch_to_weight[epoch] += w

    agg_ranking = ub.argsort(epoch_to_weight)[::-1]
    return agg_ranking
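# Hedged illustration (not part of the original source) of the final
# aggregation step above: ub.argsort on the weight dict returns epochs from
# lowest to highest accumulated weight, so reversing it puts the best first.
# >>> epoch_to_weight = {0: 0.2, 3: 1.7, 7: 0.9}
# >>> ub.argsort(epoch_to_weight)[::-1]  # -> [3, 7, 0]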
def find_unused_gpu(min_memory=0):
    """
    Finds GPU with the lowest memory usage by parsing output of nvidia-smi

    Args:
        min_memory (int): disregards GPUs with fewer than `min_memory` free MB

    Returns:
        int or None: gpu num if a match is found otherwise None

    CommandLine:
        python -c "from netharn import device; print(device.find_unused_gpu(300))"

    Example:
        >>> if torch.cuda.is_available():
        >>>     item = find_unused_gpu()
        >>>     assert item is None or isinstance(item, int)
    """
    gpus = gpu_info()
    if not gpus:
        return None
    gpu_avail_mem = {n: gpu['mem_avail'] for n, gpu in gpus.items()}
    usage_order = ub.argsort(gpu_avail_mem)
    gpu_num = usage_order[-1]
    if gpu_avail_mem[gpu_num] < min_memory:
        return None
    else:
        return gpu_num
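# Hedged illustration (not from the original source) of the selection logic
# above: ub.argsort on a dict of free memory returns GPU ids ordered from
# least to most available, so the last entry is the emptiest GPU.
# >>> gpu_avail_mem = {0: 1500, 1: 11000, 2: 300}
# >>> ub.argsort(gpu_avail_mem)      # -> [2, 0, 1]
# >>> ub.argsort(gpu_avail_mem)[-1]  # -> 1 (the GPU with the most free MB)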
def get_summary(self, profile_block_list, maxlines=20):
    """
    References:
        https://github.com/rkern/line_profiler
    """
    time_list = [self.get_block_totaltime(block) for block in profile_block_list]
    time_list = [time if time is not None else -1 for time in time_list]
    blockid_list = [self.get_block_id(block) for block in profile_block_list]
    sortx = ub.argsort(time_list)
    sorted_time_list = list(ub.take(time_list, sortx))
    sorted_blockid_list = list(ub.take(blockid_list, sortx))

    import utool as ut
    aligned_blockid_list = ut.util_str.align_lines(sorted_blockid_list, ':')
    summary_lines = [('%6.2f seconds - ' % time) + line
                     for time, line in zip(sorted_time_list, aligned_blockid_list)]
    # summary_header = ut.codeblock(
    #     '''
    #     CLEANED PROFILE OUTPUT
    #     The Pystone timings are not from kernprof, so they may include kernprof
    #     overhead, whereas kernprof timings do not (unless the line being
    #     profiled is also decorated with kernprof)
    #     The kernprof times are reported in Timer Units
    #     ''')
    # summary_lines_ = ut.listclip(summary_lines, maxlines, fromback=True)
    summary_text = '\n'.join(summary_lines[-maxlines:])
    return summary_text
def fix_conference_places(bibman):
    pubman = constants_tex_fixes.PubManager()

    needed = set()

    for entry in bibman.cleaned.values():
        if entry['pub_type'] == 'conference':
            accro, year = (entry['pub_accro'], entry['year'])
            pub = pubman.find(accro)
            if pub.places is None or int(year) not in pub.places:
                needed.add((accro, year))
            else:
                place = pub.places[int(year)]
                print('place = {!r}'.format(place))
                entry['address'] = place

    if needed:
        needed = list(needed)
        used_years = ub.group_items(needed, ut.take_column(needed, 0))
        for k, v in list(used_years.items()):
            used_years[k] = sorted(v)

        sortby = ub.map_vals(lambda vs: (len(vs), max(e[1] for e in vs)), used_years)
        used_years = ut.order_dict_by(used_years, ub.argsort(sortby))
        print('NEED CONFERENCE LOCATIONS')
        print(ub.repr2(used_years, nl=2))
def _sort_itemstrs(items, itemstrs):
    """
    Equivalent to `sorted(items)` except if `items` are unorderable, then
    string values are used to define an ordering.
    """
    # First try to sort items by their normal values
    # If that doesn't work, then sort by their string values
    import ubelt as ub
    try:
        # Set ordering is not unique. Sort by string values instead.
        if _peek_isinstance(items, (set, frozenset)):
            raise TypeError
        sortx = ub.argsort(items)
    except TypeError:
        sortx = ub.argsort(itemstrs)
    itemstrs = [itemstrs[x] for x in sortx]
    return itemstrs
def _convert_dict(data):
    try:
        ordered_ = sorted(data.items())
        # what raises a TypeError differs between Python 2 and 3
    except TypeError:
        import ubelt as ub
        sortx = ub.argsort(data, key=str)
        ordered_ = [(k, data[k]) for k in sortx]
    hashable = b''.join(_hashable_sequence(ordered_, extensions=self))
    prefix = b'DICT'
    return prefix, hashable
def bench_closures():
    """
    Is it faster to use a closure or pass in the variables explicitly?
    """
    import ubelt as ub
    import timerit
    import numpy as np

    # Test a nested func with vs without a closure
    def rand_complex(*shape):
        # use the builtin complex type (np.complex was removed in newer numpy)
        real = np.random.rand(*shape).astype(complex)
        imag = np.random.rand(*shape).astype(complex) * 1j
        mat = real + imag
        return mat

    s = int(ub.argval('--s', default='1'))
    mat1 = rand_complex(s, s)
    mat2 = rand_complex(s, s)
    N = 1000
    offset = 100

    def nested_closure():
        mat3 = mat1 @ mat2
        for i in range(N):
            mat3 += i + offset

    def nested_explicit(mat1, mat2, N, offset):
        mat3 = mat1 @ mat2
        for i in range(N):
            mat3 += i + offset

    ti = timerit.Timerit(int(2**11), bestof=int(2**8),
                         verbose=int(ub.argval('--verbose', default='1')))

    for timer in ti.reset('nested_explicit'):
        with timer:
            nested_explicit(mat1, mat2, N, offset)

    for timer in ti.reset('nested_closure'):
        with timer:
            nested_closure()

    print('rankings = {}'.format(ub.repr2(ti.rankings, precision=9, nl=2)))
    print('consistency = {}'.format(ub.repr2(ti.consistency, precision=9, nl=2)))

    positions = ub.ddict(list)
    for m1, v1 in ti.rankings.items():
        for pos, label in enumerate(ub.argsort(v1), start=0):
            positions[label].append(pos)
    average_position = ub.map_vals(lambda x: sum(x) / len(x), positions)
    print('average_position = {}'.format(ub.repr2(average_position)))
def _convert_set(data):
    try:
        # what raises a TypeError differs between Python 2 and 3
        ordered_ = sorted(data)
    except TypeError:
        import ubelt as ub
        data_ = list(data)
        sortx = ub.argsort(data_, key=str)
        ordered_ = [data_[k] for k in sortx]
    hashable = b''.join(_hashable_sequence(ordered_, extensions=self))
    prefix = b'SET'
    return prefix, hashable
def find_unused_gpu(min_memory=0):
    """
    Finds GPU with the lowest memory usage by parsing output of nvidia-smi

    Args:
        min_memory (int): disregards GPUs with fewer than `min_memory` free MB

    Returns:
        int or None: gpu num if a match is found otherwise None

    CommandLine:
        python -c "from netharn import device; print(device.find_unused_gpu(300))"
        CUDA_VISIBLE_DEVICES=1; python -c "from netharn import device; print(device.find_unused_gpu(300))"

    Example:
        >>> if torch.cuda.is_available():
        >>>     item = find_unused_gpu()
        >>>     assert item is None or isinstance(item, int)
    """
    # Notes on slurm:
    # If we are running in slurm, then we should be able to see these
    # environment vars
    # SLURM_STEP_GPUS
    # GPU_DEVICE_ORDINAL
    # Also respect CUDA_VISIBLE_DEVICES
    try:
        gpus = gpu_info()
    except NvidiaSMIError:
        gpus = None

    if not gpus:
        return None

    # Order GPUs by most available memory
    # gpu_avail_mem = {n: -gpu['mem_avail'] for n, gpu in gpus.items()}

    # Order GPUs by fewest compute processes, and then by available memory
    gpu_avail_mem = {
        n: (gpu['num_compute_procs'], -gpu['mem_avail'])
        for n, gpu in gpus.items()
    }
    ranked_order = ub.argsort(gpu_avail_mem)

    for gpu_num in ranked_order:
        gpu = gpus[gpu_num]
        if gpu['mem_avail'] >= min_memory:
            return gpu_num
    return None
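# Hedged illustration (not from the original source): with tuple values,
# ub.argsort orders first by number of compute processes and then by negated
# available memory, so the first entry is the most idle GPU.
# >>> gpu_avail_mem = {0: (2, -1500), 1: (0, -11000), 2: (0, -300)}
# >>> ub.argsort(gpu_avail_mem)  # -> [1, 2, 0]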
def images_with_keypoints():
    # NOTE: `merged` is a coco-style dataset object presumably captured from
    # the enclosing scope; it is not defined in this snippet.
    keypoint_gids = set()
    for aid, ann in merged.anns.items():
        if ann['roi_shape'] == 'keypoints':
            keypoint_gids.add(ann['image_id'])

    relevant = ub.dict_subset(merged.gid_to_aids, keypoint_gids)
    relevant = {
        gid: [a for a in aids if merged.anns[a]['roi_shape'] == 'keypoints']
        for gid, aids in relevant.items()
    }

    gid_list = ub.argsort(ub.map_vals(len, relevant))[::-1]
    return gid_list
def _convert_dict(data):
    try:
        ordered_ = sorted(data.items())
        # what raises a TypeError differs between Python 2 and 3
    except TypeError:
        import ubelt as ub
        sortx = ub.argsort(data, key=str)
        ordered_ = [(k, data[k]) for k in sortx]
    # See: [util_hash.Note.1]
    hashable = b''.join(
        _hashable_sequence(
            ordered_, extensions=self,
            types=_COMPATIBLE_HASHABLE_SEQUENCE_TYPES_DEFAULT))
    prefix = b'DICT'
    return prefix, hashable
def find_unused_gpu(min_memory=0):
    """
    Finds GPU with the lowest memory usage by parsing output of nvidia-smi

    python -c "from pysseg.util import gpu_util; print(gpu_util.find_unused_gpu())"
    """
    gpus = gpu_info()
    if gpus is None:
        return None
    gpu_avail_mem = {n: gpu['mem_avail'] for n, gpu in gpus.items()}
    usage_order = ub.argsort(gpu_avail_mem)
    gpu_num = usage_order[-1]
    if gpu_avail_mem[gpu_num] < min_memory:
        return None
    else:
        return gpu_num
def _convert_set(data):
    try:
        # what raises a TypeError differs between Python 2 and 3
        ordered_ = sorted(data)
    except TypeError:
        import ubelt as ub
        data_ = list(data)
        sortx = ub.argsort(data_, key=str)
        ordered_ = [data_[k] for k in sortx]
    # See: [util_hash.Note.1]
    hashable = b''.join(
        _hashable_sequence(
            ordered_, extensions=self,
            types=_COMPATIBLE_HASHABLE_SEQUENCE_TYPES_DEFAULT))
    prefix = b'SET'
    return prefix, hashable
def read_tensorboard_scalars(train_dpath, verbose=1, cache=1):
    """
    Reads all tensorboard scalar events in a directory.
    Caches them because reading events of interest from protobuf can be slow.
    """
    import glob
    from os.path import join
    try:
        from tensorboard.backend.event_processing import event_accumulator
    except ImportError:
        raise ImportError('tensorboard is not installed')
    event_paths = sorted(glob.glob(join(train_dpath, 'events.out.tfevents*')))
    # make a hash so we will re-read if we need to
    cfgstr = ub.hash_data(list(map(ub.hash_file, event_paths)))
    # cfgstr = ub.hash_data(list(map(basename, event_paths)))
    cacher = ub.Cacher('tb_scalars', enabled=cache,
                       dpath=ub.ensuredir((train_dpath, '_cache')),
                       cfgstr=cfgstr)
    datas = cacher.tryload()
    if datas is None:
        datas = {}
        for p in ub.ProgIter(list(reversed(event_paths)),
                             desc='read tensorboard', enabled=verbose):
            ea = event_accumulator.EventAccumulator(p)
            ea.Reload()
            for key in ea.scalars.Keys():
                if key not in datas:
                    datas[key] = {'xdata': [], 'ydata': [], 'wall': []}
                subdatas = datas[key]
                events = ea.scalars.Items(key)
                for e in events:
                    subdatas['xdata'].append(int(e.step))
                    subdatas['ydata'].append(float(e.value))
                    subdatas['wall'].append(float(e.wall_time))

        # Order all information by its wall time
        for key, subdatas in datas.items():
            sortx = ub.argsort(subdatas['wall'])
            for d, vals in subdatas.items():
                subdatas[d] = list(ub.take(vals, sortx))
        cacher.save(datas)
    return datas
def main():
    import ubelt as ub
    from ubelt import util_list
    from ubelt.util_list import take
    import random
    from math import e

    # Data
    N = 100
    array = [random.random() for _ in range(N)]
    indices = [random.randint(0, N - 1) for _ in range(int(N // e))]

    ti = ub.Timerit(2 ** 11, bestof=2 ** 8, verbose=1)

    for timer in ti.reset('take'):
        with timer:
            list(take(array, indices))

    for timer in ti.reset('util_list.take'):
        with timer:
            list(util_list.take(array, indices))

    for timer in ti.reset('ub.take'):
        with timer:
            list(ub.take(array, indices))

    print('---')
    # import pandas as pd
    # df = pd.DataFrame(rankings)
    # print('df =\n{}'.format(df))

    print('rankings = {}'.format(ub.repr2(ti.rankings, precision=9, nl=2)))
    print('consistency = {}'.format(ub.repr2(ti.consistency, precision=9, nl=2)))

    positions = ub.ddict(list)
    for m1, v1 in ti.rankings.items():
        for pos, label in enumerate(ub.argsort(v1), start=0):
            positions[label].append(pos)
    average_position = ub.map_vals(lambda x: sum(x) / len(x), positions)
    print('average_position = {}'.format(ub.repr2(average_position)))
def best_epochs(monitor):
    rankings = {}

    def _rank(key, metrics, type='min'):
        values = [m[key] for m in metrics]
        sortx = np.argsort(values)
        if type == 'max':
            sortx = np.argsort(values)[::-1]
        elif type == 'min':
            sortx = np.argsort(values)
        else:
            raise KeyError(type)
        ranked_epochs = np.array(monitor.epochs)[sortx]
        return ranked_epochs

    for key in monitor.min_keys:
        rankings[key + '_raw'] = _rank(key, monitor.raw_metrics, 'min')
        rankings[key + '_smooth'] = _rank(key, monitor.smooth_metrics, 'min')

    for key in monitor.max_keys:
        rankings[key + '_raw'] = _rank(key, monitor.raw_metrics, 'max')
        rankings[key + '_smooth'] = _rank(key, monitor.smooth_metrics, 'max')

    for key in monitor.max_keys:
        values = [m[key] for m in monitor.raw_metrics]
        sortx = np.argsort(values)[::-1]
        ranked_epochs = np.array(monitor.epochs)[sortx]
        rankings[key] = ranked_epochs

    # borda-like weighted rank aggregation.
    # probably could do something better.
    epoch_to_weight = ub.ddict(lambda: 0)
    for key, ranking in rankings.items():
        # weights = np.linspace(0, 1, num=len(ranking))[::-1]
        weights = np.logspace(0, 2, num=len(ranking))[::-1] / 100
        for epoch, w in zip(ranking, weights):
            epoch_to_weight[epoch] += w

    agg_ranking = ub.argsort(epoch_to_weight)[::-1]
    return agg_ranking
def sort_entries(bibman):
    def freq_group(items, groupids):
        groups = ub.group_items(items, groupids)
        hist = ub.map_vals(len, groups)
        for k in ub.argsort(hist):
            yield groups[k]

    high_level_alias = {
        'incollection': 'book',
        'conference': 'confjourn',
        'journal': 'confjourn',
        'online-journal': 'confjourn',
    }
    sorted_entries = []
    entries = list(bibman.cleaned.values())
    groups = [
        high_level_alias.get(entry['pub_type'], entry['pub_type'])
        for entry in entries
    ]
    entry_groups = freq_group(entries, groups)
    for group in entry_groups:
        subids = [entry['ENTRYTYPE'] for entry in group]
        for subgroup in freq_group(group, subids):
            subsubids = [entry['pub_full'] for entry in subgroup]

            # Group publications, and then sort conferences by max date
            pub_groups = []
            pub_maxdates = []
            for ssg in freq_group(subgroup, subsubids):
                sssid = [(entry['date']) for entry in ssg]
                ssg2 = list(ub.take(ssg, ub.argsort(sssid)))
                pub_groups.append(ssg2)
                pub_maxdates.append(ssg2[-1]['date'])
            subgroup2 = list(
                ub.flatten(ut.sortedby2(pub_groups, pub_maxdates)))
            sorted_entries.extend(subgroup2)

    new_entries = ub.odict([(e['ID'], e) for e in sorted_entries])
    [e['pub_type'] for e in sorted_entries]
    bibman.cleaned = new_entries
def benchmark_hash_file():
    """
    CommandLine:
        python ~/code/ubelt/dev/bench_hash.py --show
    """
    import ubelt as ub
    import random

    # dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp'))
    dpath = ub.ensuredir(ub.expandpath('$HOME/tmp'))

    rng = random.Random(0)
    # Create a pool of random chunks of data
    chunksize = int(2 ** 20)
    pool_size = 8
    part_pool = [_random_data(rng, chunksize) for _ in range(pool_size)]

    # ITEM = 'JUST A STRING' * 100
    HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3']
    scales = list(range(5, 10))
    import os

    results = ub.AutoDict()
    # Using json is faster or at least as fast in most cases
    # xxhash is also significantly faster than sha512
    ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms')
    for s in ub.ProgIter(scales, desc='benchmark', verbose=3):
        N = 2 ** s
        print(' --- s={s}, N={N} --- '.format(s=s, N=N))
        # Write a big file
        size_pool = [N]
        fpath = _write_random_file(dpath, part_pool, size_pool, rng)

        megabytes = os.stat(fpath).st_size / (2 ** 20)
        print('megabytes = {!r}'.format(megabytes))

        for hasher in HASHERS:
            for timer in ti.reset(hasher):
                ub.hash_file(fpath, hasher=hasher)
            results[hasher].update({N: ti.mean()})
        col = {h: results[h][N] for h in HASHERS}
        sortx = ub.argsort(col)
        ranking = ub.dict_subset(col, sortx)
        print('walltime: ' + ub.repr2(ranking, precision=9, nl=0))
        best = next(iter(ranking))
        # pairs = list(ub.iter_window( 2))
        pairs = [(k, best) for k in ranking]
        ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs]
        nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs]
        relratios = ub.odict(zip(nicekeys, ratios))
        print('speedup: ' + ub.repr2(relratios, precision=4, nl=0))
    # xdoc +REQUIRES(--show)
    # import pytest
    # pytest.skip()

    import pandas as pd
    df = pd.DataFrame.from_dict(results)
    df.columns.name = 'hasher'
    df.index.name = 'N'
    ratios = df.copy().drop(columns=df.columns)
    for k1, k2 in [('sha512', 'xxh64'), ('sha1', 'xxh64'),
                   ('xxh32', 'xxh64'), ('blake3', 'xxh64')]:
        ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2]
    print()
    print('Seconds per iteration')
    print(df.to_string(float_format='%.9f'))
    print()
    print('Ratios of seconds')
    print(ratios.to_string(float_format='%.2f'))
    print()
    print('Average Ratio (over all N)')
    print(ratios.mean().sort_values())
    if ub.argflag('--show'):
        import kwplot
        kwplot.autompl()
        xdata = sorted(ub.peek(results.values()).keys())
        ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results)
        kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds')
        kwplot.show_if_requested()
def group_indices(idx_to_groupid, assume_sorted=False):
    """
    Find unique items and the indices at which they appear in an array.

    A common use case of this function is when you have a list of objects
    (often numeric but sometimes not) and an array of "group-ids"
    corresponding to that list of objects.

    Using this function will return a list of indices that can be used in
    conjunction with :func:`apply_grouping` to group the elements. This is
    most useful when you have many lists (think column-major data)
    corresponding to the group-ids.

    In cases where there is only one list of objects or knowing the indices
    doesn't matter, then consider using :func:`group_items` instead.

    Args:
        idx_to_groupid (ndarray): The input array, where each item is
            interpreted as a group id. For the fastest runtime, the input
            array must be numeric (ideally with integer types). If the type
            is non-numeric then the less efficient :func:`ubelt.group_items`
            is used.

        assume_sorted (bool, default=False): If the input array is sorted,
            then setting this to True will avoid an unnecessary sorting
            operation and improve efficiency.

    Returns:
        Tuple[ndarray, List[ndarrays]]: (keys, groupxs) -
            keys (ndarray): The unique elements of the input array in order
            groupxs (List[ndarray]): Corresponding list of indexes. The i-th
                item is an array indicating the indices where the item
                ``key[i]`` appeared in the input array.

    Example:
        >>> # xdoctest: +IGNORE_WHITESPACE
        >>> import ubelt as ub
        >>> idx_to_groupid = np.array([2, 1, 2, 1, 2, 1, 2, 3, 3, 3, 3])
        >>> (keys, groupxs) = group_indices(idx_to_groupid)
        >>> print(ub.repr2(keys, with_dtype=False))
        >>> print(ub.repr2(groupxs, with_dtype=False))
        np.array([1, 2, 3])
        [
            np.array([1, 3, 5]),
            np.array([0, 2, 4, 6]),
            np.array([ 7,  8,  9, 10]),
        ]

    Example:
        >>> # xdoctest: +IGNORE_WHITESPACE
        >>> import ubelt as ub
        >>> idx_to_groupid = np.array([[ 24], [ 129], [ 659], [ 659], [ 24],
        ...                            [659], [ 659], [ 822], [ 659], [ 659], [24]])
        >>> # 2d arrays must be flattened before coming into this function so
        >>> # information is on the last axis
        >>> (keys, groupxs) = group_indices(idx_to_groupid.T[0])
        >>> print(ub.repr2(keys, with_dtype=False))
        >>> print(ub.repr2(groupxs, with_dtype=False))
        np.array([ 24, 129, 659, 822])
        [
            np.array([ 0,  4, 10]),
            np.array([1]),
            np.array([2, 3, 5, 6, 8, 9]),
            np.array([7]),
        ]

    Example:
        >>> # xdoctest: +IGNORE_WHITESPACE
        >>> import ubelt as ub
        >>> idx_to_groupid = np.array([True, True, False, True, False, False, True])
        >>> (keys, groupxs) = group_indices(idx_to_groupid)
        >>> print(ub.repr2(keys, with_dtype=False))
        >>> print(ub.repr2(groupxs, with_dtype=False))
        np.array([False,  True])
        [
            np.array([2, 4, 5]),
            np.array([0, 1, 3, 6]),
        ]

    Example:
        >>> # xdoctest: +IGNORE_WHITESPACE
        >>> import ubelt as ub
        >>> idx_to_groupid = [('a', 'b'), ('d', 'b'), ('a', 'b'), ('a', 'b')]
        >>> (keys, groupxs) = group_indices(idx_to_groupid)
        >>> print(ub.repr2(keys, with_dtype=False))
        >>> print(ub.repr2(groupxs, with_dtype=False))
        [
            ('a', 'b'),
            ('d', 'b'),
        ]
        [
            np.array([0, 2, 3]),
            np.array([1]),
        ]
    """
    _idx_to_groupid_orig = idx_to_groupid
    idx_to_groupid = np.array(idx_to_groupid, copy=False)
    _n_item = idx_to_groupid.size
    _dtype = idx_to_groupid.dtype
    _kind = _dtype.kind
    if _kind == 'U' or _kind == 'O':
        # fallback to slower algorithm for non-numeric data
        group = ub.group_items(range(_n_item), _idx_to_groupid_orig)
        try:
            # attempt to return values in a consistent order
            sortx = ub.argsort(list(group.keys()))
            keys = list(ub.take(list(group.keys()), sortx))
            groupxs = list(ub.take(list(map(np.array, group.values())), sortx))
        except Exception:
            keys = list(group.keys())
            groupxs = list(map(np.array, group.values()))
        return keys, groupxs

    # Sort items and idx_to_groupid by groupid
    if assume_sorted:
        sortx = np.arange(len(idx_to_groupid))
        groupids_sorted = idx_to_groupid
    else:
        sortx = idx_to_groupid.argsort()
        groupids_sorted = idx_to_groupid.take(sortx)

    if _kind == 'b':
        # Ensure bools are internally cast to integers
        # However, be sure that the groups are returned as the original dtype
        _groupids = groupids_sorted.astype(np.int8)
    else:
        _groupids = groupids_sorted

    # Find the boundaries between groups
    diff = np.ones(_n_item + 1, _groupids.dtype)
    np.subtract(_groupids[1:], _groupids[:-1], out=diff[1:_n_item])
    idxs = np.flatnonzero(diff)
    # Groups are between bounding indexes
    groupxs = [sortx[lx:rx] for lx, rx in zip(idxs, idxs[1:])]  # 34.5%
    # Unique group keys
    keys = groupids_sorted[idxs[:-1]]
    return keys, groupxs
def 数组_排序索引(indexable, key=None, reverse=False):
    """ Thin wrapper around :func:`ubelt.argsort` (the name means "array argsort index"). """
    return ub.argsort(indexable, key, reverse)
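# Minimal sketch (not from the original source) of what the wrapper above
# returns; assumes only that ubelt is importable.
# >>> import ubelt as ub
# >>> ub.argsort([3, 1, 2])                 # indices that sort the list -> [1, 2, 0]
# >>> ub.argsort({'a': 9, 'b': 1, 'c': 4})  # dict input: keys ordered by value -> ['b', 'c', 'a']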
def _precompute_class_weights(dset, mode='median-idf'):
    """
    Example:
        >>> # xdoctest: +REQUIRES(--download)
        >>> import sys, ubelt
        >>> sys.path.append(ubelt.expandpath('~/code/netharn/examples'))
        >>> from sseg_camvid import *  # NOQA
        >>> harn = setup_harn(0, workers=0, xpu='cpu').initialize()
        >>> dset = harn.datasets['train']
    """
    assert mode in ['median-idf', 'log-median-idf']

    total_freq = _cached_class_frequency(dset)

    def logb(arr, base):
        if base == 'e':
            return np.log(arr)
        elif base == 2:
            return np.log2(arr)
        elif base == 10:
            return np.log10(arr)
        else:
            out = np.log(arr)
            out /= np.log(base)
            return out

    _min, _max = np.percentile(total_freq, [5, 95])
    is_valid = (_min <= total_freq) & (total_freq <= _max)
    if np.any(is_valid):
        middle_value = np.median(total_freq[is_valid])
    else:
        middle_value = np.median(total_freq)

    # variant of median-inverse-frequency
    nonzero_freq = total_freq[total_freq != 0]
    if len(nonzero_freq):
        total_freq[total_freq == 0] = nonzero_freq.min() / 2

    if mode == 'median-idf':
        weights = (middle_value / total_freq)
        weights[~np.isfinite(weights)] = 1.0
    elif mode == 'log-median-idf':
        weights = (middle_value / total_freq)
        weights[~np.isfinite(weights)] = 1.0
        base = 2
        base = np.exp(1)
        weights = logb(weights + (base - 1), base)
        weights = np.maximum(weights, .1)
        weights = np.minimum(weights, 10)
    else:
        raise KeyError('mode = {!r}'.format(mode))

    weights = np.round(weights, 2)
    cname_to_weight = ub.dzip(dset.classes, weights)
    print('weights: ' + ub.repr2(cname_to_weight))

    if False:
        # Inspect the weights
        import kwplot
        kwplot.autoplt()
        cname_to_weight = ub.dzip(dset.classes, weights)
        cname_to_weight = ub.dict_subset(cname_to_weight, ub.argsort(cname_to_weight))
        kwplot.multi_plot(
            ydata=list(cname_to_weight.values()),
            kind='bar',
            xticklabels=list(cname_to_weight.keys()),
            xtick_rotation=90,
            fnum=2, doclf=True)

    return weights
def main():
    candidates = None
    mode = 'ultra'

    if mode == 'great':
        candidate_csv_text = ub.codeblock(
            '''
            registeel,LOCK_ON,FLASH_CANNON,FOCUS_BLAST,22,10,14,15
            stunfisk_galarian,MUD_SHOT,ROCK_SLIDE,EARTHQUAKE,25,11,14,14
            # altaria,DRAGON_BREATH,SKY_ATTACK,DRAGON_PULSE,26.5,14,12,13
            skarmory,AIR_SLASH,SKY_ATTACK,FLASH_CANNON,26,11,13,10
            azumarill,BUBBLE,ICE_BEAM,HYDRO_PUMP,38,12,15,13
            dewgong,ICE_SHARD,ICY_WIND,WATER_PULSE,26.5,15,08,15
            # umbreon,SNARL,FOUL_PLAY,LAST_RESORT,24.5,15,10,15
            # farfetchd_galarian,FURY_CUTTER,LEAF_BLADE,BRAVE_BIRD,33.5,12,15,15
            hypno,CONFUSION,SHADOW_BALL,THUNDER_PUNCH,25.5,13,15,14
            # hypno,CONFUSION,SHADOW_BALL,FOCUS_BLAST,25.5,13,15,14
            # machamp-shadow,COUNTER,ROCK_SLIDE,CROSS_CHOP,18,5,11,10
            victreebel_shadow-shadow,RAZOR_LEAF,LEAF_BLADE,FRUSTRATION,22.5,4,14,14
            ''')

        candidate_explicit = [
            Pokemon('medicham', ivs=[7, 15, 14], level=41.5),
            Pokemon('medicham', ivs=[7, 15, 14], level=43.0),
            Pokemon('medicham', ivs=[7, 15, 14]).maximize(1500),
            Pokemon('machamp', [1, 15, 6], cp=1493),
            Pokemon('altaria', [1, 11, 8], cp=1496),
            Pokemon('skarmory', [0, 15, 13], cp=1495),
            Pokemon('umbreon', [1, 8, 8], cp=1495),
            Pokemon('registeel', [10, 14, 15], cp=1487),
            Pokemon('stunfisk', [11, 14, 14], form='Galarian', cp=1498),
            Pokemon('cresselia', [7, 14, 8], cp=1493),
            Pokemon('vigoroth', [0, 10, 9], cp=1495),
            Pokemon('drifblim', [4, 14, 13], cp=1498),
            Pokemon('haunter', [6, 13, 15], cp=1498),
            Pokemon('mantine', [6, 13, 14], cp=1497),
            Pokemon('politoed', [3, 5, 13], cp=1493),
            Pokemon('charizard', [3, 15, 14], cp=1485),
            Pokemon('gengar', [5, 11, 14], cp=1483),
            Pokemon('mew', [15, 12, 11], cp=1470),
            Pokemon('dewgong', [15, 8, 15]).maximize(1500),
            Pokemon('azumarill', [12, 15, 13]).maximize(1500),
            Pokemon('hypno', [13, 15, 14]).maximize(1500),
        ]
        for cand in candidate_explicit:
            cand.populate_cp()

        stat_products = [cand.stat_product for cand in candidate_explicit]
        sortx = ub.argsort(stat_products)
        candidate_explicit = list(ub.take(candidate_explicit, sortx))
        stat_products = list(ub.take(stat_products, sortx))
        print('stat_products = {}'.format(ub.repr2(stat_products, nl=1)))
        print('candidate_explicit = {}'.format(ub.repr2(candidate_explicit, nl=1)))
        for cand in candidate_explicit:
            print('cand.adjusted = {}, {:.2f}, {}'.format(
                ub.repr2(cand.adjusted, nl=0, precision=2),
                cand.stat_product, cand))

    if mode == 'ultra':
        candidate_csv_text = ub.codeblock(
            '''
            cresselia,PSYCHO_CUT,MOONBLAST,FUTURE_SIGHT
            togekiss,CHARM,FLAMETHROWER,ANCIENT_POWER
            articuno,ICE_SHARD,ICY_WIND,HURRICANE
            swampert,MUD_SHOT,MUDDY_WATER,EARTHQUAKE
            venusaur,VINE_WHIP,FRENZY_PLANT,SLUDGE_BOMB
            ''')

        candidates = [
            Pokemon('Gengar', (7, 14, 14), cp=2500,
                    moves=['SHADOW_CLAW', 'SHADOW_PUNCH', 'SHADOW_BALL']),
            Pokemon('Togekiss', (15, 15, 14), cp=2469,
                    moves=['CHARM', 'FLAMETHROWER', 'AERIAL_ACE']),
            Pokemon('Venusaur', (15, 13, 13), cp=2482,
                    moves=['VINE_WHIP', 'FRENZY_PLANT', 'SLUDGE_BOMB']),
            Pokemon('Muk', (9, 7, 4), cp=2486, form='Alola',
                    moves=['SNARL', 'DARK_PULSE', 'SLUDGE_WAVE']),
            Pokemon('Swampert', (0, 2, 14), cp=2500,
                    moves=['WATER_GUN', 'HYDRO_CANNON', 'SLUDGE_WAVE']),
            Pokemon('Empoleon', (0, 10, 14), cp=2495,
                    moves=['WATERFALL', 'HYDRO_CANNON', 'DRILL_PECK']),
            Pokemon('sirfetch’d', (4, 11, 12), cp=2485, form='Galarian',
                    moves=['COUNTER', 'CLOSE_COMBAT', 'LEAF_BLADE']),
        ]
    # else:
    #     raise KeyError(mode)

    if candidates is None:
        candidates = []
        for line in candidate_csv_text.split('\n'):
            line = line.strip()
            if line.startswith('#'):
                continue
            if line:
                row = line.split(',')
                cand = Pokemon.from_pvpoke_row(row)
                candidates.append(cand)

    print(ub.repr2(api.learnable))

    if mode == 'ultra':
        base = 'https://pvpoke.com/team-builder/all/2500'
        base = 'https://pvpoke.com/team-builder/premier/2500'
    elif mode == 'great':
        base = 'https://pvpoke.com/team-builder/all/1500'
    sep = '%2C'

    import itertools as it
    print('candidates = {!r}'.format(candidates))
    for team in it.combinations(candidates, 3):
        # if not any('registeel' in p.name for p in team):
        #     continue
        # if not any('victree' in p.name for p in team):
        #     continue
        # if len(set(p.name for p in team)) != 3:
        #     continue
        suffix = sep.join([p.to_pvpoke_url() for p in team])
        url = base + '/' + suffix
        print(url)
def argparse(self, parser=None, special_options=False):
    """
    construct or update an argparse.ArgumentParser CLI parser

    Args:
        parser (None | argparse.ArgumentParser): if specified this
            parser is updated with options from this config.

        special_options (bool, default=False):
            adds special scriptconfig options, namely: --config, --dumps,
            and --dump.

    Returns:
        argparse.ArgumentParser : a new or updated argument parser

    CommandLine:
        xdoctest -m scriptconfig.config Config.argparse:0
        xdoctest -m scriptconfig.config Config.argparse:1

    TODO:
        A good CLI spec for lists might be

        # In the case where ``key`` ends with an ``=``, assume the list is
        # given as a comma separated string with optional square brackets
        # at each end.
        --key=[f]

        # In the case where ``key`` does not end with equals and we know
        # the value is supposed to be a list, then we consume arguments
        # until we hit the next one that starts with '--' (which means
        # that list items cannot start with -- but they can contain
        # commas)

    FIXME:
        * In the case where we have an nargs='+' action, and we specify
          the option with an `=`, and then we give position args after it
          there is no way to modify behavior of the action to just look at
          the data in the string without modifying the ArgumentParser
          itself. The action object has no control over it. For example
          `--foo=bar baz biz` will parse as `[baz, biz]` which is really
          not what we want. We may be able to overload ArgumentParser to
          fix this.

    Example:
        >>> # You can now make instances of this class
        >>> import scriptconfig
        >>> self = scriptconfig.Config.demo()
        >>> parser = self.argparse()
        >>> parser.print_help()
        >>> # xdoctest: +REQUIRES(PY3)
        >>> # Python2 argparse does a hard sys.exit instead of raise
        >>> ns, extra = parser.parse_known_args()

    Example:
        >>> # You can now make instances of this class
        >>> import scriptconfig as scfg
        >>> class MyConfig(scfg.Config):
        >>>     description = 'my CLI description'
        >>>     default = {
        >>>         'path1': scfg.Value(None, position=1, alias='src'),
        >>>         'path2': scfg.Value(None, position=2, alias='dst'),
        >>>         'dry': scfg.Value(False, isflag=True),
        >>>         'approx': scfg.Value(False, isflag=False, alias=['a1', 'a2']),
        >>>     }
        >>> self = MyConfig()
        >>> special_options = True
        >>> parser = None
        >>> parser = self.argparse(special_options=special_options)
        >>> parser.print_help()
        >>> self._read_argv(argv=['objection', '42', '--path1=overruled!'])
        >>> print('self = {!r}'.format(self))

    Ignore:
        >>> self._read_argv(argv=['hi','--path1=foobar'])
        >>> self._read_argv(argv=['hi', 'hello', '--path1=foobar'])
        >>> self._read_argv(argv=['hi', 'hello', '--path1=foobar', '--help'])
        >>> self._read_argv(argv=['--path1=foobar', '--path1=baz'])
        >>> print('self = {!r}'.format(self))
    """
    import argparse

    if parser is None:
        parserkw = self._parserkw()
        parser = argparse.ArgumentParser(**parserkw)

    # Use a custom action to mark which values were explicitly set on
    # the commandline
    parser._explicitly_given = set()

    parent = self

    class ParseAction(argparse.Action):
        def __init__(self, *args, **kwargs):
            super(ParseAction, self).__init__(*args, **kwargs)
            # With scriptconfig nothing should be required by default; all
            # positional arguments should have keyword arg variants. Setting
            # required=False here will prevent positional args from
            # erroring if they are not specified. I don't think there are
            # other side effects, but we should make sure that is actually
            # the case.
            self.required = False

            if self.type is None:
                # Is this the right place to put this?
                def _mytype(value):
                    key = self.dest
                    template = parent.default[key]
                    if not isinstance(template, Value):
                        # smartcast non-valued params from commandline
                        value = smartcast.smartcast(value)
                    else:
                        value = template.cast(value)
                    return value
                self.type = _mytype

            # print('self.type = {!r}'.format(self.type))

        def __call__(action, parser, namespace, values, option_string=None):
            # print('CALL action = {!r}'.format(action))
            # print('option_string = {!r}'.format(option_string))
            # print('values = {!r}'.format(values))

            if isinstance(values, list) and len(values):
                # We got a list of lists, which we hack into a flat list
                if isinstance(values[0], list):
                    import itertools as it
                    values = list(it.chain(*values))

            setattr(namespace, action.dest, values)
            parser._explicitly_given.add(action.dest)

    # IRC: this ensures each key has a real Value class
    _metadata = {
        key: self._data[key]
        for key, value in self._default.items()
        if isinstance(self._data[key], Value)
    }  # :type: Dict[str, Value]

    _positions = {k: v.position for k, v in _metadata.items()
                  if v.position is not None}
    if _positions:
        if ub.find_duplicates(_positions.values()):
            raise Exception('two values have the same position')
        _keyorder = ub.oset(ub.argsort(_positions))
        _keyorder |= (ub.oset(self._default) - _keyorder)
    else:
        _keyorder = list(self._default.keys())

    def _add_arg(parser, name, key, argkw, positional, isflag, isalias):
        _argkw = argkw.copy()

        if isalias:
            _argkw['help'] = 'alias of {}'.format(key)
            _argkw.pop('default', None)
            # flags cannot have flag aliases
            isflag = False

        elif positional:
            parser.add_argument(name, **_argkw)

        if isflag:
            # Can we support both flag and setitem methods of cli parsing?
            if not isinstance(_argkw.get('default', None), bool):
                raise ValueError('can only use isflag with bools')
            _argkw.pop('type', None)
            _argkw.pop('choices', None)
            _argkw.pop('action', None)
            _argkw.pop('nargs', None)
            _argkw['dest'] = key

            _argkw_true = _argkw.copy()
            _argkw_true['action'] = 'store_true'

            _argkw_false = _argkw.copy()
            _argkw_false['action'] = 'store_false'
            _argkw_false.pop('help', None)

            parser.add_argument('--' + name, **_argkw_true)
            parser.add_argument('--no-' + name, **_argkw_false)
        else:
            parser.add_argument('--' + name, **_argkw)

    mode = 1

    alias_registry = []
    for key, value in self._data.items():
        # key: str
        # value: Any | Value
        argkw = {}
        argkw['help'] = ''
        positional = None
        isflag = False
        if key in _metadata:
            # Use the metadata in the Value class to enhance argparse
            _value = _metadata[key]
            argkw.update(_value.parsekw)
            value = _value.value
            isflag = _value.isflag
            positional = _value.position
        else:
            _value = value if isinstance(value, Value) else None

        if not argkw['help']:
            argkw['help'] = '<undocumented>'

        argkw['default'] = value
        argkw['action'] = ParseAction

        name = key
        _add_arg(parser, name, key, argkw, positional, isflag, isalias=False)

        if _value is not None:
            if _value.alias:
                alts = _value.alias
                alts = alts if ub.iterable(alts) else [alts]
                for alias in alts:
                    tup = (alias, key, argkw)
                    alias_registry.append(tup)
                    if mode == 0:
                        name = alias
                        _add_arg(parser, name, key, argkw, positional,
                                 isflag, isalias=True)

    if mode == 1:
        for tup in alias_registry:
            (alias, key, argkw) = tup
            name = alias
            dest = key
            _add_arg(parser, name, dest, argkw, positional, isflag,
                     isalias=True)

    if special_options:
        parser.add_argument('--config', default=None, help=ub.codeblock(
            '''
            special scriptconfig option that accepts the path to an on-disk
            configuration file, and loads that into this {!r} object.
            ''').format(self.__class__.__name__))

        parser.add_argument('--dump', default=None, help=ub.codeblock(
            '''
            If specified, dump this config to disk.
            ''').format(self.__class__.__name__))

        parser.add_argument('--dumps', action='store_true', help=ub.codeblock(
            '''
            If specified, dump this config to stdout
            ''').format(self.__class__.__name__))

    return parser
def benchmark_hash_data():
    """
    CommandLine:
        python ~/code/ubelt/dev/bench_hash.py --convert=True --show
        python ~/code/ubelt/dev/bench_hash.py --convert=False --show
    """
    import ubelt as ub
    # ITEM = 'JUST A STRING' * 100
    ITEM = [0, 1, 'a', 'b', ['JUST A STRING'] * 4]
    HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3']
    scales = list(range(5, 13))
    results = ub.AutoDict()
    # Using json is faster or at least as fast in most cases
    # xxhash is also significantly faster than sha512
    # NOTE: compare against the lowercased string so the flag actually works
    convert = ub.argval('--convert', default='True').lower() == 'true'
    print('convert = {!r}'.format(convert))
    ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms')
    for s in ub.ProgIter(scales, desc='benchmark', verbose=3):
        N = 2 ** s
        print(' --- s={s}, N={N} --- '.format(s=s, N=N))
        data = [ITEM] * N
        for hasher in HASHERS:
            for timer in ti.reset(hasher):
                ub.hash_data(data, hasher=hasher, convert=convert)
            results[hasher].update({N: ti.mean()})
        col = {h: results[h][N] for h in HASHERS}
        sortx = ub.argsort(col)
        ranking = ub.dict_subset(col, sortx)
        print('walltime: ' + ub.repr2(ranking, precision=9, nl=0))
        best = next(iter(ranking))
        # pairs = list(ub.iter_window( 2))
        pairs = [(k, best) for k in ranking]
        ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs]
        nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs]
        relratios = ub.odict(zip(nicekeys, ratios))
        print('speedup: ' + ub.repr2(relratios, precision=4, nl=0))
    # xdoc +REQUIRES(--show)
    # import pytest
    # pytest.skip()

    import pandas as pd
    df = pd.DataFrame.from_dict(results)
    df.columns.name = 'hasher'
    df.index.name = 'N'
    ratios = df.copy().drop(columns=df.columns)
    for k1, k2 in [('sha512', 'xxh32'), ('sha1', 'xxh32'), ('xxh64', 'xxh32')]:
        ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2]
    print()
    print('Seconds per iteration')
    print(df.to_string(float_format='%.9f'))
    print()
    print('Ratios of seconds')
    print(ratios.to_string(float_format='%.2f'))
    print()
    print('Average Ratio (over all N)')
    print('convert = {!r}'.format(convert))
    print(ratios.mean().sort_values())
    if ub.argflag('--show'):
        import kwplot
        kwplot.autompl()
        xdata = sorted(ub.peek(results.values()).keys())
        ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results)
        kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds',
                          title='convert = {}'.format(convert))
        kwplot.show_if_requested()
def freq_group(items, groupids):
    groups = ub.group_items(items, groupids)
    hist = ub.map_vals(len, groups)
    for k in ub.argsort(hist):
        yield groups[k]
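# Hypothetical usage sketch (not part of the original source): the generator
# above yields groups ordered from least to most frequent group id.
# >>> items = ['a', 'b', 'c', 'd', 'e']
# >>> groupids = [0, 1, 1, 1, 0]
# >>> list(freq_group(items, groupids))
# [['a', 'e'], ['b', 'c', 'd']]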