def consolodate_duplicates(self):
    fnames = map(basename, self.rel_fpath_list)
    duplicate_map = ut.find_duplicate_items(fnames)
    groups = []
    for dupname, idxs in duplicate_map.items():
        uuids = self.get_prop('uuids', idxs)
        unique_uuids, groupxs = ut.group_indices(uuids)
        groups.extend(ut.apply_grouping(idxs, groupxs))
    multitons = [g for g in groups if len(g) > 1]
    # singletons = [g for g in groups if len(g) <= 1]
    ut.unflat_take(list(self.fpaths()), multitons)
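
# The method above leans on ut.find_duplicate_items / ut.group_indices /
# ut.apply_grouping. A minimal sketch of the same idea with plain-Python
# stand-ins (the paths and UUIDs below are made up): group duplicate basenames,
# then split each name-group by per-file UUID so only true duplicates share a bucket.
from collections import defaultdict

rel_fpaths = ['a/img1.jpg', 'b/img1.jpg', 'c/img2.jpg', 'd/img1.jpg']
uuids      = ['u1',         'u1',         'u2',         'u3']  # hypothetical

name_to_idxs = defaultdict(list)
for idx, fpath in enumerate(rel_fpaths):
    name_to_idxs[fpath.rsplit('/', 1)[-1]].append(idx)

groups = []
for name, idxs in name_to_idxs.items():
    if len(idxs) > 1:  # only names that occur more than once
        uuid_to_idxs = defaultdict(list)
        for idx in idxs:
            uuid_to_idxs[uuids[idx]].append(idx)
        groups.extend(uuid_to_idxs.values())

multitons = [g for g in groups if len(g) > 1]
print(multitons)  # -> [[0, 1]]: same basename and same uuid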
def get_match_results(depc, qaid_list, daid_list, score_list, config):
    """ converts table results into format for ipython notebook """
    #qaid_list, daid_list = request.get_parent_rowids()
    #score_list = request.score_list
    #config = request.config
    unique_qaids, groupxs = ut.group_indices(qaid_list)
    #grouped_qaids_list = ut.apply_grouping(qaid_list, groupxs)
    grouped_daids = ut.apply_grouping(daid_list, groupxs)
    grouped_scores = ut.apply_grouping(score_list, groupxs)

    ibs = depc.controller
    unique_qnids = ibs.get_annot_nids(unique_qaids)

    # FIXME: decision should not be part of the config for the one-vs-one
    # scores
    decision_func = getattr(np, config['decision'])
    _iter = zip(unique_qaids, unique_qnids, grouped_daids, grouped_scores)
    for qaid, qnid, daids, scores in _iter:
        dnids = ibs.get_annot_nids(daids)

        # Remove distance to self
        annot_scores = np.array(scores)
        daid_list_ = np.array(daids)
        dnid_list_ = np.array(dnids)

        is_valid = (daid_list_ != qaid)
        daid_list_ = daid_list_.compress(is_valid)
        dnid_list_ = dnid_list_.compress(is_valid)
        annot_scores = annot_scores.compress(is_valid)

        # Hacked in version of creating an annot match object
        match_result = ibeis.AnnotMatch()
        match_result.qaid = qaid
        match_result.qnid = qnid
        match_result.daid_list = daid_list_
        match_result.dnid_list = dnid_list_
        match_result._update_daid_index()
        match_result._update_unique_nid_index()

        grouped_annot_scores = vt.apply_grouping(annot_scores,
                                                 match_result.name_groupxs)
        name_scores = np.array([decision_func(dists)
                                for dists in grouped_annot_scores])
        match_result.set_cannonical_name_score(annot_scores, name_scores)
        yield match_result
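
# The core of get_match_results is grouping per-annotation scores by name and
# collapsing each group with a decision function (e.g. np.max). A minimal,
# self-contained numpy sketch of that step; all values below are made up.
import numpy as np

daid_scores = np.array([0.9, 0.2, 0.7, 0.4])   # hypothetical one-vs-one scores
dnids       = np.array([101, 102, 101, 103])   # name id of each database annot

decision_func = np.max                         # plays the role of config['decision']
unique_nids = np.unique(dnids)
name_scores = np.array([decision_func(daid_scores[dnids == nid])
                        for nid in unique_nids])
print(dict(zip(unique_nids.tolist(), name_scores.tolist())))
# -> {101: 0.9, 102: 0.2, 103: 0.4}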
def as_parts(self):
    if self.parts is not None:
        return self.parts
    text = self.as_text()
    top, header, mid, bot = split_tabular(text)

    colfmt = self._rectify_colfmt()
    if colfmt is not None:
        top = '\\begin{tabular}{%s}' % (colfmt,)

    if self.theadify:
        import textwrap
        width = self.theadify
        wrapper = textwrap.TextWrapper(width=width, break_long_words=False)

        header_lines = header.split('\n')
        new_lines = []
        for line in header_lines:
            line = line.rstrip('\\')
            headers = [h.strip() for h in line.split('&')]
            headers = ['\\\\'.join(wrapper.wrap(h)) for h in headers]
            headers = [h if h == '{}' else '\\thead{' + h + '}'
                       for h in headers]
            line = ' & '.join(headers) + '\\\\'
            new_lines.append(line)
        new_header = '\n'.join(new_lines)
        header = new_header

    if True:
        groupxs = self.groupxs
        # Put midlines between multi index levels
        if groupxs is None and isinstance(self._data, pd.DataFrame):
            index = self._data.index
            if len(index.names) == 2 and len(mid) == 1:
                groupxs = ut.group_indices(index.labels[0])[1]
                # part = '\n\multirow{%d}{*}{%s}\n' % (len(chunk), key,)
                # part += '\n'.join(['& ' + c for c in chunk])
        if groupxs is not None:
            bodylines = mid[0].split('\n')
            mid = ut.apply_grouping(bodylines, groupxs)

    parts = (top, header, mid, bot)
    return parts
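
# as_parts groups the tabular body rows by the outer level of a two-level
# MultiIndex so rules can be drawn between the groups. A small sketch of that
# grouping with pandas/numpy only; modern pandas exposes index.codes where the
# method above uses the older index.labels, and the table content is made up.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {'val': [1, 2, 3, 4]},
    index=pd.MultiIndex.from_tuples(
        [('a', 'x'), ('a', 'y'), ('b', 'x'), ('b', 'y')]),
)
outer_codes = np.asarray(df.index.codes[0])        # e.g. [0, 0, 1, 1]
bodylines = ['%s & %s & %d \\\\' % (lvl0, lvl1, v)
             for (lvl0, lvl1), v in zip(df.index, df['val'])]
grouped = [[bodylines[i] for i in np.flatnonzero(outer_codes == c)]
           for c in np.unique(outer_codes)]
print('\n\\midrule\n'.join('\n'.join(g) for g in grouped))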
def stratified_label_shuffle_split(y, labels, fractions, y_idx=None, rng=None):
    """
    modified from sklearn to make n splits instead of 2.
    Also enforces that labels are not broken into separate groups.

    Args:
        y (ndarray): labels
        labels (?):
        fractions (?):
        rng (RandomState): random number generator (default = None)

    Returns:
        ?: index_sets

    CommandLine:
        python -m ibeis_cnn.dataset stratified_label_shuffle_split --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis_cnn.dataset import *  # NOQA
        >>> y      = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> labels = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, 7, 7, 7, 7]
        >>> fractions = [.7, .3]
        >>> rng = np.random.RandomState(0)
        >>> index_sets = stratified_label_shuffle_split(y, labels, fractions, rng=rng)
    """
    rng = ut.ensure_rng(rng)
    #orig_y = y
    unique_labels, groupxs = ut.group_indices(labels)
    grouped_ys = ut.apply_grouping(y, groupxs)
    # Assign each group a probabilistic class
    unique_ys = [ys[rng.randint(0, len(ys))] for ys in grouped_ys]
    # TODO: should weight the following selection based on size of group
    #class_weights = [ut.dict_hist(ys) for ys in grouped_ys]

    unique_idxs = stratified_shuffle_split(unique_ys, fractions, rng)
    index_sets = [np.array(ut.flatten(ut.take(groupxs, idxs)))
                  for idxs in unique_idxs]
    if y_idx is not None:
        # These indices subindex into parent set of indices
        index_sets = [np.take(y_idx, idxs, axis=0) for idxs in index_sets]
    return index_sets
def _make_test_folds(self, X, y=None, groups=None):
    """
    Args:
        self (?):
        X (ndarray): data
        y (ndarray): labels (default = None)
        groups (None): (default = None)

    Returns:
        ?: test_folds

    CommandLine:
        python -m ibeis.algo.verif.sklearn_utils _make_test_folds

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.verif.sklearn_utils import *  # NOQA
        >>> import utool as ut
        >>> rng = ut.ensure_rng(0)
        >>> groups = [1, 1, 3, 4, 2, 2, 7, 8, 8]
        >>> y      = [1, 1, 1, 1, 2, 2, 2, 3, 3]
        >>> X = np.empty((len(y), 0))
        >>> self = StratifiedGroupKFold(random_state=rng)
        >>> skf_list = list(self.split(X=X, y=y, groups=groups))
    """
    # if self.shuffle:
    #     rng = check_random_state(self.random_state)
    # else:
    #     rng = self.random_state
    n_splits = self.n_splits
    y = np.asarray(y)
    n_samples = y.shape[0]

    import utool as ut
    # y_counts = bincount(y_inversed)
    # min_classes_ = np.min(y_counts)
    # if np.all(self.n_splits > y_counts):
    #     raise ValueError("All the n_groups for individual classes"
    #                      " are less than n_splits=%d."
    #                      % (self.n_splits))
    # if self.n_splits > min_classes_:
    #     warnings.warn(("The least populated class in y has only %d"
    #                    " members, which is too few. The minimum"
    #                    " number of groups for any class cannot"
    #                    " be less than n_splits=%d."
    #                    % (min_classes_, self.n_splits)), Warning)

    unique_y, y_inversed = np.unique(y, return_inverse=True)
    n_classes = max(unique_y) + 1
    unique_groups, group_idxs = ut.group_indices(groups)
    # grouped_ids = list(grouping.keys())
    grouped_y = ut.apply_grouping(y, group_idxs)
    grouped_y_counts = np.array([
        bincount(y_, minlength=n_classes) for y_ in grouped_y])

    target_freq = grouped_y_counts.sum(axis=0)
    target_ratio = target_freq / target_freq.sum()

    # Greedily choose the split assignment that minimizes the local
    # * squared differences in target from actual frequencies
    # * and best equalizes the number of items per fold
    # Distribute groups with most members first
    split_freq = np.zeros((n_splits, n_classes))
    # split_ratios = split_freq / split_freq.sum(axis=1)
    split_ratios = np.ones(split_freq.shape) / split_freq.shape[1]
    split_diffs = ((split_freq - target_ratio) ** 2).sum(axis=1)
    sortx = np.argsort(grouped_y_counts.sum(axis=1))[::-1]
    grouped_splitx = []
    for count, group_idx in enumerate(sortx):
        # print('---------\n')
        group_freq = grouped_y_counts[group_idx]
        cand_freq = split_freq + group_freq
        cand_ratio = cand_freq / cand_freq.sum(axis=1)[:, None]
        cand_diffs = ((cand_ratio - target_ratio) ** 2).sum(axis=1)
        # Compute loss
        losses = []
        # others = np.nan_to_num(split_diffs)
        other_diffs = np.array([
            sum(split_diffs[x + 1:]) + sum(split_diffs[:x])
            for x in range(n_splits)
        ])
        # penalize unbalanced splits
        ratio_loss = other_diffs + cand_diffs
        # penalize heavy splits
        freq_loss = split_freq.sum(axis=1)
        freq_loss = freq_loss / freq_loss.sum()
        losses = ratio_loss + freq_loss
        # print('group_freq = %r' % (group_freq,))
        # print('freq_loss = %s' % (ut.repr2(freq_loss, precision=2),))
        # print('ratio_loss = %s' % (ut.repr2(ratio_loss, precision=2),))
        #-------
        splitx = np.argmin(losses)
        # print('losses = %r, splitx=%r' % (losses, splitx))
        split_freq[splitx] = cand_freq[splitx]
        split_ratios[splitx] = cand_ratio[splitx]
        split_diffs[splitx] = cand_diffs[splitx]
        grouped_splitx.append(splitx)
        # if count > 4:
        #     break
        # else:
        #     print('split_freq = \n' +
        #           ut.repr2(split_freq, precision=2, suppress_small=True))
        #     print('target_ratio = \n' +
        #           ut.repr2(target_ratio, precision=2, suppress_small=True))
        #     print('split_ratios = \n' +
        #           ut.repr2(split_ratios, precision=2, suppress_small=True))
        #     print(ut.dict_hist(grouped_splitx))

    # final_ratio_loss = ((split_ratios - target_ratio) ** 2).sum(axis=1)
    # print('split_freq = \n' +
    #       ut.repr2(split_freq, precision=3, suppress_small=True))
    # print('target_ratio = \n' +
    #       ut.repr2(target_ratio, precision=3, suppress_small=True))
    # print('split_ratios = \n' +
    #       ut.repr2(split_ratios, precision=3, suppress_small=True))
    # print(ut.dict_hist(grouped_splitx))

    test_folds = np.empty(n_samples, dtype=np.int)
    for group_idx, splitx in zip(sortx, grouped_splitx):
        idxs = group_idxs[group_idx]
        test_folds[idxs] = splitx
    return test_folds
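
# A toy illustration of the greedy assignment step in _make_test_folds: each
# group of samples carries a per-class count vector, and the group goes to the
# fold whose class ratio would move closest to the global ratio while staying
# light. The counts below are made up, and this mirrors only the balancing
# idea, not the exact loss used by the class above.
import numpy as np

n_splits, n_classes = 2, 2
grouped_y_counts = np.array([[3, 0], [0, 3], [2, 1], [1, 2]])
target_ratio = grouped_y_counts.sum(axis=0) / grouped_y_counts.sum()

split_freq = np.zeros((n_splits, n_classes))
assignment = []
for group_freq in grouped_y_counts:
    cand_freq = split_freq + group_freq
    cand_ratio = cand_freq / cand_freq.sum(axis=1, keepdims=True)
    ratio_loss = ((cand_ratio - target_ratio) ** 2).sum(axis=1)
    total = split_freq.sum()
    freq_loss = split_freq.sum(axis=1) / total if total else np.zeros(n_splits)
    splitx = int(np.argmin(ratio_loss + freq_loss))
    split_freq[splitx] = cand_freq[splitx]
    assignment.append(splitx)
print(assignment)  # -> [0, 1, 1, 0]; each fold's class ratio ends near target_ratio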
def crftest():
    """
    pip install pyqpbo
    pip install pystruct

    http://taku910.github.io/crfpp/#install

    cd ~/tmp
    #wget https://drive.google.com/folderview?id=0B4y35FiV1wh7fngteFhHQUN2Y1B5eUJBNHZUemJYQV9VWlBUb3JlX0xBdWVZTWtSbVBneU0&usp=drive_web#list
    7z x CRF++-0.58.tar.gz
    7z x CRF++-0.58.tar
    cd CRF++-0.58
    chmod +x configure
    ./configure
    make
    """
    import pystruct
    import pystruct.models

    inference_method_options = ['lp', 'max-product']
    inference_method = inference_method_options[1]

    # graph = pystruct.models.GraphCRF(
    #     n_states=None,
    #     n_features=None,
    #     inference_method=inference_method,
    #     class_weight=None,
    #     directed=False,
    # )

    num_annots = 5
    num_names = num_annots

    aids = np.arange(5)
    rng = np.random.RandomState(0)
    hidden_nids = rng.randint(0, num_names, num_annots)
    unique_nids, groupxs = ut.group_indices(hidden_nids)

    # Indicator vector indicating the name
    node_features = np.zeros((num_annots, num_names))
    node_features[(aids, hidden_nids)] = 1

    toy_params = {True: {'mu': 1.0, 'sigma': 2.2},
                  False: {'mu': 7.0, 'sigma': 0.9}}

    if False:
        import vtool as vt
        import wbia.plottool as pt

        pt.ensureqt()
        xdata = np.linspace(0, 100, 1000)
        tp_pdf = vt.gauss_func1d(xdata, **toy_params[True])
        fp_pdf = vt.gauss_func1d(xdata, **toy_params[False])
        pt.plot_probabilities([tp_pdf, fp_pdf], ['TP', 'TF'], xdata=xdata)

    def metric(aidx1, aidx2, hidden_nids=hidden_nids, toy_params=toy_params):
        if aidx1 == aidx2:
            return 0
        rng = np.random.RandomState(int(aidx1 + aidx2))
        same = hidden_nids[int(aidx1)] == hidden_nids[int(aidx2)]
        mu, sigma = ut.dict_take(toy_params[same], ['mu', 'sigma'])
        return np.clip(rng.normal(mu, sigma), 0, np.inf)

    pairwise_aidxs = list(ut.iprod(range(num_annots), range(num_annots)))
    pairwise_labels = np.array(  # NOQA
        [hidden_nids[a1] == hidden_nids[a2] for a1, a2 in pairwise_aidxs]
    )
    pairwise_scores = np.array([metric(*zz) for zz in pairwise_aidxs])
    pairwise_scores_mat = pairwise_scores.reshape(num_annots, num_annots)  # NOQA

    graph = pystruct.models.EdgeFeatureGraphCRF(  # NOQA
        n_states=num_annots,
        n_features=num_names,
        n_edge_features=1,
        inference_method=inference_method,
    )

    import opengm

    numVar = 10
    unaries = np.ones([numVar, 3], dtype=opengm.value_type)
    gm = opengm.gm(np.ones(numVar, dtype=opengm.label_type) * 3)
    unary_fids = gm.addFunctions(unaries)
    gm.addFactors(unary_fids, np.arange(numVar))
    infParam = opengm.InfParam(workflow=ut.ensure_ascii('(IC)(TTC-I,CC-I)'))
    inf = opengm.inference.Multicut(gm, parameter=infParam)
    visitor = inf.verboseVisitor(printNth=1, multiline=False)
    inf.infer(visitor)
    arg = inf.arg()

    # gridVariableIndices = opengm.secondOrderGridVis(img.shape[0], img.shape[1])
    # fid = gm.addFunction(regularizer)
    # gm.addFactors(fid, gridVariableIndices)

    # regularizer = opengm.pottsFunction([3, 3], 0.0, beta)
    # gridVariableIndices = opengm.secondOrderGridVis(img.shape[0], img.shape[1])
    # fid = gm.addFunction(regularizer)
    # gm.addFactors(fid, gridVariableIndices)

    unaries = np.random.rand(10, 10, 2)
    potts = opengm.PottsFunction([2, 2], 0.0, 0.4)
    gm = opengm.grid2d2Order(unaries=unaries, regularizer=potts)

    inf = opengm.inference.GraphCut(gm)
    inf.infer()
    arg = inf.arg()  # NOQA
def group_indicies(self, labels):
    unique_labels, groupxs = ut.group_indices(labels)
    return unique_labels, groupxs
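
# The method above is a thin wrapper around ut.group_indices. A minimal
# numpy-only sketch of the contract it relies on (assumed from how the callers
# in this file use it): return the unique labels and, for each, the indices
# where that label occurs.
import numpy as np

def group_indices_sketch(labels):
    labels = np.asarray(labels)
    unique_labels, inverse = np.unique(labels, return_inverse=True)
    groupxs = [np.flatnonzero(inverse == i) for i in range(len(unique_labels))]
    return unique_labels, groupxs

unique_labels, groupxs = group_indices_sketch([3, 1, 3, 2, 1])
print(unique_labels.tolist())         # [1, 2, 3]
print([g.tolist() for g in groupxs])  # [[1, 4], [3], [0, 2]]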
def collapse_labels(model, evidence, reduced_variables, reduced_row_idxs,
                    reduced_values):
    import vtool as vt
    #assert np.all(reduced_joint.values.ravel() == reduced_joint.values.flatten())
    reduced_ttypes = [model.var2_cpd[var].ttype for var in reduced_variables]

    evidence_vars = list(evidence.keys())
    evidence_state_idxs = ut.dict_take(evidence, evidence_vars)
    evidence_ttypes = [model.var2_cpd[var].ttype for var in evidence_vars]

    ttype2_ev_indices = dict(zip(*ut.group_indices(evidence_ttypes)))
    ttype2_re_indices = dict(zip(*ut.group_indices(reduced_ttypes)))
    # ttype2_ev_indices = ut.group_items(range(len(evidence_vars)), evidence_ttypes)
    # ttype2_re_indices = ut.group_items(range(len(reduced_variables)), reduced_ttypes)

    # Allow specific types of labels to change
    # everything is the same, only the names have changed.
    # TODO: allow for multiple different label_ttypes
    # for label_ttype in label_ttypes
    if 'name' not in model.ttype2_template:
        return reduced_row_idxs, reduced_values
    label_ttypes = ['name']
    for label_ttype in label_ttypes:
        ev_colxs = ttype2_ev_indices[label_ttype]
        re_colxs = ttype2_re_indices[label_ttype]

        ev_state_idxs = ut.take(evidence_state_idxs, ev_colxs)
        ev_state_idxs_tile = np.tile(ev_state_idxs,
                                     (len(reduced_values), 1)).astype(np.int)
        num_ev_ = len(ev_colxs)

        aug_colxs = list(range(num_ev_)) + (np.array(re_colxs) + num_ev_).tolist()
        aug_state_idxs = np.hstack([ev_state_idxs_tile, reduced_row_idxs])

        # Relabel rows based on the knowledge that
        # everything is the same, only the names have changed.

        num_cols = len(aug_state_idxs.T)
        mask = vt.index_to_boolmask(aug_colxs, num_cols)
        other_colxs, = np.where(~mask)
        relbl_states = aug_state_idxs.compress(mask, axis=1)
        other_states = aug_state_idxs.compress(~mask, axis=1)
        tmp_relbl_states = np.array(list(map(make_temp_state, relbl_states)))

        max_tmp_state = -1
        min_tmp_state = tmp_relbl_states.min()

        # rebuild original state structure with temp state idxs
        tmp_state_cols = [None] * num_cols
        for count, colx in enumerate(aug_colxs):
            tmp_state_cols[colx] = tmp_relbl_states[:, count:count + 1]
        for count, colx in enumerate(other_colxs):
            tmp_state_cols[colx] = other_states[:, count:count + 1]
        tmp_state_idxs = np.hstack(tmp_state_cols)

        data_ids = np.array(
            vt.compute_unique_data_ids_(list(map(tuple, tmp_state_idxs))))
        unique_ids, groupxs = vt.group_indices(data_ids)
        print('Collapsed %r states into %r states' % (
            len(data_ids), len(unique_ids),))
        # Sum the values in the cpd to marginalize the duplicate probs
        new_values = np.array([
            g.sum() for g in vt.apply_grouping(reduced_values, groupxs)
        ])
        # Take only the unique rows under this induced labeling
        unique_tmp_groupxs = np.array(ut.get_list_column(groupxs, 0))
        new_aug_state_idxs = tmp_state_idxs.take(unique_tmp_groupxs, axis=0)

        tmp_idx_set = set((-np.arange(-max_tmp_state,
                                      (-min_tmp_state) + 1)).tolist())
        true_idx_set = set(range(len(model.ttype2_template[label_ttype].basis)))

        # Relabel the rows one more time to agree with initial constraints
        for colx, true_idx in enumerate(ev_state_idxs):
            tmp_idx = np.unique(new_aug_state_idxs.T[colx])
            assert len(tmp_idx) == 1
            tmp_idx_set -= {tmp_idx[0]}
            true_idx_set -= {true_idx}
            new_aug_state_idxs[new_aug_state_idxs == tmp_idx] = true_idx

        # Relabel the remaining idxs
        remain_tmp_idxs = sorted(list(tmp_idx_set))[::-1]
        remain_true_idxs = sorted(list(true_idx_set))
        for tmp_idx, true_idx in zip(remain_tmp_idxs, remain_true_idxs):
            new_aug_state_idxs[new_aug_state_idxs == tmp_idx] = true_idx

        # Remove evidence based augmented labels
        new_state_idxs = new_aug_state_idxs.T[num_ev_:].T
    return new_state_idxs, new_values
def stratified_kfold_label_split(y, labels, n_folds=2, y_idx=None, rng=None):
    """
    Also enforces that labels are not broken into separate groups.

    Args:
        y (ndarray): labels
        labels (?):
        y_idx (array): indexes associated with y if it was already presampled
        rng (RandomState): random number generator (default = None)

    Returns:
        ?: index_sets

    CommandLine:
        python -m ibeis_cnn.dataset stratified_kfold_label_split --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis_cnn.dataset import *  # NOQA
        >>> y      = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> labels = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, 7, 7, 7, 7]
        >>> rng = np.random.RandomState(0)
        >>> index_sets = stratified_kfold_label_split(y, labels, n_folds=2, rng=rng)
    """
    rng = ut.ensure_rng(rng)
    #orig_y = y
    unique_labels, groupxs = ut.group_indices(labels)
    grouped_ys = ut.apply_grouping(y, groupxs)
    # Assign each group a probabilistic class
    unique_ys = [ys[rng.randint(0, len(ys))] for ys in grouped_ys]
    # TODO: should weight the following selection based on size of group
    #class_weights = [ut.dict_hist(ys) for ys in grouped_ys]

    import sklearn.cross_validation
    xvalkw = dict(n_folds=n_folds, shuffle=True, random_state=rng)
    skf = sklearn.cross_validation.StratifiedKFold(unique_ys, **xvalkw)
    _iter = skf

    folded_index_sets = []
    for label_idx_set in _iter:
        index_sets = [np.array(ut.flatten(ut.take(groupxs, idxs)))
                      for idxs in label_idx_set]
        folded_index_sets.append(index_sets)

    for train_idx, test_idx in folded_index_sets:
        train_labels = set(ut.take(labels, train_idx))
        test_labels = set(ut.take(labels, test_idx))
        assert len(test_labels.intersection(train_labels)) == 0, (
            'same labels appeared in both train and test')

    if y_idx is not None:
        # These indices subindex into parent set of indices
        folded_index_sets2 = []
        for index_sets in folded_index_sets:
            index_sets = [np.take(y_idx, idxs, axis=0) for idxs in index_sets]
            folded_index_sets2.append(index_sets)
        folded_index_sets = folded_index_sets2

    #import sklearn.model_selection
    #skf = sklearn.model_selection.StratifiedKFold(**xvalkw)
    #_iter = skf.split(X=np.empty(len(target)), y=target)

    #unique_idxs = stratified_shuffle_split(unique_ys, fractions, rng)
    #index_sets = [np.array(ut.flatten(ut.take(groupxs, idxs))) for idxs in unique_idxs]
    #if idx is not None:
    #    # These indices subindex into parent set of indices
    #    index_sets = [np.take(idx, idxs, axis=0) for idxs in index_sets]
    return folded_index_sets
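
# The function above predates sklearn.model_selection; a rough modern analogue
# of "k-fold, but never split a label across folds" is GroupKFold, which keeps
# groups disjoint (though without the per-group stratification done above).
# A small self-contained check on the doctest data:
import numpy as np
from sklearn.model_selection import GroupKFold

y      = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
labels = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, 7, 7, 7, 7])
X = np.empty((len(y), 0))

for train_idx, test_idx in GroupKFold(n_splits=2).split(X, y, groups=labels):
    assert not set(labels[train_idx]) & set(labels[test_idx])
print('no label is split across train and test')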
def collapse_labels(model, evidence, reduced_variables, reduced_row_idxs,
                    reduced_values):
    import vtool as vt

    # assert np.all(reduced_joint.values.ravel() == reduced_joint.values.flatten())
    reduced_ttypes = [model.var2_cpd[var].ttype for var in reduced_variables]

    evidence_vars = list(evidence.keys())
    evidence_state_idxs = ut.dict_take(evidence, evidence_vars)
    evidence_ttypes = [model.var2_cpd[var].ttype for var in evidence_vars]

    ttype2_ev_indices = dict(zip(*ut.group_indices(evidence_ttypes)))
    ttype2_re_indices = dict(zip(*ut.group_indices(reduced_ttypes)))
    # ttype2_ev_indices = ut.group_items(range(len(evidence_vars)), evidence_ttypes)
    # ttype2_re_indices = ut.group_items(range(len(reduced_variables)), reduced_ttypes)

    # Allow specific types of labels to change
    # everything is the same, only the names have changed.
    # TODO: allow for multiple different label_ttypes
    # for label_ttype in label_ttypes
    if NAME_TTYPE not in model.ttype2_template:
        return reduced_row_idxs, reduced_values
    label_ttypes = [NAME_TTYPE]
    for label_ttype in label_ttypes:
        ev_colxs = ttype2_ev_indices[label_ttype]
        re_colxs = ttype2_re_indices[label_ttype]

        ev_state_idxs = ut.take(evidence_state_idxs, ev_colxs)
        ev_state_idxs_tile = np.tile(ev_state_idxs,
                                     (len(reduced_values), 1)).astype(np.int)
        num_ev_ = len(ev_colxs)

        aug_colxs = list(range(num_ev_)) + (np.array(re_colxs) + num_ev_).tolist()
        aug_state_idxs = np.hstack([ev_state_idxs_tile, reduced_row_idxs])

        # Relabel rows based on the knowledge that
        # everything is the same, only the names have changed.

        num_cols = len(aug_state_idxs.T)
        mask = vt.index_to_boolmask(aug_colxs, num_cols)
        (other_colxs,) = np.where(~mask)
        relbl_states = aug_state_idxs.compress(mask, axis=1)
        other_states = aug_state_idxs.compress(~mask, axis=1)
        tmp_relbl_states = np.array(list(map(make_temp_state, relbl_states)))

        max_tmp_state = -1
        min_tmp_state = tmp_relbl_states.min()

        # rebuild original state structure with temp state idxs
        tmp_state_cols = [None] * num_cols
        for count, colx in enumerate(aug_colxs):
            tmp_state_cols[colx] = tmp_relbl_states[:, count:count + 1]
        for count, colx in enumerate(other_colxs):
            tmp_state_cols[colx] = other_states[:, count:count + 1]
        tmp_state_idxs = np.hstack(tmp_state_cols)

        data_ids = np.array(
            vt.compute_unique_data_ids_(list(map(tuple, tmp_state_idxs))))
        unique_ids, groupxs = vt.group_indices(data_ids)
        logger.info('Collapsed %r states into %r states' % (
            len(data_ids), len(unique_ids),))
        # Sum the values in the cpd to marginalize the duplicate probs
        new_values = np.array(
            [g.sum() for g in vt.apply_grouping(reduced_values, groupxs)])
        # Take only the unique rows under this induced labeling
        unique_tmp_groupxs = np.array(ut.get_list_column(groupxs, 0))
        new_aug_state_idxs = tmp_state_idxs.take(unique_tmp_groupxs, axis=0)

        tmp_idx_set = set((-np.arange(-max_tmp_state,
                                      (-min_tmp_state) + 1)).tolist())
        true_idx_set = set(range(len(model.ttype2_template[label_ttype].basis)))

        # Relabel the rows one more time to agree with initial constraints
        for colx, true_idx in enumerate(ev_state_idxs):
            tmp_idx = np.unique(new_aug_state_idxs.T[colx])
            assert len(tmp_idx) == 1
            tmp_idx_set -= {tmp_idx[0]}
            true_idx_set -= {true_idx}
            new_aug_state_idxs[new_aug_state_idxs == tmp_idx] = true_idx

        # Relabel the remaining idxs
        remain_tmp_idxs = sorted(list(tmp_idx_set))[::-1]
        remain_true_idxs = sorted(list(true_idx_set))
        for tmp_idx, true_idx in zip(remain_tmp_idxs, remain_true_idxs):
            new_aug_state_idxs[new_aug_state_idxs == tmp_idx] = true_idx

        # Remove evidence based augmented labels
        new_state_idxs = new_aug_state_idxs.T[num_ev_:].T
    return new_state_idxs, new_values
def crftest():
    """
    pip install pyqpbo
    pip install pystruct

    http://taku910.github.io/crfpp/#install

    cd ~/tmp
    #wget https://drive.google.com/folderview?id=0B4y35FiV1wh7fngteFhHQUN2Y1B5eUJBNHZUemJYQV9VWlBUb3JlX0xBdWVZTWtSbVBneU0&usp=drive_web#list
    7z x CRF++-0.58.tar.gz
    7z x CRF++-0.58.tar
    cd CRF++-0.58
    chmod +x configure
    ./configure
    make
    """
    import pystruct
    import pystruct.models
    inference_method_options = ['lp', 'max-product']
    inference_method = inference_method_options[1]

    #graph = pystruct.models.GraphCRF(
    #    n_states=None,
    #    n_features=None,
    #    inference_method=inference_method,
    #    class_weight=None,
    #    directed=False,
    #)

    num_annots = 5
    num_names = num_annots

    aids = np.arange(5)
    rng = np.random.RandomState(0)
    hidden_nids = rng.randint(0, num_names, num_annots)
    unique_nids, groupxs = ut.group_indices(hidden_nids)

    # Indicator vector indicating the name
    node_features = np.zeros((num_annots, num_names))
    node_features[(aids, hidden_nids)] = 1

    toy_params = {
        True: {'mu': 1.0, 'sigma': 2.2},
        False: {'mu': 7.0, 'sigma': .9}
    }

    if False:
        import vtool as vt
        import plottool as pt
        pt.ensure_pylab_qt4()
        xdata = np.linspace(0, 100, 1000)
        tp_pdf = vt.gauss_func1d(xdata, **toy_params[True])
        fp_pdf = vt.gauss_func1d(xdata, **toy_params[False])
        pt.plot_probabilities([tp_pdf, fp_pdf], ['TP', 'TF'], xdata=xdata)

    def metric(aidx1, aidx2, hidden_nids=hidden_nids, toy_params=toy_params):
        if aidx1 == aidx2:
            return 0
        rng = np.random.RandomState(int(aidx1 + aidx2))
        same = hidden_nids[int(aidx1)] == hidden_nids[int(aidx2)]
        mu, sigma = ut.dict_take(toy_params[same], ['mu', 'sigma'])
        return np.clip(rng.normal(mu, sigma), 0, np.inf)

    pairwise_aidxs = list(ut.iprod(range(num_annots), range(num_annots)))
    pairwise_labels = np.array([hidden_nids[a1] == hidden_nids[a2]
                                for a1, a2 in pairwise_aidxs])
    pairwise_scores = np.array([metric(*zz) for zz in pairwise_aidxs])
    pairwise_scores_mat = pairwise_scores.reshape(num_annots, num_annots)

    graph = pystruct.models.EdgeFeatureGraphCRF(
        n_states=num_annots,
        n_features=num_names,
        n_edge_features=1,
        inference_method=inference_method,
    )

    import opengm
    numVar = 10
    unaries = np.ones([numVar, 3], dtype=opengm.value_type)
    gm = opengm.gm(np.ones(numVar, dtype=opengm.label_type) * 3)
    unary_fids = gm.addFunctions(unaries)
    gm.addFactors(unary_fids, np.arange(numVar))
    infParam = opengm.InfParam(
        workflow=ut.ensure_ascii('(IC)(TTC-I,CC-I)'),
    )
    inf = opengm.inference.Multicut(gm, parameter=infParam)
    visitor = inf.verboseVisitor(printNth=1, multiline=False)
    inf.infer(visitor)
    arg = inf.arg()

    # gridVariableIndices = opengm.secondOrderGridVis(img.shape[0], img.shape[1])
    # fid = gm.addFunction(regularizer)
    # gm.addFactors(fid, gridVariableIndices)

    # regularizer = opengm.pottsFunction([3, 3], 0.0, beta)
    # gridVariableIndices = opengm.secondOrderGridVis(img.shape[0], img.shape[1])
    # fid = gm.addFunction(regularizer)
    # gm.addFactors(fid, gridVariableIndices)

    unaries = np.random.rand(10, 10, 2)
    potts = opengm.PottsFunction([2, 2], 0.0, 0.4)
    gm = opengm.grid2d2Order(unaries=unaries, regularizer=potts)

    inf = opengm.inference.GraphCut(gm)
    inf.infer()
    arg = inf.arg()
def merge_level_order(level_orders, topsort):
    """
    Merge orders of individual subtrees into a total ordering for computation.

    >>> level_orders = {
    >>>     'multi_chip_multitest': [['dummy_annot'], ['chip'], ['multitest'],
    >>>         ['multitest_score'], ],
    >>>     'multi_fgweight_multitest': [ ['dummy_annot'], ['chip', 'probchip'],
    >>>         ['keypoint'], ['fgweight'], ['multitest'], ['multitest_score'], ],
    >>>     'multi_keypoint_nnindexer': [ ['dummy_annot'], ['chip'], ['keypoint'],
    >>>         ['nnindexer'], ['multitest'], ['multitest_score'], ],
    >>>     'normal': [ ['dummy_annot'], ['chip', 'probchip'], ['keypoint'],
    >>>         ['fgweight'], ['spam'], ['multitest'], ['multitest_score'], ],
    >>>     'nwise_notch_multitest_1': [ ['dummy_annot'], ['notch'], ['multitest'],
    >>>         ['multitest_score'], ],
    >>>     'nwise_notch_multitest_2': [ ['dummy_annot'], ['notch'], ['multitest'],
    >>>         ['multitest_score'], ],
    >>>     'nwise_notch_notchpair_1': [ ['dummy_annot'], ['notch'], ['notchpair'],
    >>>         ['multitest'], ['multitest_score'], ],
    >>>     'nwise_notch_notchpair_2': [ ['dummy_annot'], ['notch'], ['notchpair'],
    >>>         ['multitest'], ['multitest_score'], ],
    >>> }
    >>> topsort = [u'dummy_annot', u'notch', u'probchip', u'chip', u'keypoint',
    >>>            u'fgweight', u'nnindexer', u'spam', u'notchpair', u'multitest',
    >>>            u'multitest_score']
    >>> print(ut.repr3(ut.merge_level_order(level_orders, topsort)))

    EG2:
        level_orders = {u'normal': [[u'dummy_annot'], [u'chip', u'probchip'],
            [u'keypoint'], [u'fgweight'], [u'spam']]}
        topsort = [u'dummy_annot', u'probchip', u'chip', u'keypoint',
                   u'fgweight', u'spam']
    """
    import utool as ut
    if False:
        compute_order = []
        level_orders = ut.map_dict_vals(ut.total_flatten, level_orders)
        level_sets = ut.map_dict_vals(set, level_orders)
        for tablekey in topsort:
            compute_order.append((tablekey,
                                  [groupkey for groupkey, set_ in level_sets.items()
                                   if tablekey in set_]))
        return compute_order
    else:
        # Do on common subgraph
        import itertools
        # Pointer to current level. Start at the end and then work your way up.
        main_ptr = len(topsort) - 1
        stack = []
        #from six.moves import zip_longest
        keys = list(level_orders.keys())
        type_to_ptr = {key: -1 for key in keys}
        print('level_orders = %s' % (ut.repr3(level_orders),))
        for count in itertools.count(0):
            print('----')
            print('count = %r' % (count,))
            ptred_levels = []
            for key in keys:
                levels = level_orders[key]
                ptr = type_to_ptr[key]
                try:
                    level = tuple(levels[ptr])
                except IndexError:
                    level = None
                ptred_levels.append(level)
            print('ptred_levels = %r' % (ptred_levels,))
            print('main_ptr = %r' % (main_ptr,))
            # groupkeys, groupxs = ut.group_indices(ptred_levels)
            # Group keys are tablenames
            # They point to the (type) of the input
            # num_levelkeys = len(ut.total_flatten(ptred_levels))
            groupkeys, groupxs = ut.group_indices(ptred_levels)
            main_idx = None
            while main_idx is None and main_ptr >= 0:
                target = topsort[main_ptr]
                print('main_ptr = %r' % (main_ptr,))
                print('target = %r' % (target,))
                # main_idx = ut.listfind(groupkeys, (target,))
                # if main_idx is None:
                possible_idxs = [idx for idx, keytup in enumerate(groupkeys)
                                 if keytup is not None and target in keytup]
                if len(possible_idxs) == 1:
                    main_idx = possible_idxs[0]
                else:
                    main_idx = None
                if main_idx is None:
                    main_ptr -= 1
            if main_idx is None:
                print('break I')
                break
            found_groups = ut.apply_grouping(keys, groupxs)[main_idx]
            print('found_groups = %r' % (found_groups,))
            stack.append((target, found_groups))
            for k in found_groups:
                type_to_ptr[k] -= 1

            if len(found_groups) == len(keys):
                main_ptr -= 1
                if main_ptr < 0:
                    print('break E')
                    break
        print('stack = %s' % (ut.repr3(stack),))
        print('have = %r' % (sorted(ut.take_column(stack, 0)),))
        print('need = %s' % (sorted(ut.total_flatten(level_orders.values())),))
        compute_order = stack[::-1]
        return compute_order
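
# A compact sketch of the simpler merge strategy outlined in the `if False:`
# branch of merge_level_order: flatten each subtree's level order into a set
# and, walking the topological sort, record which subtrees need each table.
# Data taken from the EG2 example in the docstring above.
level_orders = {'normal': [['dummy_annot'], ['chip', 'probchip'], ['keypoint'],
                           ['fgweight'], ['spam']]}
topsort = ['dummy_annot', 'probchip', 'chip', 'keypoint', 'fgweight', 'spam']

level_sets = {key: {t for level in levels for t in level}
              for key, levels in level_orders.items()}
compute_order = [(table, [key for key, set_ in level_sets.items() if table in set_])
                 for table in topsort]
print(compute_order)
# [('dummy_annot', ['normal']), ('probchip', ['normal']), ('chip', ['normal']), ...]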
def get_injured_sharks():
    """
    >>> from wbia.scripts.getshark import *  # NOQA
    """
    import requests

    url = 'http://www.whaleshark.org/getKeywordImages.jsp'
    resp = requests.get(url)
    assert resp.status_code == 200
    keywords = resp.json()['keywords']
    key_list = ut.take_column(keywords, 'indexName')
    key_to_nice = {k['indexName']: k['readableName'] for k in keywords}

    injury_patterns = [
        'injury', 'net', 'hook', 'trunc', 'damage', 'scar', 'nicks', 'bite',
    ]

    injury_keys = [
        key for key in key_list if any([pat in key for pat in injury_patterns])
    ]
    noninjury_keys = ut.setdiff(key_list, injury_keys)
    injury_nice = ut.lmap(lambda k: key_to_nice[k], injury_keys)  # NOQA
    noninjury_nice = ut.lmap(lambda k: key_to_nice[k], noninjury_keys)  # NOQA
    key_list = injury_keys

    keyed_images = {}
    for key in ut.ProgIter(key_list, lbl='reading index', bs=True):
        key_url = url + '?indexName={indexName}'.format(indexName=key)
        key_resp = requests.get(key_url)
        assert key_resp.status_code == 200
        key_imgs = key_resp.json()['images']
        keyed_images[key] = key_imgs

    key_hist = {key: len(imgs) for key, imgs in keyed_images.items()}
    key_hist = ut.sort_dict(key_hist, 'vals')
    logger.info(ut.repr3(key_hist))
    nice_key_hist = ut.map_dict_keys(lambda k: key_to_nice[k], key_hist)
    nice_key_hist = ut.sort_dict(nice_key_hist, 'vals')
    logger.info(ut.repr3(nice_key_hist))

    key_to_urls = {
        key: ut.take_column(vals, 'url') for key, vals in keyed_images.items()
    }
    overlaps = {}
    import itertools

    overlap_img_list = []
    for k1, k2 in itertools.combinations(key_to_urls.keys(), 2):
        overlap_imgs = ut.isect(key_to_urls[k1], key_to_urls[k2])
        num_overlap = len(overlap_imgs)
        overlaps[(k1, k2)] = num_overlap
        overlaps[(k1, k1)] = len(key_to_urls[k1])
        if num_overlap > 0:
            # logger.info('[%s][%s], overlap=%r' % (k1, k2, num_overlap))
            overlap_img_list.extend(overlap_imgs)

    all_img_urls = list(set(ut.flatten(key_to_urls.values())))
    num_all = len(all_img_urls)  # NOQA
    logger.info('num_all = %r' % (num_all,))

    # Determine super-categories
    categories = ['nicks', 'scar', 'trunc']

    # Force these keys into these categories
    key_to_cat = {'scarbite': 'other_injury'}

    cat_to_keys = ut.ddict(list)

    for key in key_to_urls.keys():
        flag = 1
        if key in key_to_cat:
            cat = key_to_cat[key]
            cat_to_keys[cat].append(key)
            continue
        for cat in categories:
            if cat in key:
                cat_to_keys[cat].append(key)
                flag = 0
        if flag:
            cat = 'other_injury'
            cat_to_keys[cat].append(key)

    cat_urls = ut.ddict(list)
    for cat, keys in cat_to_keys.items():
        for key in keys:
            cat_urls[cat].extend(key_to_urls[key])

    cat_hist = {}
    for cat in list(cat_urls.keys()):
        cat_urls[cat] = list(set(cat_urls[cat]))
        cat_hist[cat] = len(cat_urls[cat])

    logger.info(ut.repr3(cat_to_keys))
    logger.info(ut.repr3(cat_hist))

    key_to_cat = dict([(val, key) for key, vals in cat_to_keys.items()
                       for val in vals])

    # ingestset = {
    #     '__class__': 'ImageSet',
    #     'images': ut.ddict(dict)
    # }
    # for key, key_imgs in keyed_images.items():
    #     for imgdict in key_imgs:
    #         url = imgdict['url']
    #         encid = imgdict['correspondingEncounterNumber']
    #         # Make structure
    #         encdict = encounters[encid]
    #         encdict['__class__'] = 'Encounter'
    #         imgdict = ut.delete_keys(imgdict.copy(), ['correspondingEncounterNumber'])
    #         imgdict['__class__'] = 'Image'
    #         cat = key_to_cat[key]
    #         annotdict = {'relative_bbox': [.01, .01, .98, .98], 'tags': [cat, key]}
    #         annotdict['__class__'] = 'Annotation'
    #         # Ensure structures exist
    #         encdict['images'] = encdict.get('images', [])
    #         imgdict['annots'] = imgdict.get('annots', [])
    #         # Add an image to this encounter
    #         encdict['images'].append(imgdict)
    #         # Add an annotation to this image
    #         imgdict['annots'].append(annotdict)

    # # http://springbreak.wildbook.org/rest/org.ecocean.Encounter/1111
    # get_enc_url = 'http://www.whaleshark.org/rest/org.ecocean.Encounter/%s' % (encid,)
    # resp = requests.get(get_enc_url)
    # logger.info(ut.repr3(encdict))
    # logger.info(ut.repr3(encounters))

    # Download the files to the local disk
    # fpath_list =
    all_urls = ut.unique(
        ut.take_column(
            ut.flatten(
                ut.dict_subset(keyed_images,
                               ut.flatten(cat_to_keys.values())).values()),
            'url',
        ))

    dldir = ut.truepath('~/tmpsharks')
    from os.path import commonprefix, basename  # NOQA

    prefix = commonprefix(all_urls)
    suffix_list = [url_[len(prefix):] for url_ in all_urls]
    fname_list = [suffix.replace('/', '--') for suffix in suffix_list]

    fpath_list = []
    for url, fname in ut.ProgIter(zip(all_urls, fname_list),
                                  lbl='downloading imgs', freq=1):
        fpath = ut.grab_file_url(url, download_dir=dldir, fname=fname,
                                 verbose=False)
        fpath_list.append(fpath)

    # Make sure we keep orig info
    # url_to_keys = ut.ddict(list)
    url_to_info = ut.ddict(dict)
    for key, imgdict_list in keyed_images.items():
        for imgdict in imgdict_list:
            url = imgdict['url']
            info = url_to_info[url]
            for k, v in imgdict.items():
                info[k] = info.get(k, [])
                info[k].append(v)
            info['keys'] = info.get('keys', [])
            info['keys'].append(key)
            # url_to_keys[url].append(key)

    info_list = ut.take(url_to_info, all_urls)
    for info in info_list:
        if len(set(info['correspondingEncounterNumber'])) > 1:
            assert False, 'url with two different encounter nums'

    # Combine duplicate tags
    hashid_list = [
        ut.get_file_uuid(fpath_, stride=8)
        for fpath_ in ut.ProgIter(fpath_list, bs=True)
    ]
    groupxs = ut.group_indices(hashid_list)[1]

    # Group properties by duplicate images
    # groupxs = [g for g in groupxs if len(g) > 1]
    fpath_list_ = ut.take_column(ut.apply_grouping(fpath_list, groupxs), 0)
    url_list_ = ut.take_column(ut.apply_grouping(all_urls, groupxs), 0)
    info_list_ = [
        ut.map_dict_vals(ut.flatten, ut.dict_accum(*info_))
        for info_ in ut.apply_grouping(info_list, groupxs)
    ]

    encid_list_ = [
        ut.unique(info_['correspondingEncounterNumber'])[0]
        for info_ in info_list_
    ]
    keys_list_ = [ut.unique(info_['keys']) for info_ in info_list_]
    cats_list_ = [ut.unique(ut.take(key_to_cat, keys)) for keys in keys_list_]

    clist = ut.ColumnLists({
        'gpath': fpath_list_,
        'url': url_list_,
        'encid': encid_list_,
        'key': keys_list_,
        'cat': cats_list_,
    })

    # for info_ in ut.apply_grouping(info_list, groupxs):
    #     info = ut.dict_accum(*info_)
    #     info = ut.map_dict_vals(ut.flatten, info)
    #     x = ut.unique(ut.flatten(ut.dict_accum(*info_)['correspondingEncounterNumber']))
    #     if len(x) > 1:
    #         info = info.copy()
    #         del info['keys']
    #         logger.info(ut.repr3(info))

    flags = ut.lmap(ut.fpath_has_imgext, clist['gpath'])
    clist = clist.compress(flags)

    import wbia

    ibs = wbia.opendb('WS_Injury', allow_newdir=True)
    gid_list = ibs.add_images(clist['gpath'])
    clist['gid'] = gid_list
    failed_flags = ut.flag_None_items(clist['gid'])
    logger.info('# failed %s' % (sum(failed_flags),))
    passed_flags = ut.not_list(failed_flags)
    clist = clist.compress(passed_flags)
    ut.assert_all_not_None(clist['gid'])
    # ibs.get_image_uris_original(clist['gid'])
    ibs.set_image_uris_original(clist['gid'], clist['url'], overwrite=True)

    # ut.zipflat(clist['cat'], clist['key'])
    if False:
        # Can run detection instead
        clist['tags'] = ut.zipflat(clist['cat'])
        aid_list = ibs.use_images_as_annotations(clist['gid'],
                                                 adjust_percent=0.01,
                                                 tags_list=clist['tags'])
        aid_list

    import wbia.plottool as pt
    from wbia import core_annots

    pt.qt4ensure()
    # annots = ibs.annots()
    # aids = [1, 2]
    # ibs.depc_annot.get('hog', aids , 'hog')
    # ibs.depc_annot.get('chip', aids, 'img')
    for aid in ut.InteractiveIter(ibs.get_valid_aids()):
        hogs = ibs.depc_annot.d.get_hog_hog([aid])
        chips = ibs.depc_annot.d.get_chips_img([aid])
        chip = chips[0]
        hogimg = core_annots.make_hog_block_image(hogs[0])
        pt.clf()
        pt.imshow(hogimg, pnum=(1, 2, 1))
        pt.imshow(chip, pnum=(1, 2, 2))
        fig = pt.gcf()
        fig.show()
        fig.canvas.draw()

    # logger.info(len(groupxs))

    # if False:
    #     groupxs = ut.find_duplicate_items(ut.lmap(basename, suffix_list)).values()
    #     logger.info(ut.repr3(ut.apply_grouping(all_urls, groupxs)))
    #     # FIX
    #     for fpath, fname in zip(fpath_list, fname_list):
    #         if ut.checkpath(fpath):
    #             ut.move(fpath, join(dirname(fpath), fname))
    #             logger.info('fpath = %r' % (fpath,))

    # import wbia
    # from wbia.dbio import ingest_dataset
    # dbdir = wbia.sysres.lookup_dbdir('WS_ALL')
    # self = ingest_dataset.Ingestable2(dbdir)

    if False:
        # Show overlap matrix
        import wbia.plottool as pt
        import pandas as pd
        import numpy as np

        dict_ = overlaps
        s = pd.Series(dict_, index=pd.MultiIndex.from_tuples(overlaps))
        df = s.unstack()
        lhs, rhs = df.align(df.T)
        df = lhs.add(rhs, fill_value=0).fillna(0)

        label_texts = df.columns.values

        def label_ticks(label_texts):
            import wbia.plottool as pt

            truncated_labels = [repr(lbl[0:100]) for lbl in label_texts]
            ax = pt.gca()
            ax.set_xticks(list(range(len(label_texts))))
            ax.set_xticklabels(truncated_labels)
            [lbl.set_rotation(-55) for lbl in ax.get_xticklabels()]
            [lbl.set_horizontalalignment('left')
             for lbl in ax.get_xticklabels()]

            # xgrid, ygrid = np.meshgrid(range(len(label_texts)), range(len(label_texts)))
            # pt.plot_surface3d(xgrid, ygrid, disjoint_mat)
            ax.set_yticks(list(range(len(label_texts))))
            ax.set_yticklabels(truncated_labels)
            [lbl.set_horizontalalignment('right')
             for lbl in ax.get_yticklabels()]
            [lbl.set_verticalalignment('center')
             for lbl in ax.get_yticklabels()]
            # [lbl.set_rotation(20) for lbl in ax.get_yticklabels()]

        # df = df.sort(axis=0)
        # df = df.sort(axis=1)

        sortx = np.argsort(df.sum(axis=1).values)[::-1]
        df = df.take(sortx, axis=0)
        df = df.take(sortx, axis=1)

        fig = pt.figure(fnum=1)
        fig.clf()
        mat = df.values.astype(np.int32)
        mat[np.diag_indices(len(mat))] = 0
        vmax = mat[(1 - np.eye(len(mat))).astype(np.bool)].max()
        import matplotlib.colors

        norm = matplotlib.colors.Normalize(vmin=0, vmax=vmax, clip=True)
        pt.plt.imshow(mat, cmap='hot', norm=norm, interpolation='none')
        pt.plt.colorbar()
        pt.plt.grid('off')
        label_ticks(label_texts)
        fig.tight_layout()

    # overlap_df = pd.DataFrame.from_dict(overlap_img_list)

    class TmpImage(ut.NiceRepr):
        pass

    from skimage.feature import hog
    from skimage import data, color, exposure
    import wbia.plottool as pt

    image2 = color.rgb2gray(data.astronaut())  # NOQA

    fpath = './GOPR1120.JPG'

    import vtool as vt

    for fpath in [fpath]:
        """
        http://scikit-image.org/docs/dev/auto_examples/plot_hog.html
        """
        image = vt.imread(fpath, grayscale=True)
        image = pt.color_funcs.to_base01(image)

        fig = pt.figure(fnum=2)
        fd, hog_image = hog(
            image,
            orientations=8,
            pixels_per_cell=(16, 16),
            cells_per_block=(1, 1),
            visualise=True,
        )

        fig, (ax1, ax2) = pt.plt.subplots(1, 2, figsize=(8, 4),
                                          sharex=True, sharey=True)

        ax1.axis('off')
        ax1.imshow(image, cmap=pt.plt.cm.gray)
        ax1.set_title('Input image')
        ax1.set_adjustable('box-forced')

        # Rescale histogram for better display
        hog_image_rescaled = exposure.rescale_intensity(hog_image,
                                                        in_range=(0, 0.02))

        ax2.axis('off')
        ax2.imshow(hog_image_rescaled, cmap=pt.plt.cm.gray)
        ax2.set_title('Histogram of Oriented Gradients')
        ax1.set_adjustable('box-forced')
        pt.plt.show()
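
# get_injured_sharks deduplicates downloaded images by hashing file contents
# and grouping identical hashes (ut.get_file_uuid above). A minimal stand-alone
# sketch of that step using only hashlib; the paths in the usage note are
# hypothetical.
import hashlib
from collections import defaultdict

def file_sha1(fpath, blocksize=1 << 20):
    # hash a file's bytes in blocks so large images do not load into memory at once
    h = hashlib.sha1()
    with open(fpath, 'rb') as f:
        for block in iter(lambda: f.read(blocksize), b''):
            h.update(block)
    return h.hexdigest()

def dedup_by_content(fpath_list):
    hash_to_paths = defaultdict(list)
    for fpath in fpath_list:
        hash_to_paths[file_sha1(fpath)].append(fpath)
    # keep one representative per content hash, like ut.take_column(groups, 0)
    return [paths[0] for paths in hash_to_paths.values()]

# usage (hypothetical files):
# unique_fpaths = dedup_by_content(['img_a.jpg', 'img_a_copy.jpg', 'img_b.jpg'])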