def compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list):
    """
    Driver function for agg residual computation

    Sums and normalizes all rvecs that belong to the same word and the same
    annotation id

    Args:
        rvecs_list (list): residual vectors grouped by word
        idxs_list (list): stacked descriptor indexes grouped by word
        aids_list (list): annotation rowid for each stacked descriptor index
        maws_list (list): multi assign weights

    Returns:
        tuple : (aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list)

    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals --test-compute_agg_rvecs

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> from ibeis.algo.hots.smk import smk_residuals
        >>> words, wx_sublist, aids_list, idxs_list, idx2_vec, maws_list = smk_debug.testdata_nonagg_rvec()
        >>> rvecs_list, flags_list = smk_residuals.compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list)
        >>> tup = compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list)
        >>> aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list = tup
        >>> ut.assert_eq(len(wx_sublist), len(rvecs_list))
    """
    #assert len(idxs_list) == len(rvecs_list)
    # Group members of each word by aid; we will collapse these groups
    grouptup_list = [clustertool.group_indices(aids) for aids in aids_list]
    # Agg aids
    aggaids_list = [tup[0] for tup in grouptup_list]
    groupxs_list = [tup[1] for tup in grouptup_list]
    # Aggregate vecs that belong to the same aid, for each word
    # (weighted aggregation with multi-assign-weights)
    aggvecs_list = [
        np.vstack([aggregate_rvecs(rvecs.take(xs, axis=0), maws.take(xs))
                   for xs in groupxs])
        if len(groupxs) > 0 else
        np.empty((0, hstypes.VEC_DIM), dtype=hstypes.FLOAT_TYPE)
        for rvecs, maws, groupxs in zip(rvecs_list, maws_list, groupxs_list)
    ]
    # Agg idxs
    aggidxs_list = [[idxs.take(xs) for xs in groupxs]
                    for idxs, groupxs in zip(idxs_list, groupxs_list)]
    aggmaws_list = [np.array([maws.take(xs).prod() for xs in groupxs])
                    for maws, groupxs in zip(maws_list, groupxs_list)]
    # Need to recompute flags for consistency
    # flag is true when aggvec is all zeros
    aggflags_list = [~np.any(aggvecs, axis=1) for aggvecs in aggvecs_list]
    return aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list
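

# Illustrative sketch (not part of the pipeline): the per-word, per-annotation
# aggregation that compute_agg_rvecs performs, written with plain numpy on toy
# data. It assumes aggregate_rvecs L2-normalizes the maw-weighted sum of
# residuals; the real helper may also quantize, so treat this as conceptual only.
def _toy_agg_rvecs_example():
    import numpy as np
    # three residuals assigned to one word: two from aid 1, one from aid 2
    rvecs = np.array([[1.0, 0.0], [0.0, 1.0], [2.0, 2.0]])
    aids = np.array([1, 1, 2])
    maws = np.array([0.5, 0.5, 1.0])
    aggvecs = []
    for aid in np.unique(aids):
        xs = np.flatnonzero(aids == aid)
        summed = (rvecs[xs] * maws[xs, None]).sum(axis=0)
        norm = np.linalg.norm(summed)
        aggvecs.append(summed / norm if norm > 0 else summed)
    # one aggregated residual per (word, aid) pair
    return np.vstack(aggvecs)
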
def group_correspondences(all_matches, all_scores, all_daids, daid2_sccw):
    daid_keys, groupxs = clustertool.group_indices(all_daids)
    fs_list = clustertool.apply_grouping(all_scores, groupxs)
    fm_list = clustertool.apply_grouping(all_matches, groupxs)
    daid2_fm = {daid: fm for daid, fm in zip(daid_keys, fm_list)}
    daid2_fs = {daid: fs * daid2_sccw[daid]
                for daid, fs in zip(daid_keys, fs_list)}
    # FIXME: generalize to when nAssign > 1
    daid2_fk = {daid: np.ones(fs.size, dtype=hstypes.FK_DTYPE)
                for daid, fs in zip(daid_keys, fs_list)}
    daid2_chipmatch = (daid2_fm, daid2_fs, daid2_fk)
    return daid2_chipmatch
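

# Minimal usage sketch for the grouping pattern in group_correspondences, using
# plain numpy in place of clustertool so it can run standalone. The daids,
# matches, scores, and sccw values below are invented for illustration.
def _toy_group_correspondences_example():
    import numpy as np
    all_daids = np.array([7, 7, 9])
    all_matches = np.array([[0, 3], [1, 5], [2, 0]])   # (qfx, dfx) rows
    all_scores = np.array([0.2, 0.4, 0.6])
    daid2_sccw = {7: 0.5, 9: 2.0}
    daid2_fm, daid2_fs = {}, {}
    for daid in np.unique(all_daids):
        xs = np.flatnonzero(all_daids == daid)
        daid2_fm[daid] = all_matches[xs]
        # per-database-annotation scores are scaled by that annotation's sccw
        daid2_fs[daid] = all_scores[xs] * daid2_sccw[daid]
    return daid2_fm, daid2_fs
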
def compute_idf_label1(aids_list, daid2_label):
    """
    One of our idf extensions

    Example:
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _
        >>> wx2_idf = compute_idf_label1(aids_list, daid2_label)
    """
    nWords = len(aids_list)
    # Computes our novel label idf weight
    lblindex_list = np.array(ut.tuples_to_unique_scalars(list(daid2_label.values())))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]
    daid2_wxs = ut.ddict(list)
    for wx, daids in enumerate(aids_list):
        for daid in daids:
            daid2_wxs[daid].append(wx)
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    nLabels = len(unique_lblindexes)
    pcntLblsWithWord = np.zeros(nWords, np.float64)
    # Get num times word appears for each label
    for lblindex, daids in lblindex2_daids:
        nWordsWithLabel = np.zeros(nWords)
        for daid in daids:
            wxs = daid2_wxs[daid]
            nWordsWithLabel[wxs] += 1
        pcntLblsWithWord += (1 - nWordsWithLabel.astype(np.float64) / len(daids))
    # Labels for each word
    idf_list = np.log(np.divide(nLabels, np.add(pcntLblsWithWord, 1),
                                dtype=hstypes.FLOAT_TYPE),
                      dtype=hstypes.FLOAT_TYPE)
    return idf_list
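

# Toy numeric sketch of the label-idf weight computed above: for each label we
# accumulate (1 - fractionOfLabelAnnotsWithWord) and the weight is
# log(nLabels / (accum + 1)). Labels and word membership are invented, and each
# annotation is counted at most once per word here for simplicity.
def _toy_label_idf_example():
    import numpy as np
    word_to_aids = {0: [1, 2, 3], 1: [3]}   # word -> annotations containing it
    lbl_to_aids = {0: [1, 2], 1: [3]}       # label -> annotations with that label
    nLabels = len(lbl_to_aids)
    idf = {}
    for wx, aids in word_to_aids.items():
        accum = 0.0
        for lbl_aids in lbl_to_aids.values():
            frac = np.mean([aid in aids for aid in lbl_aids])
            accum += (1.0 - frac)
        idf[wx] = np.log(nLabels / (accum + 1.0))
    # word 0 occurs under every label -> weight log(2); word 1 is rarer -> weight 0
    return idf
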
def build_daid2_chipmatch2(invindex, common_wxs, wx2_qaids, wx2_qfxs,
                           scores_list, daids_list, query_sccw):
    """
    Builds explicit chipmatches that the rest of the pipeline plays nice with

    Notation:
        An explicit cmtup_old is a tuple (fm, fs, fk) feature_matches,
        feature_scores, and feature_ranks.

        Let N be the number of matches

        A feature match, fm{shape=(N, 2), dtype=int32}, is an array where the
            first column corresponds to query_feature_indexes (qfx) and the
            second column corresponds to database_feature_indexes (dfx).

        A feature score, fs{shape=(N,), dtype=float64} is an array of scores

        A feature rank, fk{shape=(N,), dtype=int16} is an array of ranks

    Returns:
        daid2_chipmatch (dict) : (daid2_fm, daid2_fs, daid2_fk)

    Return Format::
        daid2_fm (dict): {daid: fm, ...}
        daid2_fs (dict): {daid: fs, ...}
        daid2_fk (dict): {daid: fk, ...}

    Example:
        >>> from ibeis.algo.hots.smk.smk_core import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, invindex, qindex, qparams = smk_debug.testdata_match_kernel_L2()
        >>> wx2_qrvecs, wx2_qmaws, wx2_qaids, wx2_qfxs, query_sccw = qindex
        >>> smk_alpha = ibs.cfg.query_cfg.smk_cfg.smk_alpha
        >>> smk_thresh = ibs.cfg.query_cfg.smk_cfg.smk_thresh
        >>> withinfo = True  # takes an 11s vs 2s
        >>> args = (wx2_qrvecs, wx2_qmaws, wx2_qaids, wx2_qfxs, query_sccw, invindex, withinfo, smk_alpha, smk_thresh)
        >>> retL1 = match_kernel_L1(*args)
        >>> (daid2_totalscore, common_wxs, scores_list, daids_list, idf_list, daid_agg_keys,) = retL1
        >>> daid2_chipmatch_old = build_daid2_chipmatch2(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
        >>> daid2_chipmatch_new = build_daid2_chipmatch3(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
        >>> print(utool.is_dicteq(daid2_chipmatch_old[0], daid2_chipmatch_new[0]))
        >>> print(utool.is_dicteq(daid2_chipmatch_old[2], daid2_chipmatch_new[2]))
        >>> print(utool.is_dicteq(daid2_chipmatch_old[1], daid2_chipmatch_new[1]))

    %timeit build_daid2_chipmatch2(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
    %timeit build_daid2_chipmatch3(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
    """
    # FIXME: move groupby to vtool
    if utool.VERBOSE:
        print('[smk_core] build cmtup_old')
    wx2_dfxs = invindex.wx2_fxs
    daid2_sccw = invindex.daid2_sccw
    qfxs_list = [wx2_qfxs[wx] for wx in common_wxs]
    dfxs_list = [wx2_dfxs[wx] for wx in common_wxs]
    shapes_list = [scores.shape for scores in scores_list]  # 51us
    shape_ranges = [(mem_arange(w), mem_arange(h)) for (w, h) in shapes_list]  # 230us
    ijs_list = [mem_meshgrid(wrange, hrange) for (wrange, hrange) in shape_ranges]  # 278us
    # Normalize scores for words, nMatches, and query sccw (still need daid sccw)
    nscores_iter = (scores * query_sccw for scores in scores_list)

    # FIXME: Preflatten all of these lists
    out_ijs = [list(zip(_is.flat, _js.flat)) for (_is, _js) in ijs_list]
    out_qfxs = [[qfxs[ix] for (ix, jx) in ijs]
                for (qfxs, ijs) in zip(qfxs_list, out_ijs)]
    out_dfxs = [[dfxs[jx] for (ix, jx) in ijs]
                for (dfxs, ijs) in zip(dfxs_list, out_ijs)]
    out_daids = ([daids[jx] for (ix, jx) in ijs]
                 for (daids, ijs) in zip(daids_list, out_ijs))
    out_scores = ([nscores[ijx] for ijx in ijs]
                  for (nscores, ijs) in zip(nscores_iter, out_ijs))
    nested_fm_iter = [[tuple(product(qfxs_, dfxs_))
                       for qfxs_, dfxs_ in zip(qfxs, dfxs)]
                      for qfxs, dfxs in zip(out_qfxs, out_dfxs)]
    all_fms = np.array(list(utool.iflatten(utool.iflatten(nested_fm_iter))),
                       dtype=hstypes.FM_DTYPE)
    nested_nmatch_list = [[len(fm) for fm in fms] for fms in nested_fm_iter]
    nested_daid_iter = ([[daid] * nMatch
                         for nMatch, daid in zip(nMatch_list, daids)]
                        for nMatch_list, daids in zip(nested_nmatch_list, out_daids))
    nested_score_iter = ([[score / nMatch] * nMatch
                          for nMatch, score in zip(nMatch_list, scores)]
                         for nMatch_list, scores in zip(nested_nmatch_list, out_scores))
    all_daids_ = np.array(list(utool.iflatten(utool.iflatten(nested_daid_iter))),
                          dtype=hstypes.INDEX_TYPE)
    all_fss = np.array(list(utool.iflatten(utool.iflatten(nested_score_iter))),
                       dtype=hstypes.FS_DTYPE)
    # Filter out 0 scores
    keep_xs = np.where(all_fss > 0)[0]
    all_fss = all_fss.take(keep_xs)
    all_fms = all_fms.take(keep_xs, axis=0)
    all_daids_ = all_daids_.take(keep_xs)
    daid_keys, groupxs = clustertool.group_indices(all_daids_)
    fs_list = clustertool.apply_grouping(all_fss, groupxs)
    fm_list = clustertool.apply_grouping(all_fms, groupxs)
    daid2_fm = {daid: fm for daid, fm in zip(daid_keys, fm_list)}
    daid2_fs = {daid: fs * daid2_sccw[daid]
                for daid, fs in zip(daid_keys, fs_list)}
    # FIXME: generalize to when nAssign > 1
    daid2_fk = {daid: np.ones(fs.size, dtype=hstypes.FK_DTYPE)
                for daid, fs in zip(daid_keys, fs_list)}
    daid2_chipmatch = (daid2_fm, daid2_fs, daid2_fk)
    return daid2_chipmatch
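

# Hedged sketch of the cmtup_old structure described in the Notation block of
# build_daid2_chipmatch2: per-daid fm/fs/fk arrays with the documented shapes
# and dtypes. The daid key and values here are invented for illustration.
def _toy_chipmatch_structure_example():
    import numpy as np
    fm = np.array([[0, 12], [3, 7]], dtype=np.int32)   # (qfx, dfx) pairs
    fs = np.array([0.8, 0.3], dtype=np.float64)        # one score per match
    fk = np.ones(fs.size, dtype=np.int16)              # ranks (all 1 when nAssign == 1)
    daid2_fm = {42: fm}
    daid2_fs = {42: fs}
    daid2_fk = {42: fk}
    return (daid2_fm, daid2_fs, daid2_fk)
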
def compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf,
                       wx2_dmaws, smk_alpha, smk_thresh, verbose=False):
    """
    Computes the sccw normalization scalar for the database annotations.
    This is gamma from the SMK paper.
    sccw is a self consistency criterion weight --- a scalar which ensures
    the score of K(X, X) = 1

    Args:
        idx2_daid ():
        wx2_drvecs ():
        wx2_dflags ():
        wx2_aids ():
        wx2_idf ():
        wx2_dmaws ():
        smk_alpha ():
        smk_thresh ():

    Returns:
        daid2_sccw

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_index
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> #tup = smk_debug.testdata_compute_data_sccw(db='testdb1')
        >>> tup = smk_debug.testdata_compute_data_sccw(db='PZ_MTEST')
        >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_drvecs, wx2_aids, qparams = tup
        >>> wx2_dflags = invindex.wx2_dflags
        >>> wx2_idxs = invindex.wx2_idxs
        >>> wx2_dmaws = invindex.wx2_dmaws
        >>> idx2_daid = invindex.idx2_daid
        >>> daids = invindex.daids
        >>> smk_alpha = qparams.smk_alpha
        >>> smk_thresh = qparams.smk_thresh
        >>> wx2_idf = wx2_idf
        >>> verbose = True
        >>> invindex.invindex_dbgstr()
        >>> invindex.report_memory()
        >>> invindex.report_memsize()
        >>> daid2_sccw = smk_index.compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf, wx2_dmaws, smk_alpha, smk_thresh, verbose)
    """
    #for wx in wx_sublist:
    #    print(len(wx2_dmaws
    verbose_ = ut.VERBOSE or verbose
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)
    if not ut.QUIET:
        print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
    if verbose_:
        print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' % (smk_alpha, smk_thresh))
        mark1, end1_ = ut.log_progress(
            '[smk_index.sccw] SCCW group (by present words): ',
            len(wx2_drvecs), freq=100, with_time=WITH_TOTALTIME)
    # Group by daids first and then by word index
    # Get list of aids and rvecs w.r.t. words (i.e. one item per word)
    wx_sublist = np.array(list(wx2_drvecs.keys()))
    aids_perword = [wx2_aids[wx] for wx in wx_sublist]
    # wx_list1: Lays out word indexes for each annotation
    # tx_list1: Temporary within annotation subindex + wx uniquely identifies
    # item in wx2_drvecs, wx2_dflags, and wx2_dmaws
    # Flatten out indexes to perform grouping
    flat_aids = np.hstack(aids_perword)
    count = len(flat_aids)
    txs_perword = [np.arange(aids.size) for aids in aids_perword]
    flat_txs = np.hstack(txs_perword)
    # fromiter is faster for flat_wxs because it is not a list of numpy arrays
    wxs_perword = ([wx] * len(aids) for wx, aids in zip(wx_sublist, aids_perword))
    flat_wxs = np.fromiter(ut.iflatten(wxs_perword), hstypes.INDEX_TYPE, count)
    # Group flat indexes by annotation id
    unique_aids, annot_groupxs = clustertool.group_indices(flat_aids)
    # Wxs and Txs grouped by annotation id
    wxs_perannot = clustertool.apply_grouping_iter(flat_wxs, annot_groupxs)
    txs_perannot = clustertool.apply_grouping_iter(flat_txs, annot_groupxs)
    # Group by word inside each annotation group
    wxsubgrouping_perannot = [clustertool.group_indices(wxs)
                              for wxs in wxs_perannot]
    word_groupxs_perannot = (groupxs for wxs, groupxs in wxsubgrouping_perannot)
    txs_perword_perannot = [clustertool.apply_grouping(txs, groupxs)
                            for txs, groupxs in
                            zip(txs_perannot, word_groupxs_perannot)]
    wxs_perword_perannot = [wxs for wxs, groupxs in wxsubgrouping_perannot]

    # Group relevant data for sccw measure by word for each annotation grouping
    def _vector_subgroup_by_wx(wx2_arr, wxs_perword_perannot, txs_perword_perannot):
        return [[wx2_arr[wx].take(txs, axis=0)
                 for wx, txs in zip(wx_perword_, txs_perword_)]
                for wx_perword_, txs_perword_ in
                zip(wxs_perword_perannot, txs_perword_perannot)]

    def _scalar_subgroup_by_wx(wx2_scalar, wxs_perword_perannot):
        return [[wx2_scalar[wx] for wx in wxs] for wxs in wxs_perword_perannot]

    subgrouped_drvecs = _vector_subgroup_by_wx(wx2_drvecs, wxs_perword_perannot, txs_perword_perannot)
    subgrouped_dmaws = _vector_subgroup_by_wx(wx2_dmaws, wxs_perword_perannot, txs_perword_perannot)
    # If we aren't using dmaws replace it with an infinite None iterator
    #subgrouped_dmaws = iter(lambda: None, 1)
    subgrouped_dflags = _vector_subgroup_by_wx(wx2_dflags, wxs_perword_perannot, txs_perword_perannot)
    #subgrouped_dflags = iter(lambda: None, 1)
    subgrouped_idfs = _scalar_subgroup_by_wx(wx2_idf, wxs_perword_perannot)

    if verbose_:
        end1_()
        mark2, end2_ = ut.log_progress(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
                                       total=len(unique_aids), freq=100,
                                       with_time=WITH_TOTALTIME)
        progiter = ut.ProgressIter(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
                                   total=len(unique_aids), freq=10,
                                   with_time=WITH_TOTALTIME)
    else:
        progiter = ut.identity

    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_data_smksumm(subgrouped_idfs, subgrouped_drvecs)

    sccw_list = [
        smk_scoring.sccw_summation(rvecs_list, flags_list, idf_list, maws_list,
                                   smk_alpha, smk_thresh)
        for rvecs_list, flags_list, maws_list, idf_list in
        progiter(zip(subgrouped_drvecs, subgrouped_dflags, subgrouped_dmaws,
                     subgrouped_idfs))
    ]
    daid2_sccw = dict(zip(unique_aids, sccw_list))

    if verbose_:
        end2_()
        print('[smk_index.sccw] L___ End Compute Data SCCW\n')

    return daid2_sccw
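

# Conceptual sketch of the SCCW (gamma) normalization, assuming the usual SMK
# scoring form: selfscore = sum_w idf_w * selectivity(<phi_w, phi_w>) with
# selectivity(u) = sign(u) * |u|**alpha zeroed below a threshold, and
# sccw = 1 / sqrt(selfscore). This mirrors the intent of sccw_summation but is
# not its actual implementation; vectors, idfs, and defaults are invented.
def _toy_sccw_example(alpha=3.0, thresh=0.0):
    import numpy as np
    # one aggregated (unit) residual per word for a single annotation
    rvecs = np.array([[0.6, 0.8], [1.0, 0.0]])
    idfs = np.array([1.5, 0.7])
    dots = np.sum(rvecs * rvecs, axis=1)       # self similarities (1.0 for unit vecs)
    scores = np.sign(dots) * np.abs(dots) ** alpha
    scores[scores <= thresh] = 0.0
    selfscore = float(np.sum(idfs * scores))
    # scaling the kernel by sccw**2 (once per annotation) makes K(X, X) = 1
    return 1.0 / np.sqrt(selfscore) if selfscore > 0 else 1.0
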
def compute_negentropy_names(aids_list, daid2_label):
    r"""
    One of our idf extensions

    Word weighting based on the negative entropy over all names of p(n_i | word)

    Args:
        aids_list (list of aids):
        daid2_label (dict from daid to label):

    Returns:
        negentropy_list (ndarray[float32]): idf-like weighting for each word
            based on the negative entropy

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _

    Math::
        p(n_i | \word) = \sum_{\lbl \in L_i} p(\lbl | \word)

        p(\lbl | \word) = \frac{p(\word | \lbl) p(\lbl)}{p(\word)}

        p(\word) = \sum_{\lbl' \in L} p(\word | \lbl') p(\lbl')

        p(\word | \lbl) = NumAnnotOfLabelWithWord / NumAnnotWithLabel
                        = \frac{\sum_{\X \in \DB_\lbl} b(\word, \X)}{\card{\DB_\lbl}}

        h(n_i | word) = -\sum_{i=1}^N p(n_i | \word) \log p(n_i | \word)

        word_weight = log(N) - h(n | word)

    CommandLine:
        python dev.py -t smk2 --allgt --db GZ_ALL
        python dev.py -t smk5 --allgt --db GZ_ALL

    Auto:
        python -c "import utool as ut; ut.print_auto_docstr('ibeis.algo.hots.smk.smk_index', 'compute_negentropy_names')"
    """
    nWords = len(aids_list)
    # --- LABEL MEMBERS w.r.t daids ---
    # compute mapping from label to daids
    # Translate tuples into scalars for efficiency
    label_list = list(daid2_label.values())
    lblindex_list = np.array(ut.tuples_to_unique_scalars(label_list))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]
    # --- DAID MEMBERS w.r.t. words ---
    # compute mapping from daid to word indexes
    # finds all the words that belong to an annotation
    daid2_wxs = ut.ddict(list)
    for wx, _daids in enumerate(aids_list):
        for daid in _daids:
            daid2_wxs[daid].append(wx)
    # --- \Pr(\word \given \lbl) for each label ---
    # Compute the number of annotations in a label with the word vs
    # the number of annotations in the label
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    # Get num times word appears for each label
    probWordGivenLabel_list = []
    for lblindex, _daids in lblindex2_daids:
        nAnnotOfLabelWithWord = np.zeros(nWords, dtype=np.int32)
        for daid in _daids:
            wxs = np.unique(daid2_wxs[daid])
            nAnnotOfLabelWithWord[wxs] += 1
        probWordGivenLabel = nAnnotOfLabelWithWord.astype(np.float64) / len(_daids)
        probWordGivenLabel_list.append(probWordGivenLabel)
    # (nLabels, nWords)
    probWordGivenLabel_arr = np.array(probWordGivenLabel_list)
    # --- \Pr(\lbl \given \word) ---
    # compute partition function that approximates probability of a word
    # (1, nWords)
    probWord = probWordGivenLabel_arr.sum(axis=0)
    probWord.shape = (1, probWord.size)
    # (nLabels, nWords)
    probLabelGivenWord_arr = (probWordGivenLabel_arr / probWord)
    # --- \Pr(\name \given \lbl) ---
    # get names for each unique label
    nid_list = np.array([label_list[xs[0]][0] for xs in groupxs])
    unique_nids, groupxs_ = clustertool.group_indices(nid_list)
    # (nNames, nWords)
    # add a little wiggle room
    eps = 1E-9
    # http://stackoverflow.com/questions/872544/precision-of-floating-point
    #epsilon = 2^(E-52)  # For a 64-bit float (double precision)
    #epsilon = 2^(E-23)  # For a 32-bit float (single precision)
    #epsilon = 2^(E-10)  # For a 16-bit float (half precision)
    probNameGivenWord = eps + (1.0 - eps) * np.array(
        [probLabelGivenWord_arr.take(xs, axis=0).sum(axis=0) for xs in groupxs_])
    logProbNameGivenWord = np.log(probNameGivenWord)
    wordNameEntropy = -(probNameGivenWord * logProbNameGivenWord).sum(0)
    # Compute negative entropy for weights
    nNames = len(nid_list)
    negentropy_list = np.log(nNames) - wordNameEntropy
    return negentropy_list
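

# Small numeric sketch of the negentropy weighting from the Math block above:
# given p(name | word) for a single word, the weight is log(nNames) minus the
# entropy of that distribution. The probabilities are invented and assumed
# already normalized.
def _toy_negentropy_example():
    import numpy as np
    prob_name_given_word = np.array([0.7, 0.2, 0.1])   # distribution over 3 names
    entropy = -np.sum(prob_name_given_word * np.log(prob_name_given_word))
    n_names = prob_name_given_word.size
    # high when the word is concentrated on few names, ~0 when spread uniformly
    return np.log(n_names) - entropy
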