def OLD_compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_aids, wx2_idf,
                           wx2_dmaws, smk_alpha, smk_thresh, verbose=False):
    """ Deprecated precursor of compute_data_sccw_ """
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.rrr()
        smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)
    with ut.Timer('timer_orig1'):
        # list() is needed under Python 3, where dict.keys() is a view
        wx_sublist = np.array(list(wx2_drvecs.keys()))
        if not ut.QUIET:
            print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
        if ut.VERBOSE or verbose:
            print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' %
                  (smk_alpha, smk_thresh))
            mark1, end1_ = ut.log_progress(
                '[smk_index.sccw] SCCW group (by present words): ',
                len(wx_sublist), freq=100, with_time=WITH_TOTALTIME)
        # Get list of aids and rvecs w.r.t. words
        aids_list = [wx2_aids[wx] for wx in wx_sublist]
        rvecs_list1 = [wx2_drvecs[wx] for wx in wx_sublist]
        maws_list = [wx2_dmaws[wx] for wx in wx_sublist]
        if ut.DEBUG2:
            from ibeis.algo.hots.smk import smk_debug
            smk_debug.assert_single_assigned_maws(maws_list)
        # Group by daids first and then by word index
        daid2_wx2_drvecs = clustertool.double_group(wx_sublist, aids_list,
                                                    rvecs_list1)
        if ut.VERBOSE or verbose:
            end1_()
    # For every daid, compute its sccw using pregrouped rvecs
    # Summation over words for each aid
    if ut.VERBOSE or verbose:
        mark2, end2_ = ut.log_progress(
            '[smk_index.sccw] SCCW Sum (over daid): ',
            len(daid2_wx2_drvecs), freq=25, with_time=WITH_TOTALTIME)
    # Get lists w.r.t daids
    aid_list = list(daid2_wx2_drvecs.keys())
    # list of mappings from words to rvecs foreach daid
    # [wx2_aidrvecs_1, ..., wx2_aidrvecs_nDaids,]
    _wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    _aidwxs_iter = (list(wx2_aidrvecs.keys()) for wx2_aidrvecs in _wx2_aidrvecs_list)
    aidrvecs_list = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in _wx2_aidrvecs_list]
    aididf_list = [[wx2_idf[wx] for wx in aidwxs] for aidwxs in _aidwxs_iter]
    with ut.Timer('timer_orig2'):
        if ut.DEBUG2:
            from ibeis.algo.hots.smk import smk_debug
            smk_debug.check_data_smksumm(aididf_list, aidrvecs_list)
        # TODO: implement database side soft-assign
        sccw_list = [smk_scoring.sccw_summation(rvecs_list, None, idf_list, None,
                                                smk_alpha, smk_thresh)
                     for idf_list, rvecs_list in zip(aididf_list, aidrvecs_list)]
        daid2_sccw = dict(zip(aid_list, sccw_list))
    if ut.VERBOSE or verbose:
        end2_()
        print('[smk_index.sccw] L___ End Compute Data SCCW\n')
    return daid2_sccw
def compute_data_gamma_(invindex, use_cache=True):
    """
    >>> from ibeis.model.hots.smk.smk import *  # NOQA
    >>> ibs, annots_df, taids, daids, qaids, nWords = testdata()
    >>> words = learn_visual_words(annots_df, taids, nWords)
    >>> with_internals = True
    >>> invindex = index_data_annots(annots_df, daids, words, with_internals)
    >>> daid2_gamma = compute_data_gamma_(invindex, use_cache=True)
    """
    cache_key = utool.hashstr(invindex.get_cfgstr())
    if use_cache:
        try:
            daid2_gamma = utool.global_cache_read(cache_key, appname='smk')
            #print('gamma_dbg cache hit')
            return daid2_gamma
        except Exception:
            pass
    # Grouping by aid and words
    mark, end_ = utool.log_progress(('gamma grouping %s ' % (cache_key,)),
                                    invindex.wx2_drvecs.shape[0], flushfreq=100)
    daid2_wx2_drvecs = utool.ddict(dict)
    for count, wx in enumerate(invindex.wx2_drvecs.index):
        if count % 100 == 0:
            mark(wx)
        group = invindex.wx2_drvecs[wx].groupby(invindex.idx2_daid)
        for daid, vecs in group:
            daid2_wx2_drvecs[daid][wx] = vecs.values
    end_()
    # Summation over words for each aid
    mark, end_ = utool.log_progress('gamma summation ', len(daid2_wx2_drvecs),
                                    flushfreq=100)
    daid2_gamma = pd.Series(np.zeros(invindex.daids.shape[0]),
                            index=invindex.daids, name='gamma')
    wx2_weight = invindex.wx2_weight
    for count, (daid, wx2_drvecs) in enumerate(six.iteritems(daid2_wx2_drvecs)):
        if count % 100 == 0:
            mark(count)
        wx2_rvecs = wx2_drvecs
        daid2_gamma[daid] = gamma_summation(wx2_rvecs, wx2_weight)
    utool.global_cache_write(cache_key, daid2_gamma, appname='smk')
    return daid2_gamma
def compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids, daid2_label=None,
                      vocab_weighting='idf', verbose=False):
    """
    Computes the inverse-document-frequency weighting for each word

    Args:
        wx_series ():
        wx2_idxs ():
        idx2_aid ():
        daids ():
        daid2_label ():
        vocab_weighting ():

    Returns:
        wx2_idf

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> wx2_idf = compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids)
        >>> result = str(len(wx2_idf))
        >>> print(result)
        8000

    Ignore:
        #>>> wx2_idxs = invindex.wx2_idxs

    Auto:
        from ibeis.algo.hots.smk import smk_index
        import utool as ut; print(ut.make_default_docstr(smk_index.compute_word_idf_))
    """
    if not ut.QUIET:
        print('[smk_index.idf] +--- Start Compute IDF')
    if ut.VERBOSE or verbose:
        mark, end_ = ut.log_progress('[smk_index.idf] Word IDFs: ',
                                     len(wx_series), freq=50,
                                     with_time=WITH_TOTALTIME)
    idxs_list, aids_list = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
    # TODO: Integrate different idf measures
    if vocab_weighting == 'idf':
        idf_list = compute_idf_orig(aids_list, daids)
    elif vocab_weighting == 'negentropy':
        assert daid2_label is not None
        idf_list = compute_idf_label1(aids_list, daid2_label)
    else:
        raise AssertionError('unknown option vocab_weighting=%r' % vocab_weighting)
    if ut.VERBOSE or verbose:
        end_()
        print('[smk_index.idf] L___ End Compute IDF')
    wx2_idf = dict(zip(wx_series, idf_list))
    return wx2_idf
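# A minimal standalone sketch (not part of the pipeline) of the classic
# inverse-document-frequency weighting that the 'idf' branch above is
# assumed to reduce to: idf_w = ln(N / n_w), where N is the number of
# database annotations and n_w is the number of annotations containing
# word w (see the inline computation in the older versions below).
# The helper name `_idf_sketch` is hypothetical.
def _idf_sketch(aids_list, daids):
    import numpy as np
    nTotalDocs = len(daids)
    # number of distinct annotations ("documents") containing each word
    nDocsWithWord = np.array([len(set(aids)) for aids in aids_list])
    # unassigned words get zero weight; max() guards the division
    idf = np.where(nDocsWithWord > 0,
                   np.log(float(nTotalDocs) / np.maximum(nDocsWithWord, 1)),
                   0.0)
    return idf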
def compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids,
                      with_pandas=WITH_PANDAS):
    """
    Returns the inverse-document-frequency weighting for each word

    internals step 2

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs = smk_debug.testdata_raw_internals1()
    >>> wx_series = invindex.words.index
    >>> idx2_aid = invindex.idx2_daid
    >>> wx2_idf = compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids)
    >>> print(wx2_idf.shape)
    (8000,)

    #>>> wx2_idxs = invindex.wx2_idxs
    """
    if utool.VERBOSE:
        mark, end_ = utool.log_progress('[smk_index] Word IDFs: ',
                                        len(wx_series), flushfreq=500,
                                        writefreq=50)
        mark(0)
    wx_series_values = pdh.ensure_values(wx_series)
    idx2_aid_values = pdh.ensure_values(idx2_aid)
    wx2_idxs_values = pdh.ensure_values_subset(wx2_idxs, wx_series_values)
    #with utool.Timer('method 1'):  # 0.16s
    idxs_list = [pdh.ensure_values(idxs).astype(INDEX_TYPE)
                 for idxs in wx2_idxs_values]  # 11%
    aids_list = [idx2_aid_values.take(idxs) if len(idxs) > 0 else []
                 for idxs in idxs_list]
    nTotalDocs = len(daids)
    nDocsWithWord_list = [len(set(aids)) for aids in aids_list]  # 68%
    # compute idf half of tf-idf weighting
    idf_list = [np.log(nTotalDocs / nDocsWithWord).astype(FLOAT_TYPE)
                if nDocsWithWord > 0 else 0.0
                for nDocsWithWord in nDocsWithWord_list]  # 17.8 ms  # 13%
    if utool.VERBOSE:
        end_()
    if with_pandas:
        wx2_idf = pdh.IntSeries(idf_list, index=wx_series, name='idf')
    else:
        wx2_idf = dict(zip(wx_series_values, idf_list))
    return wx2_idf
def match_kernel_L1(qindex, invindex, qparams):
    """ Builds up information and does verbosity before going to L0 """
    # Unpack Query
    (wx2_qrvecs, wx2_qflags, wx2_qmaws, wx2_qaids, wx2_qfxs, query_sccw) = qindex
    # Unpack Database
    wx2_drvecs = invindex.wx2_drvecs
    wx2_idf = invindex.wx2_idf
    wx2_daid = invindex.wx2_aids
    wx2_dflags = invindex.wx2_dflags
    daid2_sccw = invindex.daid2_sccw
    smk_alpha = qparams.smk_alpha
    smk_thresh = qparams.smk_thresh
    # for each word compute the pairwise scores between matches
    common_wxs = set(wx2_qrvecs.keys()).intersection(set(wx2_drvecs.keys()))
    # Build lists over common word indexes
    qrvecs_list = [wx2_qrvecs[wx] for wx in common_wxs]
    drvecs_list = [wx2_drvecs[wx] for wx in common_wxs]
    daids_list = [wx2_daid[wx] for wx in common_wxs]
    idf_list = [wx2_idf[wx] for wx in common_wxs]
    qmaws_list = [wx2_qmaws[wx] for wx in common_wxs]  # NOQA
    dflags_list = [wx2_dflags[wx] for wx in common_wxs]  # NOQA
    qflags_list = [wx2_qflags[wx] for wx in common_wxs]
    dmaws_list = None
    if utool.VERBOSE:
        mark, end_ = utool.log_progress('[smk_core] query word: ',
                                        len(common_wxs), flushfreq=100,
                                        writefreq=25, with_totaltime=True)
    #--------
    retL0 = match_kernel_L0(qrvecs_list, drvecs_list, qflags_list, dflags_list,
                            qmaws_list, dmaws_list, smk_alpha, smk_thresh,
                            idf_list, daids_list, daid2_sccw, query_sccw)
    (daid2_totalscore, scores_list, daid_agg_keys) = retL0
    #print('[smk_core] Matched %d daids' % daid2_totalscore.keys())
    #utool.embed()
    retL1 = (daid2_totalscore, common_wxs, scores_list, daids_list)
    #--------
    if utool.VERBOSE:
        end_()
        print('[smk_core] Matched %d daids. nAssign=%r' %
              (len(daid2_totalscore.keys()), qparams.nAssign))
    return retL1
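# A rough sketch (under assumptions) of the per-word score that
# match_kernel_L0 is expected to aggregate: cosine similarities between
# query and database residuals are passed through the SMK selectivity
# function sign(u)|u|^alpha (thresholded), scaled by the word's idf, and
# summed per database annotation id. `_word_score_sketch` is hypothetical;
# the real kernel also applies maws, flags, and the sccw normalizers.
def _word_score_sketch(qrvecs, drvecs, daids, idf, smk_alpha, smk_thresh):
    import numpy as np
    cossim = qrvecs.dot(drvecs.T)
    scores = np.sign(cossim) * np.abs(cossim) ** smk_alpha
    scores[cossim <= smk_thresh] = 0.0
    daid2_score = {}
    # one daid per database residual column; accumulate per annotation
    for daid, colscore in zip(daids, idf * scores.sum(axis=0)):
        daid2_score[daid] = daid2_score.get(daid, 0.0) + colscore
    return daid2_score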
def compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids):
    """
    Returns the inverse-document-frequency weighting for each word

    internals step 2

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs = smk_debug.testdata_raw_internals1()
    >>> wx_series = invindex.words.index
    >>> idx2_aid = invindex.idx2_daid
    >>> wx2_idf = compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids)
    >>> print(wx2_idf.shape)
    (8000,)

    #>>> wx2_idxs = invindex.wx2_idxs
    """
    if utool.VERBOSE:
        mark, end_ = utool.log_progress('[smk_index] Word IDFs: ',
                                        len(wx_series), flushfreq=500,
                                        writefreq=50, with_totaltime=True)
        mark(0)
    wx_series_values = pdh.ensure_values(wx_series)
    idx2_aid_values = pdh.ensure_values(idx2_aid)
    wx2_idxs_values = pdh.ensure_values_subset(wx2_idxs, wx_series_values)
    #with utool.Timer('method 1'):  # 0.16s
    idxs_list = [pdh.ensure_values(idxs).astype(INDEX_TYPE)
                 for idxs in wx2_idxs_values]  # 11%
    aids_list = [idx2_aid_values.take(idxs) if len(idxs) > 0 else []
                 for idxs in idxs_list]
    nTotalDocs = len(daids)
    nDocsWithWord_list = [len(set(aids)) for aids in aids_list]  # 68%
    # compute idf half of tf-idf weighting
    idf_list = [np.log(nTotalDocs / nDocsWithWord).astype(FLOAT_TYPE)
                if nDocsWithWord > 0 else 0.0
                for nDocsWithWord in nDocsWithWord_list]  # 17.8 ms  # 13%
    if utool.VERBOSE:
        end_()
    if WITH_PANDAS:
        wx2_idf = pdh.IntSeries(idf_list, index=wx_series, name='idf')
    else:
        wx2_idf = dict(zip(wx_series_values, idf_list))
    return wx2_idf
def match_kernel(wx2_qrvecs, wx2_qfxs, invindex, qaid):
    """
    >>> from ibeis.model.hots.smk.smk import *  # NOQA
    >>> ibs, annots_df, taids, daids, qaids, nWords = testdata()
    >>> words = learn_visual_words(annots_df, taids, nWords)
    >>> invindex = index_data_annots(annots_df, daids, words)
    >>> qaid = qaids[0]
    >>> wx2_qfxs, wx2_qrvecs = compute_query_repr(annots_df, qaid, invindex)
    >>> daid2_totalscore = match_kernel(wx2_qrvecs, wx2_qfxs, invindex, qaid)
    """
    _daids = invindex.daids
    idx2_daid = invindex.idx2_daid
    wx2_drvecs = invindex.wx2_drvecs
    wx2_weight = invindex.wx2_weight
    daid2_gamma = invindex.daid2_gamma

    wx2_rvecs = wx2_qrvecs
    query_gamma = gamma_summation(wx2_rvecs, wx2_weight)

    # Accumulate scores over the entire database
    daid2_aggscore = pd.Series(np.zeros(len(_daids)), index=_daids,
                               name='total_score')
    common_wxs = set(wx2_qrvecs.keys()).intersection(set(wx2_drvecs.keys()))
    daid2_wx2_scoremat = utool.ddict(lambda: utool.ddict(list))
    # for each word compute the pairwise scores between matches
    mark, end = utool.log_progress('query word: ', len(common_wxs),
                                   flushfreq=100)
    for count, wx in enumerate(common_wxs):
        if count % 100 == 0:
            mark(count)
        # Query and database vectors for wx-th word
        qrvecs = wx2_qrvecs[wx]
        drvecs = wx2_drvecs[wx]
        # Word Weight
        weight = wx2_weight[wx]
        # Compute score matrix
        qfx2_wscore = Match_N(qrvecs, drvecs)
        # Group scores by database annotation ids
        group = qfx2_wscore.groupby(idx2_daid, axis=1)
        for daid, scoremat in group:
            daid2_wx2_scoremat[daid][wx] = scoremat
        #qfx2_wscore = pd.DataFrame(qfx2_wscore_, index=qfxs, columns=_idxs)
        daid2_wscore = weight * qfx2_wscore.sum(axis=0).groupby(idx2_daid).sum()
        daid2_aggscore = daid2_aggscore.add(daid2_wscore, fill_value=0)
    daid2_totalscore = daid2_aggscore * daid2_gamma * query_gamma
    end()

    daid_fm = {}
    daid_fs = {}
    daid_fk = {}
    mark, end = utool.log_progress('accumulating match info: ',
                                   len(daid2_wx2_scoremat), flushfreq=100)
    for count, item in enumerate(daid2_wx2_scoremat.items()):
        daid, wx2_scoremat = item
        if count % 25 == 0:
            mark(count)
        fm_accum = []
        fs_accum = []
        fk_accum = []
        for wx, scoremat in six.iteritems(wx2_scoremat):
            qfxs = scoremat.index
            dfxs = invindex.idx2_dfx[scoremat.columns]
            fm_ = np.vstack(np.dstack(np.meshgrid(qfxs, dfxs, indexing='ij')))
            fs_ = scoremat.values.flatten()
            lower_thresh = 0.01
            # keep only (qfx, dfx) pairs whose score exceeds the threshold
            valid = fs_ > lower_thresh
            fm = fm_[valid]
            fs = fs_[valid]
            fk = np.ones(len(fm), dtype=np.int32)
            fm_accum.append(fm)
            fs_accum.append(fs)
            fk_accum.append(fk)
        daid_fm[daid] = np.vstack(fm_accum)
        daid_fs[daid] = np.hstack(fs_accum).T
        daid_fk[daid] = np.hstack(fk_accum).T
    chipmatch = (daid_fm, daid_fs, daid_fk,)
    daid2_totalscore.sort(ascending=False)
    return daid2_totalscore, chipmatch
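# Worked example of the meshgrid trick used above to turn one word's
# (nQfx x nDfx) score matrix into feature-match rows (fm) and scores (fs).
# The toy feature indexes and scores here are illustrative only.
def _scoremat_to_matches_demo():
    import numpy as np
    qfxs = np.array([3, 7])          # query feature indexes for this word
    dfxs = np.array([12, 40, 41])    # database feature indexes for this word
    scoremat = np.array([[0.5, 0.0, 0.2],
                         [0.0, 0.3, 0.0]])
    # every (qfx, dfx) cell becomes one candidate match row
    fm_ = np.vstack(np.dstack(np.meshgrid(qfxs, dfxs, indexing='ij')))
    fs_ = scoremat.flatten()
    valid = fs_ > 0.01
    fm, fs = fm_[valid], fs_[valid]
    # fm == [[3 12], [3 41], [7 40]]; fs == [0.5, 0.2, 0.3]
    return fm, fs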
def compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf,
                       wx2_dmaws, smk_alpha, smk_thresh, verbose=False):
    """
    Computes the sccw normalization scalar for each database annotation.
    This is gamma from the SMK paper.

    sccw is a self consistency criterion weight --- a scalar which ensures
    the score of K(X, X) = 1

    Args:
        idx2_daid ():
        wx2_drvecs ():
        wx2_dflags ():
        wx2_aids ():
        wx2_idf ():
        wx2_dmaws ():
        smk_alpha ():
        smk_thresh ():

    Returns:
        daid2_sccw

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_index
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> #tup = smk_debug.testdata_compute_data_sccw(db='testdb1')
        >>> tup = smk_debug.testdata_compute_data_sccw(db='PZ_MTEST')
        >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_drvecs, wx2_aids, qparams = tup
        >>> wx2_dflags = invindex.wx2_dflags
        >>> wx2_idxs = invindex.wx2_idxs
        >>> wx2_dmaws = invindex.wx2_dmaws
        >>> idx2_daid = invindex.idx2_daid
        >>> daids = invindex.daids
        >>> smk_alpha = qparams.smk_alpha
        >>> smk_thresh = qparams.smk_thresh
        >>> verbose = True
        >>> invindex.invindex_dbgstr()
        >>> invindex.report_memory()
        >>> invindex.report_memsize()
        >>> daid2_sccw = smk_index.compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf, wx2_dmaws, smk_alpha, smk_thresh, verbose)
    """
    #for wx in wx_sublist:
    #    print(len(wx2_dmaws[wx]))
    verbose_ = ut.VERBOSE or verbose
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)
    if not ut.QUIET:
        print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
    if verbose_:
        print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' %
              (smk_alpha, smk_thresh))
        mark1, end1_ = ut.log_progress(
            '[smk_index.sccw] SCCW group (by present words): ',
            len(wx2_drvecs), freq=100, with_time=WITH_TOTALTIME)

    # Group by daids first and then by word index
    # Get list of aids and rvecs w.r.t. words (ie one item per word)
    wx_sublist = np.array(list(wx2_drvecs.keys()))
    aids_perword = [wx2_aids[wx] for wx in wx_sublist]

    # flat_wxs lays out the word index of each flattened entry
    # flat_txs is a temporary within-annotation subindex; (wx, tx) uniquely
    # identifies an item in wx2_drvecs, wx2_dflags, and wx2_dmaws

    # Flatten out indexes to perform grouping
    flat_aids = np.hstack(aids_perword)
    count = len(flat_aids)
    txs_perword = [np.arange(aids.size) for aids in aids_perword]
    flat_txs = np.hstack(txs_perword)
    # fromiter is faster for flat_wxs because it is not a list of numpy arrays
    wxs_perword = ([wx] * len(aids) for wx, aids in zip(wx_sublist, aids_perword))
    flat_wxs = np.fromiter(ut.iflatten(wxs_perword), hstypes.INDEX_TYPE, count)

    # Group flat indexes by annotation id
    unique_aids, annot_groupxs = clustertool.group_indices(flat_aids)

    # Wxs and Txs grouped by annotation id
    wxs_perannot = clustertool.apply_grouping_iter(flat_wxs, annot_groupxs)
    txs_perannot = clustertool.apply_grouping_iter(flat_txs, annot_groupxs)

    # Group by word inside each annotation group
    wxsubgrouping_perannot = [clustertool.group_indices(wxs)
                              for wxs in wxs_perannot]
    word_groupxs_perannot = (groupxs for wxs, groupxs in wxsubgrouping_perannot)
    txs_perword_perannot = [clustertool.apply_grouping(txs, groupxs)
                            for txs, groupxs in
                            zip(txs_perannot, word_groupxs_perannot)]
    wxs_perword_perannot = [wxs for wxs, groupxs in wxsubgrouping_perannot]

    # Group relevant data for sccw measure by word for each annotation grouping
    def _vector_subgroup_by_wx(wx2_arr, wxs_perword_perannot, txs_perword_perannot):
        return [[wx2_arr[wx].take(txs, axis=0)
                 for wx, txs in zip(wx_perword_, txs_perword_)]
                for wx_perword_, txs_perword_ in
                zip(wxs_perword_perannot, txs_perword_perannot)]

    def _scalar_subgroup_by_wx(wx2_scalar, wxs_perword_perannot):
        return [[wx2_scalar[wx] for wx in wxs] for wxs in wxs_perword_perannot]

    subgrouped_drvecs = _vector_subgroup_by_wx(wx2_drvecs, wxs_perword_perannot,
                                               txs_perword_perannot)
    subgrouped_dmaws = _vector_subgroup_by_wx(wx2_dmaws, wxs_perword_perannot,
                                              txs_perword_perannot)
    # If we aren't using dmaws replace it with an infinite None iterator
    #subgrouped_dmaws = iter(lambda: None, 1)
    subgrouped_dflags = _vector_subgroup_by_wx(wx2_dflags, wxs_perword_perannot,
                                               txs_perword_perannot)
    #subgrouped_dflags = iter(lambda: None, 1)
    subgrouped_idfs = _scalar_subgroup_by_wx(wx2_idf, wxs_perword_perannot)

    if verbose_:
        end1_()
        mark2, end2_ = ut.log_progress(
            lbl='[smk_index.sccw] SCCW Sum (over daid): ',
            total=len(unique_aids), freq=100, with_time=WITH_TOTALTIME)
        progiter = ut.ProgressIter(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
                                   total=len(unique_aids), freq=10,
                                   with_time=WITH_TOTALTIME)
    else:
        progiter = ut.identity

    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_data_smksumm(subgrouped_idfs, subgrouped_drvecs)

    sccw_list = [
        smk_scoring.sccw_summation(rvecs_list, flags_list, idf_list,
                                   maws_list, smk_alpha, smk_thresh)
        for rvecs_list, flags_list, maws_list, idf_list in
        progiter(zip(subgrouped_drvecs, subgrouped_dflags,
                     subgrouped_dmaws, subgrouped_idfs))
    ]
    daid2_sccw = dict(zip(unique_aids, sccw_list))

    if verbose_:
        end2_()
        print('[smk_index.sccw] L___ End Compute Data SCCW\n')
    return daid2_sccw
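# A plain-numpy sketch of the flatten-then-group pattern above.
# clustertool.group_indices is assumed to return the unique ids and, for
# each id, the positions where it occurs; `_group_indices_sketch` is a
# hypothetical stand-in written for illustration.
def _group_indices_sketch(ids):
    import numpy as np
    sortx = ids.argsort(kind='mergesort')  # stable sort keeps original order
    sorted_ids = ids[sortx]
    boundaries = np.flatnonzero(np.diff(sorted_ids)) + 1
    groupxs = np.split(sortx, boundaries)
    unique_ids = sorted_ids[np.hstack(([0], boundaries))]
    return unique_ids, groupxs

# e.g. flat annotation ids for six (word, feature) entries:
#   _group_indices_sketch(np.array([4, 2, 4, 2, 2, 7]))
#   -> unique_ids [2 4 7], groupxs [[1, 3, 4], [0, 2], [5]]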
def compute_residuals_(words, wx2_idxs, wx2_maws, idx2_vec, idx2_aid,
                       idx2_fx, aggregate, verbose=False):
    """
    Computes residual vectors based on word assignments
    returns mapping from word index to a set of residual vectors

    Args:
        words (ndarray):
        wx2_idxs (dict):
        wx2_maws (dict):
        idx2_vec (dict):
        idx2_aid (dict):
        idx2_fx (dict):
        aggregate (bool):
        verbose (bool):

    Returns:
        tuple : (wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws) formatted as::

            * wx2_rvecs - [ ... [ rvec_i1, ...,  rvec_Mi ]_i ... ]
            * wx2_aids  - [ ... [  aid_i1, ...,   aid_Mi ]_i ... ]
            * wx2_fxs   - [ ... [[fxs]_i1, ..., [fxs]_Mi ]_i ... ]

        For every word::

            * list of aggvecs
            * For every aggvec:
                * one parent aid, if aggregate is False: assert isunique(aids)
                * list of parent fxs, if aggregate is True: assert len(fxs) == 1

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> words = invindex.words
        >>> idx2_aid = invindex.idx2_daid
        >>> idx2_fx = invindex.idx2_dfx
        >>> idx2_vec = invindex.idx2_dvec
        >>> aggregate = ibs.cfg.query_cfg.smk_cfg.aggregate
        >>> wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags = compute_residuals_(words, wx2_idxs, wx2_maws, idx2_vec, idx2_aid, idx2_fx, aggregate)
    """
    if not ut.QUIET:
        print('[smk_index.rvec] +--- Start Compute Residuals')
    # list() is needed under Python 3, where dict.keys() is a view
    wx_sublist = np.array(list(wx2_idxs.keys()))
    # Build lists w.r.t. words
    idxs_list = [wx2_idxs[wx].astype(hstypes.INDEX_TYPE) for wx in wx_sublist]
    aids_list = [idx2_aid.take(idxs) for idxs in idxs_list]
    if ut.DEBUG2:
        #assert np.all(np.diff(wx_sublist) == 1), 'not dense'
        assert all([len(a) == len(b) for a, b in
                    zip(idxs_list, aids_list)]), 'bad alignment'
        assert idx2_vec.shape[0] == idx2_fx.shape[0]
        assert idx2_vec.shape[0] == idx2_aid.shape[0]
    # Prealloc output
    if ut.VERBOSE or verbose:
        #print('[smk_index.rvec] Residual Vectors for %d words. aggregate=%r' %
        #      (len(wx2_idxs), aggregate,))
        lbl = ('[smk_index.rvec] agg rvecs' if aggregate
               else '[smk_index.rvec] nonagg rvecs')
        mark, end_ = ut.log_progress(lbl, len(wx2_idxs), freq=50, with_time=True)
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2_idxs(wx2_idxs, len(words))
    # Compute Residuals
    rvecs_list, flags_list = smk_residuals.compute_nonagg_rvecs(
        words, idx2_vec, wx_sublist, idxs_list)
    if ut.VERBOSE:
        print('Computed size(rvecs_list) = %r' % ut.get_object_size_str(rvecs_list))
        print('Computed size(flags_list) = %r' % ut.get_object_size_str(flags_list))
    if aggregate:
        maws_list = [wx2_maws[wx] for wx in wx_sublist]
        # Aggregate Residuals
        tup = smk_residuals.compute_agg_rvecs(rvecs_list, idxs_list, aids_list,
                                              maws_list)
        (aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list,
         aggflags_list) = tup
        # Pack into common query structure
        aggfxs_list = [[idx2_fx.take(idxs) for idxs in aggidxs]
                       for aggidxs in aggidxs_list]
        wx2_aggvecs = dict(zip(wx_sublist, aggvecs_list))
        wx2_aggaids = dict(zip(wx_sublist, aggaids_list))
        wx2_aggfxs = dict(zip(wx_sublist, aggfxs_list))
        wx2_aggmaws = dict(zip(wx_sublist, aggmaws_list))
        wx2_aggflags = dict(zip(wx_sublist, aggflags_list))
        (wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags) = (
            wx2_aggvecs, wx2_aggaids, wx2_aggfxs, wx2_aggmaws, wx2_aggflags)
    else:
        # Hack non-aggregate residuals to have the same structure as aggregate
        # residuals for compatibility: i.e. each rvec gets a list of fxs that
        # contributed to it, and for SMK this is a list of size 1
        fxs_list = [[idx2_fx[idx:idx + 1] for idx in idxs] for idxs in idxs_list]
        wx2_rvecs = dict(zip(wx_sublist, rvecs_list))
        wx2_aids = dict(zip(wx_sublist, aids_list))
        wx2_fxs = dict(zip(wx_sublist, fxs_list))
        wx2_flags = dict(zip(wx_sublist, flags_list))
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2(words, wx2_rvecs, wx2_aids, wx2_fxs)
    if ut.VERBOSE or verbose:
        end_()
        print('[smk_index.rvec] L___ End Compute Residuals')
    return wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags
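# A sketch of the residual computation and aggregation that
# smk_residuals.compute_nonagg_rvecs / compute_agg_rvecs are assumed to
# perform; the sign convention (word minus descriptor vs. descriptor minus
# word) and dtype handling may differ in the real code. Helper names are
# hypothetical.
def _residual_sketch(word, vecs):
    import numpy as np
    # residual of each descriptor from its assigned word center, L2-normalized;
    # all-zero residuals are left as-is (they get flagged downstream)
    rvecs = word[None, :].astype(np.float64) - vecs.astype(np.float64)
    norms = np.linalg.norm(rvecs, axis=1, keepdims=True)
    return np.divide(rvecs, norms, out=rvecs, where=norms > 0)

def _aggregate_sketch(rvecs):
    import numpy as np
    # aggregation collapses all residuals of one annotation assigned to the
    # same word into a single re-normalized vector
    agg = rvecs.sum(axis=0)
    norm = np.linalg.norm(agg)
    return agg / norm if norm > 0 else agg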
def compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight,
                        alpha=3, thresh=0):
    """
    Internals step4
    Computes gamma normalization scalar for the database annotations

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_rvecs, wx2_aids = smk_debug.testdata_raw_internals2()
    >>> alpha = ibs.cfg.query_cfg.smk_cfg.alpha
    >>> thresh = ibs.cfg.query_cfg.smk_cfg.thresh
    >>> idx2_daid = invindex.idx2_daid
    >>> wx2_weight = wx2_idf
    >>> daids = invindex.daids
    >>> daid2_gamma = compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight, alpha, thresh)
    """
    # Grouping by aid and words
    wx_sublist = pdh.ensure_values(pdh.ensure_index(wx2_rvecs))
    if utool.VERBOSE:
        print('[smk_index] Compute Gamma alpha=%r, thresh=%r: ' % (alpha, thresh))
        mark1, end1_ = utool.log_progress('[smk_index] Gamma Group: ',
                                          len(wx_sublist), flushfreq=100,
                                          writefreq=50)
    rvecs_list1 = pdh.ensure_values_subset(wx2_rvecs, wx_sublist)
    aids_list = pdh.ensure_values_subset(wx2_aids, wx_sublist)
    daid2_wx2_drvecs = utool.ddict(lambda: utool.ddict(list))
    # Group by daids first and then by word index
    for wx, aids, rvecs in zip(wx_sublist, aids_list, rvecs_list1):
        group_aids, groupxs = smk_speed.group_indicies(aids)
        rvecs_group = smk_speed.apply_grouping(rvecs, groupxs)  # 2.9 ms
        for aid, rvecs_ in zip(group_aids, rvecs_group):
            daid2_wx2_drvecs[aid][wx] = rvecs_
    if utool.VERBOSE:
        end1_()
    # For every daid, compute its gamma using pregrouped rvecs
    # Summation over words for each aid
    if utool.VERBOSE:
        mark2, end2_ = utool.log_progress('[smk_index] Gamma Sum: ',
                                          len(daid2_wx2_drvecs),
                                          flushfreq=100, writefreq=25)
    aid_list = list(daid2_wx2_drvecs.keys())
    wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    aidwxs_list = [list(wx2_aidrvecs.keys()) for wx2_aidrvecs in wx2_aidrvecs_list]
    aidrvecs_list = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in wx2_aidrvecs_list]
    aidweight_list = [[wx2_weight[wx] for wx in aidwxs] for aidwxs in aidwxs_list]
    #gamma_list = []
    #for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list):
    #    assert len(weight_list) == len(rvecs_list), 'one list for each word'
    #    gamma = smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)  # 66.8 %
    #    #weight_list = np.ones(weight_list.size)
    #    gamma_list.append(gamma)
    gamma_list = [smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)
                  for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list)]
    daid2_gamma = pdh.IntSeries(gamma_list, index=aid_list, name='gamma')
    if utool.VERBOSE:
        end2_()
    return daid2_gamma
def compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_idf,
                        alpha=3, thresh=0):
    """
    Computes gamma normalization scalar for the database annotations
    Internals step4

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_rvecs, wx2_aids = smk_debug.testdata_raw_internals2()
    >>> alpha = ibs.cfg.query_cfg.smk_cfg.alpha
    >>> thresh = ibs.cfg.query_cfg.smk_cfg.thresh
    >>> idx2_daid = invindex.idx2_daid
    >>> daids = invindex.daids
    >>> daid2_gamma = compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_idf, alpha, thresh)
    """
    if utool.DEBUG2:
        from ibeis.model.hots.smk import smk_debug
        smk_debug.rrr()
        smk_debug.check_wx2(wx2_rvecs=wx2_rvecs, wx2_aids=wx2_aids)
    wx_sublist = pdh.ensure_values(pdh.ensure_index(wx2_rvecs))
    if utool.VERBOSE:
        print('[smk_index] Compute Gamma alpha=%r, thresh=%r: ' % (alpha, thresh))
        mark1, end1_ = utool.log_progress('[smk_index] Gamma group (by word): ',
                                          len(wx_sublist), flushfreq=100,
                                          writefreq=50, with_totaltime=True)
    # Get list of aids and rvecs w.r.t. words
    aids_list = pdh.ensure_values_subset(wx2_aids, wx_sublist)
    rvecs_list1 = pdh.ensure_values_subset(wx2_rvecs, wx_sublist)
    # Group by daids first and then by word index
    daid2_wx2_drvecs = utool.ddict(lambda: utool.ddict(list))
    for wx, aids, rvecs in zip(wx_sublist, aids_list, rvecs_list1):
        group_aids, groupxs = clustertool.group_indicies(aids)
        rvecs_group = clustertool.apply_grouping(rvecs, groupxs)  # 2.9 ms
        for aid, rvecs_ in zip(group_aids, rvecs_group):
            daid2_wx2_drvecs[aid][wx] = rvecs_
    if utool.VERBOSE:
        end1_()
    # For every daid, compute its gamma using pregrouped rvecs
    # Summation over words for each aid
    if utool.VERBOSE:
        mark2, end2_ = utool.log_progress('[smk_index] Gamma Sum (over daid): ',
                                          len(daid2_wx2_drvecs), flushfreq=100,
                                          writefreq=25, with_totaltime=True)
    # Get lists w.r.t daids
    aid_list = list(daid2_wx2_drvecs.keys())
    # list of mappings from words to rvecs foreach daid
    # [wx2_aidrvecs_1, ..., wx2_aidrvecs_nDaids,]
    _wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    _aidwxs_iter = (list(wx2_aidrvecs.keys()) for wx2_aidrvecs in _wx2_aidrvecs_list)
    aidrvecs_list = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in _wx2_aidrvecs_list]
    aididf_list = [[wx2_idf[wx] for wx in aidwxs] for aidwxs in _aidwxs_iter]
    #gamma_list = []
    if utool.DEBUG2:
        try:
            for count, (idf_list, rvecs_list) in enumerate(zip(aididf_list,
                                                               aidrvecs_list)):
                assert len(idf_list) == len(rvecs_list), 'one list for each word'
                #gamma = smk_core.gamma_summation2(rvecs_list, idf_list, alpha, thresh)
        except Exception as ex:
            utool.printex(ex)
            utool.embed()
            raise
    gamma_list = [smk_core.gamma_summation2(rvecs_list, idf_list, alpha, thresh)
                  for idf_list, rvecs_list in zip(aididf_list, aidrvecs_list)]
    if WITH_PANDAS:
        daid2_gamma = pdh.IntSeries(gamma_list, index=aid_list, name='gamma')
    else:
        daid2_gamma = dict(zip(aid_list, gamma_list))
    if utool.VERBOSE:
        end2_()
    return daid2_gamma
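# A sketch of what smk_core.gamma_summation2 / smk_scoring.sccw_summation
# are assumed to compute: the normalizer gamma(X) = 1 / sqrt(sum_w idf_w *
# sum_ij sel(r_i . r_j)) that makes the self-score K(X, X) = 1. The helper
# names are hypothetical and the real functions also handle maws and flags.
def _selectivity_sketch(cossim, alpha=3.0, thresh=0.0):
    import numpy as np
    # sign-preserving power law, zeroed below the threshold
    scores = np.sign(cossim) * np.abs(cossim) ** alpha
    scores[cossim <= thresh] = 0.0
    return scores

def _gamma_sketch(rvecs_list, weight_list, alpha=3.0, thresh=0.0):
    import numpy as np
    total = 0.0
    for weight, rvecs in zip(weight_list, rvecs_list):
        # self-similarities of this annotation's residuals for one word
        cossim = rvecs.dot(rvecs.T)
        total += weight * _selectivity_sketch(cossim, alpha, thresh).sum()
    return 1.0 / np.sqrt(total) if total > 0 else 1.0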