Example #1
def OLD_compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_aids, wx2_idf, wx2_dmaws,
                           smk_alpha, smk_thresh, verbose=False):
    """
    """
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.rrr()
        smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)

    with ut.Timer('timer_orig1'):
        wx_sublist = np.array(list(wx2_drvecs.keys()))
        if not ut.QUIET:
            print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
        if ut.VERBOSE or verbose:
            print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' % (smk_alpha, smk_thresh))
            mark1, end1_ = ut.log_progress(
                '[smk_index.sccw] SCCW group (by present words): ', len(wx_sublist),
                freq=100, with_time=WITH_TOTALTIME)
        # Get list of aids and rvecs w.r.t. words
        aids_list   = [wx2_aids[wx] for wx in wx_sublist]
        rvecs_list1 = [wx2_drvecs[wx] for wx in wx_sublist]
        maws_list   = [wx2_dmaws[wx] for wx in wx_sublist]
        if ut.DEBUG2:
            from ibeis.algo.hots.smk import smk_debug
            smk_debug.assert_single_assigned_maws(maws_list)
        # Group by daids first and then by word index
        daid2_wx2_drvecs = clustertool.double_group(wx_sublist, aids_list, rvecs_list1)

        if ut.VERBOSE or verbose:
            end1_()

        # For every daid, compute its sccw using pregrouped rvecs
        # Summation over words for each aid
        if ut.VERBOSE or verbose:
            mark2, end2_ = ut.log_progress(
                '[smk_index.sccw] SCCW Sum (over daid): ', len(daid2_wx2_drvecs),
                freq=25, with_time=WITH_TOTALTIME)
        # Get lists w.r.t daids
        aid_list = list(daid2_wx2_drvecs.keys())
        # list of mappings from words to rvecs for each daid
        # [wx2_aidrvecs_1, ..., wx2_aidrvecs_nDaids,]
        _wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
        _aidwxs_iter   = (list(wx2_aidrvecs.keys()) for wx2_aidrvecs in _wx2_aidrvecs_list)
        aidrvecs_list  = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in _wx2_aidrvecs_list]
        aididf_list = [[wx2_idf[wx] for wx in aidwxs] for aidwxs in _aidwxs_iter]

    with ut.Timer('timer_orig2'):
        if ut.DEBUG2:
            from ibeis.algo.hots.smk import smk_debug
            smk_debug.check_data_smksumm(aididf_list, aidrvecs_list)
        # TODO: implement database side soft-assign
        sccw_list = [smk_scoring.sccw_summation(rvecs_list, None, idf_list, None, smk_alpha, smk_thresh)
                     for idf_list, rvecs_list in zip(aididf_list, aidrvecs_list)]

        daid2_sccw = dict(zip(aid_list, sccw_list))
    if ut.VERBOSE or verbose:
        end2_()
        print('[smk_index.sccw] L___ End Compute Data SCCW\n')
    return daid2_sccw
Example #2
File: smk1.py Project: whaozl/ibeis
def compute_data_gamma_(invindex, use_cache=True):
    """
    >>> from ibeis.model.hots.smk.smk import *  # NOQA
    >>> ibs, annots_df, taids, daids, qaids, nWords = testdata()
    >>> words = learn_visual_words(annots_df, taids, nWords)
    >>> with_internals = True
    >>> invindex = index_data_annots(annots_df, daids, words, with_internals)
    >>> daid2_gamma = compute_data_gamma_(invindex, use_cache=True)
    """
    cache_key = utool.hashstr(invindex.get_cfgstr())
    if use_cache:
        try:
            daid2_gamma = utool.global_cache_read(cache_key, appname='smk')
            #print('gamma_dbg cache hit')
            return daid2_gamma
        except Exception:
            pass

    # Grouping by aid and words

    mark, end_ = utool.log_progress(('gamma grouping %s ' % (cache_key, )),
                                    invindex.wx2_drvecs.shape[0],
                                    flushfreq=100)
    daid2_wx2_drvecs = utool.ddict(dict)
    for count, wx in enumerate(invindex.wx2_drvecs.index):
        if count % 100 == 0:
            mark(wx)
        group = invindex.wx2_drvecs[wx].groupby(invindex.idx2_daid)
        for daid, vecs in group:
            daid2_wx2_drvecs[daid][wx] = vecs.values
    end_()

    # Summation over words for each aid
    mark, end_ = utool.log_progress('gamma summation ',
                                    len(daid2_wx2_drvecs),
                                    flushfreq=100)
    daid2_gamma = pd.Series(np.zeros(invindex.daids.shape[0]),
                            index=invindex.daids,
                            name='gamma')
    wx2_weight = invindex.wx2_weight
    for count, (daid,
                wx2_drvecs) in enumerate(six.iteritems(daid2_wx2_drvecs)):
        if count % 100 == 0:
            mark(count)
        wx2_rvecs = wx2_drvecs
        daid2_gamma[daid] = gamma_summation(wx2_rvecs, wx2_weight)
    utool.global_cache_write(cache_key, daid2_gamma, appname='smk')
    return daid2_gamma
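
The inner loop above regroups the inverted index: residual vectors stored per word (wx2_drvecs) are rebucketed per database annotation to build daid2_wx2_drvecs. Below is a plain-dictionary sketch of that regrouping on toy data; the names mirror the function above, but the code is only illustrative.

from collections import defaultdict

def regroup_by_annot(wx2_drvecs, idx2_daid):
    # wx2_drvecs: {wx: {flat feature index: vector}}; idx2_daid: flat index -> annot id
    daid2_wx2_drvecs = defaultdict(dict)
    for wx, idx2_vec in wx2_drvecs.items():
        by_daid = defaultdict(list)
        for idx, vec in idx2_vec.items():
            by_daid[idx2_daid[idx]].append(vec)
        for daid, vecs in by_daid.items():
            daid2_wx2_drvecs[daid][wx] = vecs
    return daid2_wx2_drvecs

wx2_drvecs = {7: {0: 'r0', 1: 'r1', 2: 'r2'}, 9: {2: 'r3'}}
idx2_daid = {0: 101, 1: 102, 2: 101}
print(dict(regroup_by_annot(wx2_drvecs, idx2_daid)))
# {101: {7: ['r0', 'r2'], 9: ['r3']}, 102: {7: ['r1']}}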
Example #3
def compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids, daid2_label=None,
                      vocab_weighting='idf', verbose=False):
    """
    Computes the inverse-document-frequency weighting for each word

    Args:
        wx_series ():
        wx2_idxs ():
        idx2_aid ():
        daids ():
        daid2_label ():
        vocab_weighting ():

    Returns:
        wx2_idf

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> wx2_idf = compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids)
        >>> result = str(len(wx2_idf))
        >>> print(result)
        8000

    Ignore:
        #>>> wx2_idxs = invindex.wx2_idxs


    Auto:
        from ibeis.algo.hots.smk import smk_index
        import utool as ut; print(ut.make_default_docstr(smk_index.compute_word_idf_))

    """
    if not ut.QUIET:
        print('[smk_index.idf] +--- Start Compute IDF')
    if ut.VERBOSE or verbose:
        mark, end_ = ut.log_progress('[smk_index.idf] Word IDFs: ',
                                        len(wx_series), freq=50,
                                        with_time=WITH_TOTALTIME)

    idxs_list, aids_list = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)

    # TODO: Integrate different idf measures
    if vocab_weighting == 'idf':
        idf_list = compute_idf_orig(aids_list, daids)
    elif vocab_weighting == 'negentropy':
        assert daid2_label is not None
        idf_list = compute_idf_label1(aids_list, daid2_label)
    else:
        raise AssertionError('unknown option vocab_weighting=%r' % vocab_weighting)
    if ut.VERBOSE or verbose:
        end_()
        print('[smk_index.idf] L___ End Compute IDF')
    wx2_idf = dict(zip(wx_series, idf_list))
    return wx2_idf
Example #4
def compute_data_gamma_(invindex, use_cache=True):
    """
    >>> from ibeis.model.hots.smk.smk import *  # NOQA
    >>> ibs, annots_df, taids, daids, qaids, nWords = testdata()
    >>> words = learn_visual_words(annots_df, taids, nWords)
    >>> with_internals = True
    >>> invindex = index_data_annots(annots_df, daids, words, with_internals)
    >>> daid2_gamma = compute_data_gamma_(invindex, use_cache=True)
    """
    cache_key = utool.hashstr(invindex.get_cfgstr())
    if use_cache:
        try:
            daid2_gamma = utool.global_cache_read(cache_key, appname='smk')
            #print('gamma_dbg cache hit')
            return daid2_gamma
        except Exception:
            pass

    # Grouping by aid and words

    mark, end_ = utool.log_progress(('gamma grouping %s ' % (cache_key,)),
                                    invindex.wx2_drvecs.shape[0],
                                    flushfreq=100)
    daid2_wx2_drvecs = utool.ddict(dict)
    for count, wx in enumerate(invindex.wx2_drvecs.index):
        if count % 100 == 0:
            mark(wx)
        group  = invindex.wx2_drvecs[wx].groupby(invindex.idx2_daid)
        for daid, vecs in group:
            daid2_wx2_drvecs[daid][wx] = vecs.values
    end_()

    # Summation over words for each aid
    mark, end_ = utool.log_progress('gamma summation ', len(daid2_wx2_drvecs),
                                    flushfreq=100)
    daid2_gamma = pd.Series(
        np.zeros(invindex.daids.shape[0]),
        index=invindex.daids,
        name='gamma')
    wx2_weight = invindex.wx2_weight
    for count, (daid, wx2_drvecs) in enumerate(six.iteritems(daid2_wx2_drvecs)):
        if count % 100 == 0:
            mark(count)
        wx2_rvecs = wx2_drvecs
        daid2_gamma[daid] = gamma_summation(wx2_rvecs, wx2_weight)
    utool.global_cache_write(cache_key, daid2_gamma, appname='smk')
    return daid2_gamma
Example #5
def compute_word_idf_(wx_series,
                      wx2_idxs,
                      idx2_aid,
                      daids,
                      with_pandas=WITH_PANDAS):
    """
    Returns the inverse-document-frequency weighting for each word

    internals step 2

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs = smk_debug.testdata_raw_internals1()
    >>> wx_series = invindex.words.index
    >>> idx2_aid = invindex.idx2_daid
    >>> wx2_idf = compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids)
    >>> print(wx2_idf.shape)
    (8000,)

    #>>> wx2_idxs = invindex.wx2_idxs
    """
    if utool.VERBOSE:
        mark, end_ = utool.log_progress('[smk_index] Word IDFs: ',
                                        len(wx_series),
                                        flushfreq=500,
                                        writefreq=50)
        mark(0)
    wx_series_values = pdh.ensure_values(wx_series)
    idx2_aid_values = pdh.ensure_values(idx2_aid)
    wx2_idxs_values = pdh.ensure_values_subset(wx2_idxs, wx_series_values)
    #with utool.Timer('method 1'):  # 0.16s
    idxs_list = [
        pdh.ensure_values(idxs).astype(INDEX_TYPE) for idxs in wx2_idxs_values
    ]  # 11%
    aids_list = [
        idx2_aid_values.take(idxs) if len(idxs) > 0 else []
        for idxs in idxs_list
    ]
    nTotalDocs = len(daids)
    nDocsWithWord_list = [len(set(aids)) for aids in aids_list]  # 68%
    # compute idf half of tf-idf weighting
    idf_list = [
        np.log(nTotalDocs /
               nDocsWithWord).astype(FLOAT_TYPE) if nDocsWithWord > 0 else 0.0
        for nDocsWithWord in nDocsWithWord_list
    ]  # 17.8 ms   # 13%
    if utool.VERBOSE:
        end_()
    if with_pandas:
        wx2_idf = pdh.IntSeries(idf_list, index=wx_series, name='idf')
    else:
        wx2_idf = dict(zip(wx_series_values, idf_list))
    return wx2_idf
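
For reference, the IDF half of the weighting computed above reduces to log(nTotalDocs / nDocsWithWord) per word, with unused words given weight 0. A minimal standalone sketch of just that step follows (plain NumPy on toy data; only the aids_list/daids names are taken from the function above).

import numpy as np

def idf_weights(aids_list, daids):
    # One weight per visual word: log(total docs / docs containing the word)
    n_total = len(daids)
    n_docs_with_word = [len(set(aids)) for aids in aids_list]
    return np.array([np.log(n_total / n) if n > 0 else 0.0
                     for n in n_docs_with_word])

daids = [1, 2, 3, 4]
aids_list = [
    [1, 1, 2, 3, 4],  # word 0 occurs in every annotation  -> idf = log(1) = 0
    [2, 3],           # word 1 occurs in 2 of 4 annotations -> idf = log(2)
    [],               # word 2 is unused                    -> idf = 0 by convention
]
print(idf_weights(aids_list, daids))  # [0.         0.69314718 0.        ]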
Example #6
def match_kernel_L1(qindex, invindex, qparams):
    """ Builds up information and does verbosity before going to L0 """
    # Unpack Query
    (wx2_qrvecs, wx2_qflags, wx2_qmaws, wx2_qaids, wx2_qfxs, query_sccw) = qindex
    # Unpack Database
    wx2_drvecs     = invindex.wx2_drvecs
    wx2_idf        = invindex.wx2_idf
    wx2_daid       = invindex.wx2_aids
    wx2_dflags     = invindex.wx2_dflags
    daid2_sccw     = invindex.daid2_sccw

    smk_alpha  = qparams.smk_alpha
    smk_thresh = qparams.smk_thresh

    # for each word compute the pairwise scores between matches
    common_wxs = set(wx2_qrvecs.keys()).intersection(set(wx2_drvecs.keys()))
    # Build lists over common word indexes
    qrvecs_list = [ wx2_qrvecs[wx] for wx in common_wxs]
    drvecs_list = [ wx2_drvecs[wx] for wx in common_wxs]
    daids_list  = [   wx2_daid[wx] for wx in common_wxs]
    idf_list    = [    wx2_idf[wx] for wx in common_wxs]
    qmaws_list  = [  wx2_qmaws[wx] for wx in common_wxs]  # NOQA
    dflags_list = [ wx2_dflags[wx] for wx in common_wxs]  # NOQA
    qflags_list = [ wx2_qflags[wx] for wx in common_wxs]
    dmaws_list  = None
    if utool.VERBOSE:
        mark, end_ = utool.log_progress('[smk_core] query word: ', len(common_wxs),
                                        flushfreq=100, writefreq=25,
                                        with_totaltime=True)
    #--------
    retL0 = match_kernel_L0(qrvecs_list, drvecs_list, qflags_list, dflags_list,
                            qmaws_list, dmaws_list, smk_alpha, smk_thresh,
                            idf_list, daids_list, daid2_sccw, query_sccw)
    (daid2_totalscore, scores_list, daid_agg_keys) = retL0
    #print('[smk_core] Matched %d daids' % daid2_totalscore.keys())
    #utool.embed()

    retL1 = (daid2_totalscore, common_wxs, scores_list, daids_list)
    #--------
    if utool.VERBOSE:
        end_()
        print('[smk_core] Matched %d daids. nAssign=%r' %
              (len(daid2_totalscore.keys()), qparams.nAssign))
    return retL1
Example #7
def compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids):
    """
    Returns the inverse-document-frequency weighting for each word

    internals step 2

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs = smk_debug.testdata_raw_internals1()
    >>> wx_series = invindex.words.index
    >>> idx2_aid = invindex.idx2_daid
    >>> wx2_idf = compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids)
    >>> print(wx2_idf.shape)
    (8000,)

    #>>> wx2_idxs = invindex.wx2_idxs
    """
    if utool.VERBOSE:
        mark, end_ = utool.log_progress('[smk_index] Word IDFs: ',
                                        len(wx_series), flushfreq=500,
                                        writefreq=50, with_totaltime=True)
        mark(0)
    wx_series_values = pdh.ensure_values(wx_series)
    idx2_aid_values = pdh.ensure_values(idx2_aid)
    wx2_idxs_values = pdh.ensure_values_subset(wx2_idxs, wx_series_values)
    #with utool.Timer('method 1'):  # 0.16s
    idxs_list = [pdh.ensure_values(idxs).astype(INDEX_TYPE) for idxs in wx2_idxs_values]  # 11%
    aids_list = [idx2_aid_values.take(idxs) if len(idxs) > 0 else [] for idxs in idxs_list]
    nTotalDocs = len(daids)
    nDocsWithWord_list = [len(set(aids)) for aids in aids_list]  # 68%
    # compute idf half of tf-idf weighting
    idf_list = [np.log(nTotalDocs / nDocsWithWord).astype(FLOAT_TYPE)
                if nDocsWithWord > 0 else 0.0
                for nDocsWithWord in nDocsWithWord_list]  # 17.8 ms   # 13%
    if utool.VERBOSE:
        end_()
    if WITH_PANDAS:
        wx2_idf = pdh.IntSeries(idf_list, index=wx_series, name='idf')
    else:
        wx2_idf = dict(zip(wx_series_values, idf_list))
    return wx2_idf
Example #8
File: smk1.py Project: whaozl/ibeis
def match_kernel(wx2_qrvecs, wx2_qfxs, invindex, qaid):
    """
    >>> from ibeis.model.hots.smk.smk import *  # NOQA
    >>> ibs, annots_df, taids, daids, qaids, nWords = testdata()
    >>> words = learn_visual_words(annots_df, taids, nWords)
    >>> invindex = index_data_annots(annots_df, daids, words)
    >>> qaid = qaids[0]
    >>> wx2_qfxs, wx2_qrvecs = compute_query_repr(annots_df, qaid, invindex)
    >>> daid2_totalscore = match_kernel(wx2_qrvecs, wx2_qfxs, invindex, qaid)
    """
    _daids = invindex.daids
    idx2_daid = invindex.idx2_daid
    wx2_drvecs = invindex.wx2_drvecs
    wx2_weight = invindex.wx2_weight
    daid2_gamma = invindex.daid2_gamma

    wx2_rvecs = wx2_qrvecs
    query_gamma = gamma_summation(wx2_rvecs, wx2_weight)

    # Accumulate scores over the entire database
    daid2_aggscore = pd.Series(np.zeros(len(_daids)),
                               index=_daids,
                               name='total_score')
    common_wxs = set(wx2_qrvecs.keys()).intersection(set(wx2_drvecs.keys()))

    daid2_wx2_scoremat = utool.ddict(lambda: utool.ddict(list))

    # for each word compute the pairwise scores between matches
    mark, end = utool.log_progress('query word: ',
                                   len(common_wxs),
                                   flushfreq=100)
    for count, wx in enumerate(common_wxs):
        if count % 100 == 0:
            mark(count)
        # Query and database vectors for wx-th word
        qrvecs = wx2_qrvecs[wx]
        drvecs = wx2_drvecs[wx]
        # Word Weight
        weight = wx2_weight[wx]
        # Compute score matrix
        qfx2_wscore = Match_N(qrvecs, drvecs)
        qfx2_wscore.groupby(idx2_daid)
        # Group scores by database annotation ids
        group = qfx2_wscore.groupby(idx2_daid, axis=1)
        for daid, scoremat in group:
            daid2_wx2_scoremat[daid][wx] = scoremat
        #qfx2_wscore = pd.DataFrame(qfx2_wscore_, index=qfxs, columns=_idxs)
        daid2_wscore = weight * qfx2_wscore.sum(
            axis=0).groupby(idx2_daid).sum()
        daid2_aggscore = daid2_aggscore.add(daid2_wscore, fill_value=0)
    daid2_totalscore = daid2_aggscore * daid2_gamma * query_gamma
    end()

    daid_fm = {}
    daid_fs = {}
    daid_fk = {}
    mark, end = utool.log_progress('accumulating match info: ',
                                   len(daid2_wx2_scoremat),
                                   flushfreq=100)
    for count, item in enumerate(daid2_wx2_scoremat.items()):
        daid, wx2_scoremat = item
        if count % 25 == 0:
            mark(count)
        fm_accum = []
        fs_accum = []
        fk_accum = []
        for wx, scoremat in six.iteritems(wx2_scoremat):
            qfxs = scoremat.index
            dfxs = invindex.idx2_dfx[scoremat.columns]
            fm_ = np.vstack(np.dstack(np.meshgrid(qfxs, dfxs, indexing='ij')))
            fs_ = scoremat.values.flatten()
            lower_thresh = 0.01
            valid = fs_ > lower_thresh
            fm = fm_[valid]
            fs = fs_[valid]
            fk = np.ones(len(fm), dtype=np.int32)
            fm_accum.append(fm)
            fs_accum.append(fs)
            fk_accum.append(fk)
        daid_fm[daid] = np.vstack(fm_accum)
        daid_fs[daid] = np.hstack(fs_accum).T
        daid_fk[daid] = np.hstack(fk_accum).T
    chipmatch = (
        daid_fm,
        daid_fs,
        daid_fk,
    )

    daid2_totalscore.sort(ascending=False)
    return daid2_totalscore, chipmatch
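
Conceptually, the scoring loop above accumulates weight * (summed residual similarity) per database annotation over the common words, then rescales by both gamma normalizers. Here is a stripped-down sketch of that accumulation with plain dictionaries and made-up numbers (no pandas; all names are illustrative).

def toy_match_scores(wordscores, wx2_weight, daid2_gamma, query_gamma):
    # wordscores: {wx: {daid: summed match score for that word}}
    daid2_agg = {}
    for wx, daid2_wordscore in wordscores.items():
        weight = wx2_weight[wx]
        for daid, wordscore in daid2_wordscore.items():
            daid2_agg[daid] = daid2_agg.get(daid, 0.0) + weight * wordscore
    # Final rescaling mirrors: daid2_totalscore = daid2_aggscore * daid2_gamma * query_gamma
    return {daid: score * daid2_gamma[daid] * query_gamma
            for daid, score in daid2_agg.items()}

scores = toy_match_scores(
    wordscores={10: {1: 0.9, 2: 0.1}, 42: {1: 0.3}},
    wx2_weight={10: 2.0, 42: 0.5},
    daid2_gamma={1: 0.8, 2: 1.0},
    query_gamma=0.9,
)
print(scores)  # {1: (0.9*2.0 + 0.3*0.5) * 0.8 * 0.9, 2: (0.1*2.0) * 1.0 * 0.9}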
Example #9
def compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf,
                       wx2_dmaws, smk_alpha, smk_thresh, verbose=False):
    """
    Computes sccw normalization scalar for the database annotations.
    This is gamma from the SMK paper.
    sccw is a self consistency criterion weight --- a scalar which ensures
    the score of K(X, X) = 1

    Args:
        idx2_daid ():
        wx2_drvecs ():
        wx2_dflags ():
        wx2_aids ():
        wx2_idf ():
        wx2_dmaws ():
        smk_alpha ():
        smk_thresh ():

    Returns:
        daid2_sccw

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_index
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> #tup = smk_debug.testdata_compute_data_sccw(db='testdb1')
        >>> tup = smk_debug.testdata_compute_data_sccw(db='PZ_MTEST')
        >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_drvecs, wx2_aids, qparams = tup
        >>> wx2_dflags = invindex.wx2_dflags
        >>> wx2_idxs   = invindex.wx2_idxs
        >>> wx2_dmaws  = invindex.wx2_dmaws
        >>> idx2_daid  = invindex.idx2_daid
        >>> daids      = invindex.daids
        >>> smk_alpha  = qparams.smk_alpha
        >>> smk_thresh = qparams.smk_thresh
        >>> wx2_idf    = wx2_idf
        >>> verbose = True
        >>> invindex.invindex_dbgstr()
        >>> invindex.report_memory()
        >>> invindex.report_memsize()
        >>> daid2_sccw = smk_index.compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf, wx2_dmaws, smk_alpha, smk_thresh, verbose)
    """

    #for wx in wx_sublist:
    #    print(len(wx2_dmaws

    verbose_ = ut.VERBOSE or verbose

    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)
    if not ut.QUIET:
        print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
    if verbose_:
        print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' % (smk_alpha, smk_thresh))
        mark1, end1_ = ut.log_progress(
            '[smk_index.sccw] SCCW group (by present words): ', len(wx2_drvecs),
            freq=100, with_time=WITH_TOTALTIME)

    # Group by daids first and then by word index
    # Get list of aids and rvecs w.r.t. words (ie one item per word)
    wx_sublist = np.array(list(wx2_drvecs.keys()))
    aids_perword  = [wx2_aids[wx] for wx in wx_sublist]

    # wx_list1: Lays out word indexes for each annotation
    # tx_list1: Temporary within annotation subindex + wx uniquely identifies
    # item in wx2_drvecs, wx2_dflags, and wx2_dmaws

    # Flatten out indexes to perform grouping
    flat_aids = np.hstack(aids_perword)
    count = len(flat_aids)
    txs_perword = [np.arange(aids.size) for aids in aids_perword]
    flat_txs  = np.hstack(txs_perword)
    # fromiter is faster for flat_wxs because is not a list of numpy arrays
    wxs_perword = ([wx] * len(aids) for wx, aids in zip(wx_sublist, aids_perword))
    flat_wxs  = np.fromiter(ut.iflatten(wxs_perword), hstypes.INDEX_TYPE, count)

    # Group flat indexes by annotation id
    unique_aids, annot_groupxs = clustertool.group_indices(flat_aids)

    # Wxs and Txs grouped by annotation id
    wxs_perannot = clustertool.apply_grouping_iter(flat_wxs, annot_groupxs)
    txs_perannot = clustertool.apply_grouping_iter(flat_txs, annot_groupxs)

    # Group by word inside each annotation group
    wxsubgrouping_perannot = [clustertool.group_indices(wxs)
                              for wxs in wxs_perannot]
    word_groupxs_perannot = (groupxs for wxs, groupxs in wxsubgrouping_perannot)
    txs_perword_perannot = [clustertool.apply_grouping(txs, groupxs)
                            for txs, groupxs in
                            zip(txs_perannot, word_groupxs_perannot)]
    wxs_perword_perannot = [wxs for wxs, groupxs in wxsubgrouping_perannot]

    # Group relevant data for sccw measure by word for each annotation grouping

    def _vector_subgroup_by_wx(wx2_arr, wxs_perword_perannot, txs_perword_perannot):
        return [[wx2_arr[wx].take(txs, axis=0)
                 for wx, txs in zip(wx_perword_, txs_perword_)]
                for wx_perword_, txs_perword_ in
                zip(wxs_perword_perannot, txs_perword_perannot)]

    def _scalar_subgroup_by_wx(wx2_scalar, wxs_perword_perannot):
        return [[wx2_scalar[wx] for wx in wxs] for wxs in wxs_perword_perannot]

    subgrouped_drvecs = _vector_subgroup_by_wx(wx2_drvecs, wxs_perword_perannot, txs_perword_perannot)
    subgrouped_dmaws  = _vector_subgroup_by_wx(wx2_dmaws,  wxs_perword_perannot, txs_perword_perannot)
    # If we aren't using dmaws replace it with an infinite None iterator
    #subgrouped_dmaws  = iter(lambda: None, 1)
    subgrouped_dflags = _vector_subgroup_by_wx(wx2_dflags, wxs_perword_perannot, txs_perword_perannot)
    #subgrouped_dflags  = iter(lambda: None, 1)
    subgrouped_idfs   = _scalar_subgroup_by_wx(wx2_idf, wxs_perword_perannot)

    if verbose_:
        end1_()
        mark2, end2_ = ut.log_progress(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
                                        total=len(unique_aids), freq=100, with_time=WITH_TOTALTIME)
        progiter = ut.ProgressIter(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
                                   total=len(unique_aids), freq=10, with_time=WITH_TOTALTIME)
    else:
        progiter = ut.identity

    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_data_smksumm(subgrouped_idfs, subgrouped_drvecs)

    sccw_list = [
        smk_scoring.sccw_summation(rvecs_list, flags_list, idf_list, maws_list, smk_alpha, smk_thresh)
        for rvecs_list, flags_list, maws_list, idf_list in
        progiter(zip(subgrouped_drvecs, subgrouped_dflags, subgrouped_dmaws, subgrouped_idfs))
    ]
    daid2_sccw = dict(zip(unique_aids, sccw_list))

    if verbose_:
        end2_()
        print('[smk_index.sccw] L___ End Compute Data SCCW\n')

    return daid2_sccw
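
The docstring above defines sccw as the scalar that makes the self-score K(X, X) equal 1, so it is simply 1 / sqrt(raw self-similarity). The following back-of-the-envelope sketch shows that relationship, using a hypothetical self_similarity() stand-in for smk_scoring.sccw_summation (the real selectivity function and signature live in smk_scoring).

import numpy as np

def self_similarity(rvecs_list, idf_list, alpha=3.0, thresh=0.0):
    # Hypothetical stand-in: idf-weighted sum of pairwise rvec similarities per word
    total = 0.0
    for rvecs, idf in zip(rvecs_list, idf_list):
        simmat = rvecs.dot(rvecs.T)                          # rvecs assumed unit-norm
        simmat = np.sign(simmat) * np.abs(simmat) ** alpha   # selectivity-style power scaling
        simmat[simmat <= thresh] = 0.0
        total += idf * simmat.sum()
    return total

def sccw(rvecs_list, idf_list, alpha=3.0, thresh=0.0):
    # Scalar s such that s**2 * self_similarity(...) == 1, i.e. K(X, X) == 1
    return 1.0 / np.sqrt(self_similarity(rvecs_list, idf_list, alpha, thresh))

rvecs_list = [np.eye(2, 8), np.ones((1, 8)) / np.sqrt(8)]   # unit-norm toy residuals for 2 words
idf_list = [1.5, 0.7]
s = sccw(rvecs_list, idf_list)
print(s ** 2 * self_similarity(rvecs_list, idf_list))       # -> 1.0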
Example #10
def compute_residuals_(words, wx2_idxs, wx2_maws, idx2_vec, idx2_aid,
                       idx2_fx, aggregate, verbose=False):
    """
    Computes residual vectors based on word assignments
    returns mapping from word index to a set of residual vectors

    Args:
        words (ndarray):
        wx2_idxs (dict):
        wx2_maws (dict):
        idx2_vec (dict):
        idx2_aid (dict):
        idx2_fx (dict):
        aggregate (bool):
        verbose (bool):

    Returns:
        tuple : (wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws) formatted as::
            * wx2_rvecs - [ ... [ rvec_i1, ...,  rvec_Mi ]_i ... ]
            * wx2_aids  - [ ... [  aid_i1, ...,   aid_Mi ]_i ... ]
            * wx2_fxs   - [ ... [[fxs]_i1, ..., [fxs]_Mi ]_i ... ]

        For every word::

            * list of aggvecs
            * For every aggvec:
                * one parent aid, if aggregate is False: assert isunique(aids)
                * list of parent fxs, if aggregate is True: assert len(fxs) == 1

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> words     = invindex.words
        >>> idx2_aid  = invindex.idx2_daid
        >>> idx2_fx   = invindex.idx2_dfx
        >>> idx2_vec  = invindex.idx2_dvec
        >>> aggregate = ibs.cfg.query_cfg.smk_cfg.aggregate
        >>> wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags = compute_residuals_(words, wx2_idxs, wx2_maws, idx2_vec, idx2_aid, idx2_fx, aggregate)
    """
    if not ut.QUIET:
        print('[smk_index.rvec] +--- Start Compute Residuals')

    wx_sublist = np.array(list(wx2_idxs.keys()))
    # Build lists w.r.t. words

    idxs_list = [wx2_idxs[wx].astype(hstypes.INDEX_TYPE) for wx in wx_sublist]
    aids_list = [idx2_aid.take(idxs) for idxs in idxs_list]
    if ut.DEBUG2:
        #assert np.all(np.diff(wx_sublist) == 1), 'not dense'
        assert all([len(a) == len(b) for a, b in zip(idxs_list, aids_list)]), 'bad alignment'
        assert idx2_vec.shape[0] == idx2_fx.shape[0]
        assert idx2_vec.shape[0] == idx2_aid.shape[0]
    # Prealloc output
    if ut.VERBOSE or verbose:
        #print('[smk_index.rvec] Residual Vectors for %d words. aggregate=%r' %
        #      (len(wx2_idxs), aggregate,))
        lbl = '[smk_index.rvec] agg rvecs' if aggregate else '[smk_index.rvec] nonagg rvecs'
        mark, end_ = ut.log_progress(lbl, len(wx2_idxs), freq=50, with_time=True)
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2_idxs(wx2_idxs, len(words))
    # Compute Residuals
    rvecs_list, flags_list = smk_residuals.compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list)

    if ut.VERBOSE:
        print('Computed size(rvecs_list) = %r' % ut.get_object_size_str(rvecs_list))
        print('Computed size(flags_list) = %r' % ut.get_object_size_str(flags_list))
    if aggregate:
        maws_list = [wx2_maws[wx] for wx in wx_sublist]
        # Aggregate Residuals
        tup = smk_residuals.compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list)
        (aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list) = tup
        # Pack into common query structure
        aggfxs_list = [[idx2_fx.take(idxs) for idxs in aggidxs] for aggidxs in aggidxs_list]
        wx2_aggvecs  = dict(zip(wx_sublist, aggvecs_list))
        wx2_aggaids  = dict(zip(wx_sublist, aggaids_list))
        wx2_aggfxs   = dict(zip(wx_sublist, aggfxs_list))
        wx2_aggmaws  = dict(zip(wx_sublist, aggmaws_list))
        wx2_aggflags = dict(zip(wx_sublist, aggflags_list))
        (wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags) = (
            wx2_aggvecs, wx2_aggaids, wx2_aggfxs, wx2_aggmaws, wx2_aggflags)
    else:
        # Hack non-aggregate residuals to have the same structure as aggregate
        # residuals for compatibility: i.e. each rvec gets a list of fxs that
        # contributed to it, and for SMK this is a list of size 1
        fxs_list  = [[idx2_fx[idx:idx + 1] for idx in idxs]  for idxs in idxs_list]
        wx2_rvecs = dict(zip(wx_sublist, rvecs_list))
        wx2_aids  = dict(zip(wx_sublist, aids_list))
        wx2_fxs   = dict(zip(wx_sublist, fxs_list))
        wx2_flags = dict(zip(wx_sublist, flags_list))
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2(words, wx2_rvecs, wx2_aids, wx2_fxs)
    if ut.VERBOSE or verbose:
        end_()
        print('[smk_index.rvec] L___ End Compute Residuals')
    return wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags
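
The residual vectors themselves come from smk_residuals; the underlying idea is that each descriptor assigned to a word is replaced by the normalized difference between the word center and the descriptor. Here is a toy sketch of that idea (the real compute_nonagg_rvecs also quantizes the result and flags degenerate cases, which this sketch ignores).

import numpy as np

def toy_residual_vectors(words, idx2_vec, wx_sublist, idxs_list):
    # One array of residual vectors per word: rvec = normalize(word - descriptor)
    rvecs_list = []
    for wx, idxs in zip(wx_sublist, idxs_list):
        diffs = words[wx][None, :] - idx2_vec[idxs]          # (num assigned, dim)
        norms = np.linalg.norm(diffs, axis=1, keepdims=True)
        norms[norms == 0] = 1.0                              # exact matches stay at the zero vector
        rvecs_list.append(diffs / norms)
    return rvecs_list

words = np.array([[1.0, 0.0], [0.0, 1.0]])                   # two toy visual words in 2-D
idx2_vec = np.array([[0.9, 0.1], [1.0, 0.0], [0.2, 0.8]])    # three toy descriptors
wx_sublist = [0, 1]
idxs_list = [np.array([0, 1]), np.array([2])]                # descriptor assignments per word
for wx, rvecs in zip(wx_sublist, toy_residual_vectors(words, idx2_vec, wx_sublist, idxs_list)):
    print(wx, rvecs.round(3).tolist())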
Example #11
def match_kernel(wx2_qrvecs, wx2_qfxs, invindex, qaid):
    """
    >>> from ibeis.model.hots.smk.smk import *  # NOQA
    >>> ibs, annots_df, taids, daids, qaids, nWords = testdata()
    >>> words = learn_visual_words(annots_df, taids, nWords)
    >>> invindex = index_data_annots(annots_df, daids, words)
    >>> qaid = qaids[0]
    >>> wx2_qfxs, wx2_qrvecs = compute_query_repr(annots_df, qaid, invindex)
    >>> daid2_totalscore = match_kernel(wx2_qrvecs, wx2_qfxs, invindex, qaid)
    """
    _daids = invindex.daids
    idx2_daid = invindex.idx2_daid
    wx2_drvecs = invindex.wx2_drvecs
    wx2_weight = invindex.wx2_weight
    daid2_gamma = invindex.daid2_gamma

    wx2_rvecs = wx2_qrvecs
    query_gamma = gamma_summation(wx2_rvecs, wx2_weight)

    # Accumulate scores over the entire database
    daid2_aggscore = pd.Series(np.zeros(len(_daids)), index=_daids, name='total_score')
    common_wxs = set(wx2_qrvecs.keys()).intersection(set(wx2_drvecs.keys()))

    daid2_wx2_scoremat = utool.ddict(lambda: utool.ddict(list))

    # for each word compute the pairwise scores between matches
    mark, end = utool.log_progress('query word: ', len(common_wxs), flushfreq=100)
    for count, wx in enumerate(common_wxs):
        if count % 100 == 0:
            mark(count)
        # Query and database vectors for wx-th word
        qrvecs = wx2_qrvecs[wx]
        drvecs = wx2_drvecs[wx]
        # Word Weight
        weight = wx2_weight[wx]
        # Compute score matrix
        qfx2_wscore = Match_N(qrvecs, drvecs)
        qfx2_wscore.groupby(idx2_daid)
        # Group scores by database annotation ids
        group = qfx2_wscore.groupby(idx2_daid, axis=1)
        for daid, scoremat in group:
            daid2_wx2_scoremat[daid][wx] = scoremat
        #qfx2_wscore = pd.DataFrame(qfx2_wscore_, index=qfxs, columns=_idxs)
        daid2_wscore = weight * qfx2_wscore.sum(axis=0).groupby(idx2_daid).sum()
        daid2_aggscore = daid2_aggscore.add(daid2_wscore, fill_value=0)
    daid2_totalscore = daid2_aggscore * daid2_gamma * query_gamma
    end()

    daid_fm = {}
    daid_fs = {}
    daid_fk = {}
    mark, end = utool.log_progress('accumulating match info: ', len(daid2_wx2_scoremat), flushfreq=100)
    for count, item in enumerate(daid2_wx2_scoremat.items()):
        daid, wx2_scoremat = item
        if count % 25 == 0:
            mark(count)
        fm_accum = []
        fs_accum = []
        fk_accum = []
        for wx, scoremat in six.iteritems(wx2_scoremat):
            qfxs = scoremat.index
            dfxs = invindex.idx2_dfx[scoremat.columns]
            fm_ = np.vstack(np.dstack(np.meshgrid(qfxs, dfxs, indexing='ij')))
            fs_ = scoremat.values.flatten()
            lower_thresh = 0.01
            valid = fs_ > lower_thresh
            fm = fm_[valid]
            fs = fs_[valid]
            fk = np.ones(len(fm), dtype=np.int32)
            fm_accum.append(fm)
            fs_accum.append(fs)
            fk_accum.append(fk)
        daid_fm[daid] = np.vstack(fm_accum)
        daid_fs[daid] = np.hstack(fs_accum).T
        daid_fk[daid] = np.hstack(fk_accum).T
    chipmatch = (daid_fm, daid_fs, daid_fk,)

    daid2_totalscore.sort(ascending=False)
    return daid2_totalscore, chipmatch
Example #12
def compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight,
                        alpha=3, thresh=0):
    """
    Internals step4

    Computes gamma normalization scalar for the database annotations
    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_rvecs, wx2_aids = smk_debug.testdata_raw_internals2()
    >>> alpha = ibs.cfg.query_cfg.smk_cfg.alpha
    >>> thresh = ibs.cfg.query_cfg.smk_cfg.thresh
    >>> idx2_daid  = invindex.idx2_daid
    >>> wx2_weight = wx2_idf
    >>> daids      = invindex.daids
    >>> use_cache  = USE_CACHE_GAMMA and False
    >>> daid2_gamma = compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight, daids, use_cache=use_cache)
    """
    # Grouping by aid and words
    wx_sublist = pdh.ensure_values(pdh.ensure_index(wx2_rvecs))
    if utool.VERBOSE:
        print('[smk_index] Compute Gamma alpha=%r, thresh=%r: ' % (alpha, thresh))
        mark1, end1_ = utool.log_progress(
            '[smk_index] Gamma Group: ', len(wx_sublist), flushfreq=100, writefreq=50)
    rvecs_list1 = pdh.ensure_values_subset(wx2_rvecs, wx_sublist)
    aids_list  = pdh.ensure_values_subset(wx2_aids, wx_sublist)
    daid2_wx2_drvecs = utool.ddict(lambda: utool.ddict(list))
    # Group by daids first and then by word index
    for wx, aids, rvecs in zip(wx_sublist, aids_list, rvecs_list1):
        group_aids, groupxs = smk_speed.group_indicies(aids)
        rvecs_group = smk_speed.apply_grouping(rvecs, groupxs)  # 2.9 ms
        for aid, rvecs_ in zip(group_aids, rvecs_group):
            daid2_wx2_drvecs[aid][wx] = rvecs_

    if utool.VERBOSE:
        end1_()

    # For every daid, compute its gamma using pregrouped rvecs
    # Summation over words for each aid
    if utool.VERBOSE:
        mark2, end2_ = utool.log_progress(
            '[smk_index] Gamma Sum: ', len(daid2_wx2_drvecs), flushfreq=100, writefreq=25)

    aid_list          = list(daid2_wx2_drvecs.keys())
    wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    aidwxs_list    = [list(wx2_aidrvecs.keys()) for wx2_aidrvecs in wx2_aidrvecs_list]
    aidrvecs_list  = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in wx2_aidrvecs_list]
    aidweight_list = [[wx2_weight[wx] for wx in aidwxs] for aidwxs in aidwxs_list]

    #gamma_list = []
    #for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list):
    #    assert len(weight_list) == len(rvecs_list), 'one list for each word'
    #    gamma = smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)  # 66.8 %
    #    #weight_list = np.ones(weight_list.size)
    #    gamma_list.append(gamma)
    gamma_list = [smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)
                  for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list)]

    daid2_gamma = pdh.IntSeries(gamma_list, index=aid_list, name='gamma')
    if utool.VERBOSE:
        end2_()
    return daid2_gamma
Example #13
def compute_data_gamma_(idx2_daid,
                        wx2_rvecs,
                        wx2_aids,
                        wx2_weight,
                        alpha=3,
                        thresh=0):
    """
    Internals step4

    Computes gamma normalization scalar for the database annotations
    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_rvecs, wx2_aids = smk_debug.testdata_raw_internals2()
    >>> alpha = ibs.cfg.query_cfg.smk_cfg.alpha
    >>> thresh = ibs.cfg.query_cfg.smk_cfg.thresh
    >>> idx2_daid  = invindex.idx2_daid
    >>> wx2_weight = wx2_idf
    >>> daids      = invindex.daids
    >>> use_cache  = USE_CACHE_GAMMA and False
    >>> daid2_gamma = compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight, daids, use_cache=use_cache)
    """
    # Grouping by aid and words
    wx_sublist = pdh.ensure_values(pdh.ensure_index(wx2_rvecs))
    if utool.VERBOSE:
        print('[smk_index] Compute Gamma alpha=%r, thresh=%r: ' %
              (alpha, thresh))
        mark1, end1_ = utool.log_progress('[smk_index] Gamma Group: ',
                                          len(wx_sublist),
                                          flushfreq=100,
                                          writefreq=50)
    rvecs_list1 = pdh.ensure_values_subset(wx2_rvecs, wx_sublist)
    aids_list = pdh.ensure_values_subset(wx2_aids, wx_sublist)
    daid2_wx2_drvecs = utool.ddict(lambda: utool.ddict(list))
    # Group by daids first and then by word index
    for wx, aids, rvecs in zip(wx_sublist, aids_list, rvecs_list1):
        group_aids, groupxs = smk_speed.group_indicies(aids)
        rvecs_group = smk_speed.apply_grouping(rvecs, groupxs)  # 2.9 ms
        for aid, rvecs_ in zip(group_aids, rvecs_group):
            daid2_wx2_drvecs[aid][wx] = rvecs_

    if utool.VERBOSE:
        end1_()

    # For every daid, compute its gamma using pregrouped rvecs
    # Summation over words for each aid
    if utool.VERBOSE:
        mark2, end2_ = utool.log_progress('[smk_index] Gamma Sum: ',
                                          len(daid2_wx2_drvecs),
                                          flushfreq=100,
                                          writefreq=25)

    aid_list = list(daid2_wx2_drvecs.keys())
    wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    aidwxs_list = [
        list(wx2_aidrvecs.keys()) for wx2_aidrvecs in wx2_aidrvecs_list
    ]
    aidrvecs_list = [
        list(wx2_aidrvecs.values()) for wx2_aidrvecs in wx2_aidrvecs_list
    ]
    aidweight_list = [[wx2_weight[wx] for wx in aidwxs]
                      for aidwxs in aidwxs_list]

    #gamma_list = []
    #for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list):
    #    assert len(weight_list) == len(rvecs_list), 'one list for each word'
    #    gamma = smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)  # 66.8 %
    #    #weight_list = np.ones(weight_list.size)
    #    gamma_list.append(gamma)
    gamma_list = [
        smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)
        for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list)
    ]

    daid2_gamma = pdh.IntSeries(gamma_list, index=aid_list, name='gamma')
    if utool.VERBOSE:
        end2_()
    return daid2_gamma
Example #14
def compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_idf,
                        alpha=3, thresh=0):
    """
    Computes gamma normalization scalar for the database annotations
    Internals step4
    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_rvecs, wx2_aids = smk_debug.testdata_raw_internals2()
    >>> alpha = ibs.cfg.query_cfg.smk_cfg.alpha
    >>> thresh = ibs.cfg.query_cfg.smk_cfg.thresh
    >>> idx2_daid  = invindex.idx2_daid
    >>> wx2_idf = wx2_idf
    >>> daids      = invindex.daids
    >>> use_cache  = USE_CACHE_GAMMA and False
    >>> daid2_gamma = compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_idf, daids, use_cache=use_cache)
    """
    if utool.DEBUG2:
        from ibeis.model.hots.smk import smk_debug
        smk_debug.rrr()
        smk_debug.check_wx2(wx2_rvecs=wx2_rvecs, wx2_aids=wx2_aids)
    wx_sublist = pdh.ensure_values(pdh.ensure_index(wx2_rvecs))
    if utool.VERBOSE:
        print('[smk_index] Compute Gamma alpha=%r, thresh=%r: ' % (alpha, thresh))
        mark1, end1_ = utool.log_progress(
            '[smk_index] Gamma group (by word): ', len(wx_sublist),
            flushfreq=100, writefreq=50, with_totaltime=True)
    # Get list of aids and rvecs w.r.t. words
    aids_list = pdh.ensure_values_subset(wx2_aids, wx_sublist)
    rvecs_list1 = pdh.ensure_values_subset(wx2_rvecs, wx_sublist)
    # Group by daids first and then by word index
    daid2_wx2_drvecs = utool.ddict(lambda: utool.ddict(list))
    for wx, aids, rvecs in zip(wx_sublist, aids_list, rvecs_list1):
        group_aids, groupxs = clustertool.group_indicies(aids)
        rvecs_group = clustertool.apply_grouping(rvecs, groupxs)  # 2.9 ms
        for aid, rvecs_ in zip(group_aids, rvecs_group):
            daid2_wx2_drvecs[aid][wx] = rvecs_

    if utool.VERBOSE:
        end1_()

    # For every daid, compute its gamma using pregrouped rvecs
    # Summation over words for each aid
    if utool.VERBOSE:
        mark2, end2_ = utool.log_progress(
            '[smk_index] Gamma Sum (over daid): ', len(daid2_wx2_drvecs),
            flushfreq=100, writefreq=25, with_totaltime=True)
    # Get lists w.r.t daids
    aid_list          = list(daid2_wx2_drvecs.keys())
    # list of mappings from words to rvecs for each daid
    # [wx2_aidrvecs_1, ..., wx2_aidrvecs_nDaids,]
    _wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    _aidwxs_iter    = (list(wx2_aidrvecs.keys()) for wx2_aidrvecs in _wx2_aidrvecs_list)
    aidrvecs_list  = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in _wx2_aidrvecs_list]
    aididf_list = [[wx2_idf[wx] for wx in aidwxs] for aidwxs in _aidwxs_iter]

    #gamma_list = []
    if utool.DEBUG2:
        try:
            for count, (idf_list, rvecs_list) in enumerate(zip(aididf_list, aidrvecs_list)):
                assert len(idf_list) == len(rvecs_list), 'one list for each word'
                #gamma = smk_core.gamma_summation2(rvecs_list, idf_list, alpha, thresh)
        except Exception as ex:
            utool.printex(ex)
            utool.embed()
            raise
    gamma_list = [smk_core.gamma_summation2(rvecs_list, idf_list, alpha, thresh)
                  for idf_list, rvecs_list in zip(aididf_list, aidrvecs_list)]

    if WITH_PANDAS:
        daid2_gamma = pdh.IntSeries(gamma_list, index=aid_list, name='gamma')
    else:
        daid2_gamma = dict(zip(aid_list, gamma_list))
    if utool.VERBOSE:
        end2_()

    return daid2_gamma
Example #15
def compute_data_gamma_(idx2_daid,
                        wx2_rvecs,
                        wx2_aids,
                        wx2_idf,
                        alpha=3,
                        thresh=0):
    """
    Computes gamma normalization scalar for the database annotations
    Internals step4
    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_rvecs, wx2_aids = smk_debug.testdata_raw_internals2()
    >>> alpha = ibs.cfg.query_cfg.smk_cfg.alpha
    >>> thresh = ibs.cfg.query_cfg.smk_cfg.thresh
    >>> idx2_daid  = invindex.idx2_daid
    >>> wx2_idf = wx2_idf
    >>> daids      = invindex.daids
    >>> use_cache  = USE_CACHE_GAMMA and False
    >>> daid2_gamma = compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_idf, daids, use_cache=use_cache)
    """
    if utool.DEBUG2:
        from ibeis.model.hots.smk import smk_debug
        smk_debug.rrr()
        smk_debug.check_wx2(wx2_rvecs=wx2_rvecs, wx2_aids=wx2_aids)
    wx_sublist = pdh.ensure_values(pdh.ensure_index(wx2_rvecs))
    if utool.VERBOSE:
        print('[smk_index] Compute Gamma alpha=%r, thresh=%r: ' %
              (alpha, thresh))
        mark1, end1_ = utool.log_progress(
            '[smk_index] Gamma group (by word): ',
            len(wx_sublist),
            flushfreq=100,
            writefreq=50,
            with_totaltime=True)
    # Get list of aids and rvecs w.r.t. words
    aids_list = pdh.ensure_values_subset(wx2_aids, wx_sublist)
    rvecs_list1 = pdh.ensure_values_subset(wx2_rvecs, wx_sublist)
    # Group by daids first and then by word index
    daid2_wx2_drvecs = utool.ddict(lambda: utool.ddict(list))
    for wx, aids, rvecs in zip(wx_sublist, aids_list, rvecs_list1):
        group_aids, groupxs = clustertool.group_indicies(aids)
        rvecs_group = clustertool.apply_grouping(rvecs, groupxs)  # 2.9 ms
        for aid, rvecs_ in zip(group_aids, rvecs_group):
            daid2_wx2_drvecs[aid][wx] = rvecs_

    if utool.VERBOSE:
        end1_()

    # For every daid, compute its gamma using pregrouped rvecs
    # Summation over words for each aid
    if utool.VERBOSE:
        mark2, end2_ = utool.log_progress(
            '[smk_index] Gamma Sum (over daid): ',
            len(daid2_wx2_drvecs),
            flushfreq=100,
            writefreq=25,
            with_totaltime=True)
    # Get lists w.r.t daids
    aid_list = list(daid2_wx2_drvecs.keys())
    # list of mappings from words to rvecs for each daid
    # [wx2_aidrvecs_1, ..., wx2_aidrvecs_nDaids,]
    _wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    _aidwxs_iter = (list(wx2_aidrvecs.keys())
                    for wx2_aidrvecs in _wx2_aidrvecs_list)
    aidrvecs_list = [
        list(wx2_aidrvecs.values()) for wx2_aidrvecs in _wx2_aidrvecs_list
    ]
    aididf_list = [[wx2_idf[wx] for wx in aidwxs] for aidwxs in _aidwxs_iter]

    #gamma_list = []
    if utool.DEBUG2:
        try:
            for count, (idf_list, rvecs_list) in enumerate(
                    zip(aididf_list, aidrvecs_list)):
                assert len(idf_list) == len(
                    rvecs_list), 'one list for each word'
                #gamma = smk_core.gamma_summation2(rvecs_list, idf_list, alpha, thresh)
        except Exception as ex:
            utool.printex(ex)
            utool.embed()
            raise
    gamma_list = [
        smk_core.gamma_summation2(rvecs_list, idf_list, alpha, thresh)
        for idf_list, rvecs_list in zip(aididf_list, aidrvecs_list)
    ]

    if WITH_PANDAS:
        daid2_gamma = pdh.IntSeries(gamma_list, index=aid_list, name='gamma')
    else:
        daid2_gamma = dict(zip(aid_list, gamma_list))
    if utool.VERBOSE:
        end2_()

    return daid2_gamma
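
The clustertool.group_indicies / apply_grouping pair used throughout these examples partitions a flat array by annotation id and then gathers the corresponding rows. Below is a small NumPy equivalent, assuming the helpers behave roughly like this (a sketch, not the actual clustertool code).

import numpy as np

def group_indices(idx2_groupid):
    # Return (unique ids, list of index arrays), one index array per unique id
    keys = np.asarray(idx2_groupid)
    unique_ids = np.unique(keys)
    groupxs = [np.flatnonzero(keys == uid) for uid in unique_ids]
    return unique_ids, groupxs

def apply_grouping(items, groupxs):
    # Gather rows of `items` according to the precomputed index groups
    items = np.asarray(items)
    return [items.take(xs, axis=0) for xs in groupxs]

aids = np.array([4, 2, 4, 4, 2])
rvecs = np.arange(10).reshape(5, 2)          # one 2-D row per word assignment
group_aids, groupxs = group_indices(aids)
for aid, rvecs_ in zip(group_aids, apply_grouping(rvecs, groupxs)):
    print(aid, rvecs_.tolist())
# 2 [[2, 3], [8, 9]]
# 4 [[0, 1], [4, 5], [6, 7]]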