Example #1
def compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight,
                        alpha=3, thresh=0):
    """
    Internals step4

    Computes gamma normalization scalar for the database annotations
    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_rvecs, wx2_aids = smk_debug.testdata_raw_internals2()
    >>> alpha = ibs.cfg.query_cfg.smk_cfg.alpha
    >>> thresh = ibs.cfg.query_cfg.smk_cfg.thresh
    >>> idx2_daid  = invindex.idx2_daid
    >>> wx2_weight = wx2_idf
    >>> daids      = invindex.daids
    >>> use_cache  = USE_CACHE_GAMMA and False
    >>> daid2_gamma = compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight, alpha, thresh)
    """
    # Grouping by aid and words
    wx_sublist = pdh.ensure_values(pdh.ensure_index(wx2_rvecs))
    if utool.VERBOSE:
        print('[smk_index] Compute Gamma alpha=%r, thresh=%r: ' % (alpha, thresh))
        mark1, end1_ = utool.log_progress(
            '[smk_index] Gamma Group: ', len(wx_sublist), flushfreq=100, writefreq=50)
    rvecs_list1 = pdh.ensure_values_subset(wx2_rvecs, wx_sublist)
    aids_list  = pdh.ensure_values_subset(wx2_aids, wx_sublist)
    daid2_wx2_drvecs = utool.ddict(lambda: utool.ddict(list))
    # Group by daids first and then by word index
    for wx, aids, rvecs in zip(wx_sublist, aids_list, rvecs_list1):
        group_aids, groupxs = smk_speed.group_indicies(aids)
        rvecs_group = smk_speed.apply_grouping(rvecs, groupxs)  # 2.9 ms
        for aid, rvecs_ in zip(group_aids, rvecs_group):
            daid2_wx2_drvecs[aid][wx] = rvecs_

    if utool.VERBOSE:
        end1_()

    # For every daid, compute its gamma using pregrouped rvecs
    # Summation over words for each aid
    if utool.VERBOSE:
        mark2, end2_ = utool.log_progress(
            '[smk_index] Gamma Sum: ', len(daid2_wx2_drvecs), flushfreq=100, writefreq=25)

    aid_list          = list(daid2_wx2_drvecs.keys())
    wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    aidwxs_list    = [list(wx2_aidrvecs.keys()) for wx2_aidrvecs in wx2_aidrvecs_list]
    aidrvecs_list  = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in wx2_aidrvecs_list]
    aidweight_list = [[wx2_weight[wx] for wx in aidwxs] for aidwxs in aidwxs_list]

    #gamma_list = []
    #for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list):
    #    assert len(weight_list) == len(rvecs_list), 'one list for each word'
    #    gamma = smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)  # 66.8 %
    #    #weight_list = np.ones(weight_list.size)
    #    gamma_list.append(gamma)
    gamma_list = [smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)
                  for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list)]

    daid2_gamma = pdh.IntSeries(gamma_list, index=aid_list, name='gamma')
    if utool.VERBOSE:
        end2_()
    return daid2_gamma
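
The grouping loop above leans on smk_speed.group_indicies and smk_speed.apply_grouping to split each word's residual vectors by annotation id. A rough, self-contained numpy sketch of that kind of grouping is shown below; the helper name group_by_key is hypothetical, and the real smk_speed helpers may differ in signature and edge-case handling.

import numpy as np

def group_by_key(keys, values):
    # Hypothetical stand-in for smk_speed.group_indicies + apply_grouping:
    # split the rows of `values` into groups that share the same key.
    keys = np.asarray(keys)
    sortx = keys.argsort(kind='mergesort')         # stable sort keeps within-group order
    sorted_keys = keys[sortx]
    unique_keys, starts = np.unique(sorted_keys, return_index=True)
    groupxs = np.split(sortx, starts[1:])          # one index array per unique key
    return unique_keys, [values[xs] for xs in groupxs]

# Toy data: residual vectors for one word, labelled by annotation id
aids  = np.array([7, 3, 7, 3, 9])
rvecs = np.arange(10).reshape(5, 2)
group_aids, rvecs_group = group_by_key(aids, rvecs)
# group_aids -> [3 7 9]; rvecs_group[0] stacks the rvecs rows where aid == 3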
Example #2
def compute_data_gamma_(idx2_daid,
                        wx2_rvecs,
                        wx2_aids,
                        wx2_weight,
                        alpha=3,
                        thresh=0):
    """
    Internals step4

    Computes gamma normalization scalar for the database annotations
    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_rvecs, wx2_aids = smk_debug.testdata_raw_internals2()
    >>> alpha = ibs.cfg.query_cfg.smk_cfg.alpha
    >>> thresh = ibs.cfg.query_cfg.smk_cfg.thresh
    >>> idx2_daid  = invindex.idx2_daid
    >>> wx2_weight = wx2_idf
    >>> daids      = invindex.daids
    >>> use_cache  = USE_CACHE_GAMMA and False
    >>> daid2_gamma = compute_data_gamma_(idx2_daid, wx2_rvecs, wx2_aids, wx2_weight, alpha, thresh)
    """
    # Grouping by aid and words
    wx_sublist = pdh.ensure_values(pdh.ensure_index(wx2_rvecs))
    if utool.VERBOSE:
        print('[smk_index] Compute Gamma alpha=%r, thresh=%r: ' %
              (alpha, thresh))
        mark1, end1_ = utool.log_progress('[smk_index] Gamma Group: ',
                                          len(wx_sublist),
                                          flushfreq=100,
                                          writefreq=50)
    rvecs_list1 = pdh.ensure_values_subset(wx2_rvecs, wx_sublist)
    aids_list = pdh.ensure_values_subset(wx2_aids, wx_sublist)
    daid2_wx2_drvecs = utool.ddict(lambda: utool.ddict(list))
    # Group by daids first and then by word index
    for wx, aids, rvecs in zip(wx_sublist, aids_list, rvecs_list1):
        group_aids, groupxs = smk_speed.group_indicies(aids)
        rvecs_group = smk_speed.apply_grouping(rvecs, groupxs)  # 2.9 ms
        for aid, rvecs_ in zip(group_aids, rvecs_group):
            daid2_wx2_drvecs[aid][wx] = rvecs_

    if utool.VERBOSE:
        end1_()

    # For every daid, compute its gamma using pregrouped rvecs
    # Summation over words for each aid
    if utool.VERBOSE:
        mark2, end2_ = utool.log_progress('[smk_index] Gamma Sum: ',
                                          len(daid2_wx2_drvecs),
                                          flushfreq=100,
                                          writefreq=25)

    aid_list = list(daid2_wx2_drvecs.keys())
    wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
    aidwxs_list = [
        list(wx2_aidrvecs.keys()) for wx2_aidrvecs in wx2_aidrvecs_list
    ]
    aidrvecs_list = [
        list(wx2_aidrvecs.values()) for wx2_aidrvecs in wx2_aidrvecs_list
    ]
    aidweight_list = [[wx2_weight[wx] for wx in aidwxs]
                      for aidwxs in aidwxs_list]

    #gamma_list = []
    #for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list):
    #    assert len(weight_list) == len(rvecs_list), 'one list for each word'
    #    gamma = smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)  # 66.8 %
    #    #weight_list = np.ones(weight_list.size)
    #    gamma_list.append(gamma)
    gamma_list = [
        smk_core.gamma_summation2(rvecs_list, weight_list, alpha, thresh)
        for weight_list, rvecs_list in zip(aidweight_list, aidrvecs_list)
    ]

    daid2_gamma = pdh.IntSeries(gamma_list, index=aid_list, name='gamma')
    if utool.VERBOSE:
        end2_()
    return daid2_gamma
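
For intuition, the scalar computed per annotation is the SMK self-similarity normalizer: the inverse square root of the idf-weighted, thresholded self-match score of the annotation's residual vectors, summed over its words. The sketch below is a hedged reading of what smk_core.gamma_summation2 might compute; the selectivity function and its form are taken from the SMK literature and have not been verified against smk_core.

import numpy as np

def selectivity(cos_sims, alpha=3, thresh=0):
    # Assumed selectivity: signed power, with scores at or below thresh zeroed out
    scores = np.sign(cos_sims) * np.abs(cos_sims) ** alpha
    scores[cos_sims <= thresh] = 0
    return scores

def gamma_sketch(rvecs_list, weight_list, alpha=3, thresh=0):
    # gamma(X) = 1 / sqrt( sum_w idf(w) * sum(sel(Phi_w Phi_w^T)) )  (assumed form)
    total = 0.0
    for rvecs, weight in zip(rvecs_list, weight_list):
        phi = rvecs.astype(np.float64)
        total += weight * selectivity(phi.dot(phi.T), alpha, thresh).sum()
    return 1.0 / np.sqrt(total) if total > 0 else 1.0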
Example #3
def assign_to_words_(wordflann, words, idx2_vec, idx_name='idx', dense=True,
                     nAssign=1, with_pandas=WITH_PANDAS):
    """
    Assigns descriptor-vectors to nearest word. Returns forward and inverted index.

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex = smk_debug.testdata_raw_internals0()
    >>> words  = invindex.words
    >>> wordflann = invindex.wordflann
    >>> idx2_vec  = invindex.idx2_dvec
    >>> dense = True
    >>> nAssign = ibs.cfg.query_cfg.smk_cfg.nAssign
    >>> idx_name, series_name = 'idx', 'wx2_idxs'
    >>> _dbargs = (wordflann, words, idx2_vec, idx_name, dense, nAssign)
    >>> wx2_idxs, idx2_wx = assign_to_words_(*_dbargs)
    """
    idx2_vec_values = pdh.ensure_values(idx2_vec)
    # Find each vector's nearest word
    #TODO: multiple assignment
    _idx2_wx, _idx2_wdist = wordflann.nn_index(idx2_vec_values, nAssign)
    if nAssign > 1:
        #((words[_idx2_wx[:,0]].astype(np.float64) - idx2_vec_values.astype(np.float64)) ** 2).sum(axis=0)
        #_idx2_wdist[:,0]
        #np.sqrt(((words[_idx2_wx[:,0]].astype(np.float64) - idx2_vec_values.astype(np.float64)) ** 2).sum(axis=0))
        # multi-assignment filtering as in
        # http://lear.inrialpes.fr/pubs/2010/JDS10a/jegou_improvingbof_preprint.pdf
        alpha = 1.2
        thresh = alpha * _idx2_wdist.T[0:1].T
        invalid = _idx2_wdist >= thresh
        # Weighting as in Lost in Quantization
        sigma = 80
        unnorm_weight = np.exp(np.divide(-_idx2_wdist.astype(np.float64), 2 * (sigma ** 2)))
        masked_weight = np.ma.masked_array(unnorm_weight, mask=invalid)
        weight = masked_weight / masked_weight.sum(axis=1)[:, np.newaxis]
        masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
        idx2_wxs = map(utool.filter_Nones, masked_wxs.tolist())
        idx2_wx_weights = map(utool.filter_Nones, weight.tolist())

        #masked_weight1 = np.ma.masked_array(_idx2_wdist, mask=invalid)
        #weight1 = masked_weight1 / masked_weight1.sum(axis=1)[:, np.newaxis]

    # multiple assignment weight: exp(-(d ** 2) / (2 * sigma ** 2))
    # The nearest distance d_0 defines the cutoff: only assignments with
    # distance less than alpha * d_0 are kept, where alpha = 1.2
    PANDAS_GROUP = True or with_pandas
    # Compute inverted index
    if PANDAS_GROUP:
        # Pandas grouping seems to be faster in this instance
        word_assignments = pd.DataFrame(_idx2_wx, columns=['wx'])  # 141 us
        word_group = word_assignments.groupby('wx')  # 34.5 us
        _wx2_idxs = word_group['wx'].indices  # 8.6 us
    else:
        idx2_idx = np.arange(len(idx2_vec))
        wx_list, groupxs = smk_speed.group_indicies(_idx2_wx)  # 5.52 ms
        idxs_list = smk_speed.apply_grouping(idx2_idx, groupxs)  # 2.9 ms
        _wx2_idxs = dict(zip(wx_list, idxs_list))  # 753 us
    #
    if with_pandas:
        idx_series = pdh.ensure_index(idx2_vec)
        wx_series  = pdh.ensure_index(words)
        wx2_idxs = pdh.pandasify_dict1d(
            _wx2_idxs, wx_series, idx_name, ('wx2_' + idx_name + 's'), dense=dense)  # 274 ms 97.4 %
        idx2_wx = pdh.IntSeries(_idx2_wx, index=idx_series, name='wx')
    else:
        if dense:
            wx2_idxs = {
                wx: _wx2_idxs[wx].astype(INDEX_TYPE)
                if wx in _wx2_idxs else
                np.empty(0, dtype=INDEX_TYPE)
                for wx in range(len(words))
            }
            #wx2_idxs = _wx2_idxs
            #for wx in range(len(words)):
            #    if wx not in wx2_idxs:
            #        wx2_idxs[wx] = np.empty(0, dtype=INDEX_TYPE)
        else:
            wx2_idxs = _wx2_idxs
        idx2_wx  = _idx2_wx
    return wx2_idxs, idx2_wx
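
The nAssign > 1 branch combines the two ideas its comments cite: assignments farther than alpha * d_0 from the nearest word are discarded (Jegou et al.), and the survivors receive a Gaussian weight of their distance, renormalized per descriptor (as in "Lost in Quantization"). A small self-contained illustration with made-up distances:

import numpy as np

# Made-up (squared) distances to the 3 nearest words for 2 descriptors
_idx2_wdist = np.array([[100., 130., 300.],
                        [ 50.,  55., 200.]])
alpha, sigma = 1.2, 80

thresh  = alpha * _idx2_wdist[:, 0:1]                 # per-descriptor cutoff alpha * d_0
invalid = _idx2_wdist >= thresh                       # drop assignments past the cutoff
unnorm  = np.exp(-_idx2_wdist / (2 * sigma ** 2))     # Gaussian distance weighting
weight  = np.ma.masked_array(unnorm, mask=invalid)
weight  = weight / weight.sum(axis=1)[:, np.newaxis]  # renormalize surviving weights
# Row 0: only the nearest word survives (130 >= 1.2 * 100), so its weight is 1.0
# Row 1: the two nearest words survive and share the weight almost evenly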
Example #4
def assign_to_words_(wordflann,
                     words,
                     idx2_vec,
                     idx_name='idx',
                     dense=True,
                     nAssign=1,
                     with_pandas=WITH_PANDAS):
    """
    Assigns descriptor-vectors to nearest word. Returns forward and inverted index.

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex = smk_debug.testdata_raw_internals0()
    >>> words  = invindex.words
    >>> wordflann = invindex.wordflann
    >>> idx2_vec  = invindex.idx2_dvec
    >>> dense = True
    >>> nAssign = ibs.cfg.query_cfg.smk_cfg.nAssign
    >>> idx_name, series_name = 'idx', 'wx2_idxs'
    >>> _dbargs = (wordflann, words, idx2_vec, idx_name, dense, nAssign)
    >>> wx2_idxs, idx2_wx = assign_to_words_(*_dbargs)
    """
    idx2_vec_values = pdh.ensure_values(idx2_vec)
    # Find each vector's nearest word
    #TODO: multiple assignment
    _idx2_wx, _idx2_wdist = wordflann.nn_index(idx2_vec_values, nAssign)
    if nAssign > 1:
        #((words[_idx2_wx[:,0]].astype(np.float64) - idx2_vec_values.astype(np.float64)) ** 2).sum(axis=0)
        #_idx2_wdist[:,0]
        #np.sqrt(((words[_idx2_wx[:,0]].astype(np.float64) - idx2_vec_values.astype(np.float64)) ** 2).sum(axis=0))
        # multi-assignment filtering as in
        # http://lear.inrialpes.fr/pubs/2010/JDS10a/jegou_improvingbof_preprint.pdf
        alpha = 1.2
        thresh = alpha * _idx2_wdist.T[0:1].T
        invalid = _idx2_wdist >= thresh
        # Weighting as in Lost in Quantization
        sigma = 80
        unnorm_weight = np.exp(
            np.divide(-_idx2_wdist.astype(np.float64), 2 * (sigma**2)))
        masked_weight = np.ma.masked_array(unnorm_weight, mask=invalid)
        weight = masked_weight / masked_weight.sum(axis=1)[:, np.newaxis]
        masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
        idx2_wxs = map(utool.filter_Nones, masked_wxs.tolist())
        idx2_wx_weights = map(utool.filter_Nones, weight.tolist())

        #masked_weight1 = np.ma.masked_array(_idx2_wdist, mask=invalid)
        #weight1 = masked_weight1 / masked_weight1.sum(axis=1)[:, np.newaxis]

    # multiple assignment weight: exp(-(d ** 2) / (2 * sigma ** 2))
    # The nearest distance d_0 defines the cutoff: only assignments with
    # distance less than alpha * d_0 are kept, where alpha = 1.2
    PANDAS_GROUP = True or with_pandas
    # Compute inverted index
    if PANDAS_GROUP:
        # Pandas grouping seems to be faster in this instance
        word_assignments = pd.DataFrame(_idx2_wx, columns=['wx'])  # 141 us
        word_group = word_assignments.groupby('wx')  # 34.5 us
        _wx2_idxs = word_group['wx'].indices  # 8.6 us
    else:
        idx2_idx = np.arange(len(idx2_vec))
        wx_list, groupxs = smk_speed.group_indicies(_idx2_wx)  # 5.52 ms
        idxs_list = smk_speed.apply_grouping(idx2_idx, groupxs)  # 2.9 ms
        _wx2_idxs = dict(zip(wx_list, idxs_list))  # 753 us
    #
    if with_pandas:
        idx_series = pdh.ensure_index(idx2_vec)
        wx_series = pdh.ensure_index(words)
        wx2_idxs = pdh.pandasify_dict1d(_wx2_idxs,
                                        wx_series,
                                        idx_name, ('wx2_' + idx_name + 's'),
                                        dense=dense)  # 274 ms 97.4 %
        idx2_wx = pdh.IntSeries(_idx2_wx, index=idx_series, name='wx')
    else:
        if dense:
            wx2_idxs = {
                wx: _wx2_idxs[wx].astype(INDEX_TYPE)
                if wx in _wx2_idxs else np.empty(0, dtype=INDEX_TYPE)
                for wx in range(len(words))
            }
            #wx2_idxs = _wx2_idxs
            #for wx in range(len(words)):
            #    if wx not in wx2_idxs:
            #        wx2_idxs[wx] = np.empty(0, dtype=INDEX_TYPE)
        else:
            wx2_idxs = _wx2_idxs
        idx2_wx = _idx2_wx
    return wx2_idxs, idx2_wx
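
Both grouping branches build the same inverted index, a mapping from word index wx to the descriptor rows assigned to it; only the machinery differs. A minimal comparison on toy assignments (not using the smk_speed helpers themselves):

import numpy as np
import pandas as pd

_idx2_wx = np.array([2, 0, 2, 5, 0])   # toy word assignment for 5 descriptor rows

# Pandas route (PANDAS_GROUP branch): groupby exposes positional indices per word
word_assignments = pd.DataFrame(_idx2_wx, columns=['wx'])
wx2_idxs_pd = word_assignments.groupby('wx')['wx'].indices
# -> {0: array([1, 4]), 2: array([0, 2]), 5: array([3])}

# Plain numpy route (the else branch), written without the smk_speed helpers
wx2_idxs_np = {wx: np.flatnonzero(_idx2_wx == wx) for wx in np.unique(_idx2_wx)}
assert all(np.array_equal(wx2_idxs_pd[wx], wx2_idxs_np[wx]) for wx in wx2_idxs_np)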