Example 1
def compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list):
    """
    Driver function for agg residual computation

    Sums and normalizes all rvecs that belong to the same word and the same
    annotation id

    Args:
        rvecs_list (list): residual vectors grouped by word
        idxs_list (list): stacked descriptor indexes grouped by word
        aids_list (list): annotation rowid for each stacked descriptor index
        maws_list (list): multi assign weights

    Returns:
        tuple : (aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list)

    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals --test-compute_agg_rvecs

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> from ibeis.algo.hots.smk import smk_residuals
        >>> words, wx_sublist, aids_list, idxs_list, idx2_vec, maws_list = smk_debug.testdata_nonagg_rvec()
        >>> rvecs_list, flags_list = smk_residuals.compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list)
        >>> tup = compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list)
        >>> aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list = tup
        >>> ut.assert_eq(len(wx_sublist), len(rvecs_list))

    """
    #assert len(idxs_list) == len(rvecs_list)
    # Group members of each word by aid; we will collapse these groups
    grouptup_list = [clustertool.group_indices(aids) for aids in aids_list]
    # Agg aids
    aggaids_list = [tup[0] for tup in grouptup_list]
    groupxs_list = [tup[1] for tup in grouptup_list]
    # Aggregate vecs that belong to the same aid, for each word
    # (weighted aggregation with multi-assign-weights)
    aggvecs_list = [
        np.vstack([
            aggregate_rvecs(rvecs.take(xs, axis=0), maws.take(xs))
            for xs in groupxs
        ]) if len(groupxs) > 0 else np.empty(
            (0, hstypes.VEC_DIM), dtype=hstypes.FLOAT_TYPE)
        for rvecs, maws, groupxs in zip(rvecs_list, maws_list, groupxs_list)
    ]
    # Agg idxs
    aggidxs_list = [[idxs.take(xs) for xs in groupxs]
                    for idxs, groupxs in zip(idxs_list, groupxs_list)]
    aggmaws_list = [
        np.array([maws.take(xs).prod() for xs in groupxs])
        for maws, groupxs in zip(maws_list, groupxs_list)
    ]
    # Need to recompute flags for consistency
    # flag is true when aggvec is all zeros
    aggflags_list = [~np.any(aggvecs, axis=1) for aggvecs in aggvecs_list]
    return aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list
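
For intuition, here is a minimal numpy-only sketch of the per-word aggregation step above. The helpers _group_indices and _aggregate_rvecs are hypothetical stand-ins: clustertool.group_indices is assumed to return unique keys plus index groups, and aggregate_rvecs is assumed to be a maw-weighted sum followed by L2 normalization.

import numpy as np

def _group_indices(arr):
    # Stand-in for clustertool.group_indices: unique keys and their positions
    keys = np.unique(arr)
    groupxs = [np.flatnonzero(arr == k) for k in keys]
    return keys, groupxs

def _aggregate_rvecs(rvecs, maws):
    # Weighted sum of residual vectors, then L2-normalize the result
    agg = (rvecs * maws[:, None]).sum(axis=0)
    norm = np.linalg.norm(agg)
    return agg / norm if norm > 0 else agg

aids  = np.array([1, 1, 2])                     # annotation id per residual
rvecs = np.random.rand(3, 8).astype(np.float32)
maws  = np.array([1.0, 0.5, 1.0])

aggaids, groupxs = _group_indices(aids)
aggvecs = np.vstack([_aggregate_rvecs(rvecs.take(xs, axis=0), maws.take(xs))
                     for xs in groupxs])
print(aggaids.shape, aggvecs.shape)             # (2,) (2, 8)
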
Example 2
def group_correspondences(all_matches, all_scores, all_daids, daid2_sccw):
    daid_keys, groupxs = clustertool.group_indices(all_daids)
    fs_list = clustertool.apply_grouping(all_scores, groupxs)
    fm_list = clustertool.apply_grouping(all_matches, groupxs)
    daid2_fm = {daid: fm for daid, fm in zip(daid_keys, fm_list)}
    daid2_fs = {daid: fs * daid2_sccw[daid] for daid, fs in zip(daid_keys, fs_list)}
    # FIXME: generalize to when nAssign > 1
    daid2_fk = {daid: np.ones(fs.size, dtype=hstypes.FK_DTYPE) for daid, fs in zip(daid_keys, fs_list)}
    daid2_chipmatch = (daid2_fm, daid2_fs, daid2_fk)
    return daid2_chipmatch
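
A rough plain-numpy equivalent of the grouping performed here, assuming clustertool.group_indices and clustertool.apply_grouping behave like np.unique plus fancy indexing (a sketch under those assumptions, not the library's implementation):

import numpy as np

all_daids  = np.array([4, 7, 4, 7, 7])
all_scores = np.array([0.2, 0.5, 0.1, 0.3, 0.4])
daid2_sccw = {4: 2.0, 7: 0.5}                  # made-up normalizers

daid_keys = np.unique(all_daids)
groupxs   = [np.flatnonzero(all_daids == d) for d in daid_keys]
daid2_fs  = {d: all_scores.take(xs) * daid2_sccw[d]
             for d, xs in zip(daid_keys, groupxs)}
print(daid2_fs)                                # per-daid scores scaled by sccw
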
Example 3
def compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list):
    """
    Driver function for agg residual computation

    Sums and normalizes all rvecs that belong to the same word and the same
    annotation id

    Args:
        rvecs_list (list): residual vectors grouped by word
        idxs_list (list): stacked descriptor indexes grouped by word
        aids_list (list): annotation rowid for each stacked descriptor index
        maws_list (list): multi assign weights

    Returns:
        tuple : (aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list)

    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals --test-compute_agg_rvecs

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> from ibeis.algo.hots.smk import smk_residuals
        >>> words, wx_sublist, aids_list, idxs_list, idx2_vec, maws_list = smk_debug.testdata_nonagg_rvec()
        >>> rvecs_list, flags_list = smk_residuals.compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list)
        >>> tup = compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list)
        >>> aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list = tup
        >>> ut.assert_eq(len(wx_sublist), len(rvecs_list))

    """
    #assert len(idxs_list) == len(rvecs_list)
    # Group members of each word by aid; we will collapse these groups
    grouptup_list = [clustertool.group_indices(aids) for aids in aids_list]
    # Agg aids
    aggaids_list = [tup[0] for tup in grouptup_list]
    groupxs_list = [tup[1] for tup in grouptup_list]
    # Aggregate vecs that belong to the same aid, for each word
    # (weighted aggregation with multi-assign-weights)
    aggvecs_list = [
        np.vstack([aggregate_rvecs(rvecs.take(xs, axis=0), maws.take(xs)) for xs in groupxs])
        if len(groupxs) > 0 else
        np.empty((0, hstypes.VEC_DIM), dtype=hstypes.FLOAT_TYPE)
        for rvecs, maws, groupxs in zip(rvecs_list, maws_list, groupxs_list)]
    # Agg idxs
    aggidxs_list = [[idxs.take(xs) for xs in groupxs]
                    for idxs, groupxs in zip(idxs_list, groupxs_list)]
    aggmaws_list = [np.array([maws.take(xs).prod() for xs in groupxs])
                    for maws, groupxs in zip(maws_list, groupxs_list)]
    # Need to recompute flags for consistency
    # flag is true when aggvec is all zeros
    aggflags_list = [~np.any(aggvecs, axis=1) for aggvecs in aggvecs_list]
    return aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list
Example 4
def group_correspondences(all_matches, all_scores, all_daids, daid2_sccw):
    daid_keys, groupxs = clustertool.group_indices(all_daids)
    fs_list = clustertool.apply_grouping(all_scores, groupxs)
    fm_list = clustertool.apply_grouping(all_matches, groupxs)
    daid2_fm = {daid: fm for daid, fm in zip(daid_keys, fm_list)}
    daid2_fs = {
        daid: fs * daid2_sccw[daid]
        for daid, fs in zip(daid_keys, fs_list)
    }
    # FIXME: generalize to when nAssign > 1
    daid2_fk = {
        daid: np.ones(fs.size, dtype=hstypes.FK_DTYPE)
        for daid, fs in zip(daid_keys, fs_list)
    }
    daid2_chipmatch = (daid2_fm, daid2_fs, daid2_fk)
    return daid2_chipmatch
Example 5
def compute_idf_label1(aids_list, daid2_label):
    """
    One of our idf extensions

    Example:
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _
        >>> idf_list = compute_idf_label1(aids_list, daid2_label)
    """
    nWords = len(aids_list)
    # Computes our novel label idf weight
    lblindex_list = np.array(ut.tuples_to_unique_scalars(daid2_label.values()))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]
    daid2_wxs = ut.ddict(list)
    for wx, daids in enumerate(aids_list):
        for daid in daids:
            daid2_wxs[daid].append(wx)
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    nLabels = len(unique_lblindexes)
    pcntLblsWithWord = np.zeros(nWords, np.float64)
    # Get num times word appears for each label
    for lblindex, daids in lblindex2_daids:
        nWordsWithLabel = np.zeros(nWords)
        for daid in daids:
            wxs = daid2_wxs[daid]
            nWordsWithLabel[wxs] += 1
        pcntLblsWithWord += (1 - nWordsWithLabel.astype(np.float64) / len(daids))

    # Compute the idf-like weight for each word
    idf_list = np.log(np.divide(nLabels, np.add(pcntLblsWithWord, 1),
                                dtype=hstypes.FLOAT_TYPE),
                      dtype=hstypes.FLOAT_TYPE)
    return idf_list
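
A toy evaluation of the final weighting formula with made-up numbers, just to make the shape of the computation concrete (values are not drawn from any real database):

import numpy as np

nLabels = 10
pcntLblsWithWord = np.array([0.5, 4.0, 9.0])   # made-up accumulated statistic
idf_list = np.log(nLabels / (pcntLblsWithWord + 1.0))
print(idf_list)                                # one weight per word
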
Example 6
def compute_idf_label1(aids_list, daid2_label):
    """
    One of our idf extensions

    Example:
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _
        >>> idf_list = compute_idf_label1(aids_list, daid2_label)
    """
    nWords = len(aids_list)
    # Computes our novel label idf weight
    lblindex_list = np.array(ut.tuples_to_unique_scalars(daid2_label.values()))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]
    daid2_wxs = ut.ddict(list)
    for wx, daids in enumerate(aids_list):
        for daid in daids:
            daid2_wxs[daid].append(wx)
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    nLabels = len(unique_lblindexes)
    pcntLblsWithWord = np.zeros(nWords, np.float64)
    # Get num times word appears for each label
    for lblindex, daids in lblindex2_daids:
        nWordsWithLabel = np.zeros(nWords)
        for daid in daids:
            wxs = daid2_wxs[daid]
            nWordsWithLabel[wxs] += 1
        pcntLblsWithWord += (1 - nWordsWithLabel.astype(np.float64) / len(daids))

    # Compute the idf-like weight for each word
    idf_list = np.log(np.divide(nLabels, np.add(pcntLblsWithWord, 1),
                                dtype=hstypes.FLOAT_TYPE),
                      dtype=hstypes.FLOAT_TYPE)
    return idf_list
Example 7
def build_daid2_chipmatch2(invindex, common_wxs, wx2_qaids, wx2_qfxs,
                           scores_list, daids_list, query_sccw):
    """
    Builds explicit chipmatches that play nicely with the rest of the pipeline

    Notation:
        An explicit cmtup_old is a tuple (fm, fs, fk) of feature_matches,
        feature_scores, and feature_ranks.

        Let N be the number of matches

        A feature match, fm{shape=(N, 2), dtype=int32}, is an array where the first
        column corresponds to query_feature_indexes (qfx) and the second column
        corresponds to database_feature_indexes (dfx).

        A feature score, fs{shape=(N,), dtype=float64} is an array of scores

        A feature rank, fk{shape=(N,), dtype=int16} is an array of ranks

    Returns:
        daid2_chipmatch (tuple) : (daid2_fm, daid2_fs, daid2_fk)
        Return Format::
            daid2_fm (dict): {daid: fm, ...}
            daid2_fs (dict): {daid: fs, ...}
            daid2_fk (dict): {daid: fk, ...}

    Example:
        >>> from ibeis.algo.hots.smk.smk_core import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, invindex, qindex, qparams = smk_debug.testdata_match_kernel_L2()
        >>> wx2_qrvecs, wx2_qmaws, wx2_qaids, wx2_qfxs, query_sccw = qindex
        >>> smk_alpha = ibs.cfg.query_cfg.smk_cfg.smk_alpha
        >>> smk_thresh = ibs.cfg.query_cfg.smk_cfg.smk_thresh
        >>> withinfo = True  # takes 11s vs 2s
        >>> args = (wx2_qrvecs, wx2_qmaws, wx2_qaids, wx2_qfxs, query_sccw, invindex, withinfo, smk_alpha, smk_thresh)
        >>> retL1 =  match_kernel_L1(*args)
        >>> (daid2_totalscore, common_wxs, scores_list, daids_list, idf_list, daid_agg_keys,)  = retL1
        >>> daid2_chipmatch_old = build_daid2_chipmatch2(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
        >>> daid2_chipmatch_new = build_daid2_chipmatch3(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
        >>> print(utool.is_dicteq(daid2_chipmatch_old[0], daid2_chipmatch_new[0]))
        >>> print(utool.is_dicteq(daid2_chipmatch_old[2], daid2_chipmatch_new[2]))
        >>> print(utool.is_dicteq(daid2_chipmatch_old[1],  daid2_chipmatch_new[1]))

    %timeit build_daid2_chipmatch2(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
    %timeit build_daid2_chipmatch3(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
    """
    # FIXME: move groupby to vtool
    if utool.VERBOSE:
        print('[smk_core] build cmtup_old')

    wx2_dfxs = invindex.wx2_fxs
    daid2_sccw = invindex.daid2_sccw

    qfxs_list = [wx2_qfxs[wx] for wx in common_wxs]
    dfxs_list = [wx2_dfxs[wx] for wx in common_wxs]

    shapes_list = [scores.shape for scores in scores_list]  # 51us
    shape_ranges = [(mem_arange(w), mem_arange(h))
                    for (w, h) in shapes_list]  # 230us
    ijs_list = [
        mem_meshgrid(wrange, hrange) for (wrange, hrange) in shape_ranges
    ]  # 278us
    # Normalize scores for words, nMatches, and query sccw (still need daid sccw)
    nscores_iter = (scores * query_sccw for scores in scores_list)

    # FIXME: Preflatten all of these lists
    out_ijs = [list(zip(_is.flat, _js.flat)) for (_is, _js) in ijs_list]
    out_qfxs = [[qfxs[ix] for (ix, jx) in ijs]
                for (qfxs, ijs) in zip(qfxs_list, out_ijs)]
    out_dfxs = [[dfxs[jx] for (ix, jx) in ijs]
                for (dfxs, ijs) in zip(dfxs_list, out_ijs)]
    out_daids = ([daids[jx] for (ix, jx) in ijs]
                 for (daids, ijs) in zip(daids_list, out_ijs))
    out_scores = ([nscores[ijx] for ijx in ijs]
                  for (nscores, ijs) in zip(nscores_iter, out_ijs))
    nested_fm_iter = [[
        tuple(product(qfxs_, dfxs_)) for qfxs_, dfxs_ in zip(qfxs, dfxs)
    ] for qfxs, dfxs in zip(out_qfxs, out_dfxs)]
    all_fms = np.array(list(utool.iflatten(utool.iflatten(nested_fm_iter))),
                       dtype=hstypes.FM_DTYPE)
    nested_nmatch_list = [[len(fm) for fm in fms] for fms in nested_fm_iter]
    nested_daid_iter = ([
        [daid] * nMatch for nMatch, daid in zip(nMatch_list, daids)
    ] for nMatch_list, daids in zip(nested_nmatch_list, out_daids))
    nested_score_iter = ([
        [score / nMatch] * nMatch
        for nMatch, score in zip(nMatch_list, scores)
    ] for nMatch_list, scores in zip(nested_nmatch_list, out_scores))
    all_daids_ = np.array(list(utool.iflatten(
        utool.iflatten(nested_daid_iter))),
                          dtype=hstypes.INDEX_TYPE)
    all_fss = np.array(list(utool.iflatten(utool.iflatten(nested_score_iter))),
                       dtype=hstypes.FS_DTYPE)

    # Filter out 0 scores
    keep_xs = np.where(all_fss > 0)[0]
    all_fss = all_fss.take(keep_xs)
    all_fms = all_fms.take(keep_xs, axis=0)
    all_daids_ = all_daids_.take(keep_xs)

    daid_keys, groupxs = clustertool.group_indices(all_daids_)
    fs_list = clustertool.apply_grouping(all_fss, groupxs)
    fm_list = clustertool.apply_grouping(all_fms, groupxs)
    daid2_fm = {daid: fm for daid, fm in zip(daid_keys, fm_list)}
    daid2_fs = {
        daid: fs * daid2_sccw[daid]
        for daid, fs in zip(daid_keys, fs_list)
    }
    # FIXME: generalize to when nAssign > 1
    daid2_fk = {
        daid: np.ones(fs.size, dtype=hstypes.FK_DTYPE)
        for daid, fs in zip(daid_keys, fs_list)
    }
    daid2_chipmatch = (daid2_fm, daid2_fs, daid2_fk)

    return daid2_chipmatch
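
To see what the nested flattening above produces, here is a hedged sketch for a single word with made-up data: each (i, j) cell of the word's score matrix pairs the i-th query feature group with the j-th database feature group, the cell's score is split evenly over the generated fm rows, and zero-score rows are dropped.

import numpy as np
from itertools import product

qfxs   = [[0, 1], [5]]            # query feature indexes per query rvec
dfxs   = [[10], [20, 21]]         # database feature indexes per database rvec
scores = np.array([[0.4, 0.0],
                   [0.0, 0.6]])   # per-word score matrix (query x database)

fm, fs = [], []
for i, j in product(range(len(qfxs)), range(len(dfxs))):
    pairs = list(product(qfxs[i], dfxs[j]))
    fm.extend(pairs)
    fs.extend([scores[i, j] / len(pairs)] * len(pairs))

fm = np.array(fm)
fs = np.array(fs)
keep = fs > 0                     # filter out 0 scores, as in the function
print(fm[keep])                   # (qfx, dfx) rows
print(fs[keep])                   # evenly split scores
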
Example 8
def build_daid2_chipmatch2(invindex, common_wxs, wx2_qaids, wx2_qfxs,
                           scores_list, daids_list, query_sccw):
    """
    Builds explicit chipmatches that play nicely with the rest of the pipeline

    Notation:
        An explicit cmtup_old is a tuple (fm, fs, fk) of feature_matches,
        feature_scores, and feature_ranks.

        Let N be the number of matches

        A feature match, fm{shape=(N, 2), dtype=int32}, is an array where the first
        column corresponds to query_feature_indexes (qfx) and the second column
        corresponds to database_feature_indexes (dfx).

        A feature score, fs{shape=(N,), dtype=float64} is an array of scores

        A feature rank, fk{shape=(N,), dtype=int16} is an array of ranks

    Returns:
        daid2_chipmatch (tuple) : (daid2_fm, daid2_fs, daid2_fk)
        Return Format::
            daid2_fm (dict): {daid: fm, ...}
            daid2_fs (dict): {daid: fs, ...}
            daid2_fk (dict): {daid: fk, ...}

    Example:
        >>> from ibeis.algo.hots.smk.smk_core import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, invindex, qindex, qparams = smk_debug.testdata_match_kernel_L2()
        >>> wx2_qrvecs, wx2_qmaws, wx2_qaids, wx2_qfxs, query_sccw = qindex
        >>> smk_alpha = ibs.cfg.query_cfg.smk_cfg.smk_alpha
        >>> smk_thresh = ibs.cfg.query_cfg.smk_cfg.smk_thresh
        >>> withinfo = True  # takes 11s vs 2s
        >>> args = (wx2_qrvecs, wx2_qmaws, wx2_qaids, wx2_qfxs, query_sccw, invindex, withinfo, smk_alpha, smk_thresh)
        >>> retL1 =  match_kernel_L1(*args)
        >>> (daid2_totalscore, common_wxs, scores_list, daids_list, idf_list, daid_agg_keys,)  = retL1
        >>> daid2_chipmatch_old = build_daid2_chipmatch2(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
        >>> daid2_chipmatch_new = build_daid2_chipmatch3(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
        >>> print(utool.is_dicteq(daid2_chipmatch_old[0], daid2_chipmatch_new[0]))
        >>> print(utool.is_dicteq(daid2_chipmatch_old[2], daid2_chipmatch_new[2]))
        >>> print(utool.is_dicteq(daid2_chipmatch_old[1],  daid2_chipmatch_new[1]))

    %timeit build_daid2_chipmatch2(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
    %timeit build_daid2_chipmatch3(invindex, common_wxs, wx2_qaids, wx2_qfxs, scores_list, daids_list, query_sccw)
    """
    # FIXME: move groupby to vtool
    if utool.VERBOSE:
        print('[smk_core] build cmtup_old')

    wx2_dfxs  = invindex.wx2_fxs
    daid2_sccw = invindex.daid2_sccw

    qfxs_list = [wx2_qfxs[wx] for wx in common_wxs]
    dfxs_list = [wx2_dfxs[wx] for wx in common_wxs]

    shapes_list  = [scores.shape for scores in scores_list]  # 51us
    shape_ranges = [(mem_arange(w), mem_arange(h)) for (w, h) in shapes_list]  # 230us
    ijs_list = [mem_meshgrid(wrange, hrange) for (wrange, hrange) in shape_ranges]  # 278us
    # Normalize scores for words, nMatches, and query sccw (still need daid sccw)
    nscores_iter = (scores * query_sccw for scores in scores_list)

    # FIXME: Preflatten all of these lists
    out_ijs = [
        list(zip(_is.flat, _js.flat))
        for (_is, _js) in ijs_list
    ]
    out_qfxs = [
        [qfxs[ix] for (ix, jx) in ijs]
        for (qfxs, ijs) in zip(qfxs_list, out_ijs)
    ]
    out_dfxs = [
        [dfxs[jx] for (ix, jx) in ijs]
        for (dfxs, ijs) in zip(dfxs_list, out_ijs)
    ]
    out_daids = (
        [daids[jx] for (ix, jx) in ijs]
        for (daids, ijs) in zip(daids_list, out_ijs)
    )
    out_scores = (
        [nscores[ijx] for ijx in ijs]
        for (nscores, ijs) in zip(nscores_iter, out_ijs)
    )
    nested_fm_iter = [
        [
            tuple(product(qfxs_, dfxs_))
            for qfxs_, dfxs_ in zip(qfxs, dfxs)
        ]
        for qfxs, dfxs in zip(out_qfxs, out_dfxs)
    ]
    all_fms = np.array(list(utool.iflatten(utool.iflatten(nested_fm_iter))), dtype=hstypes.FM_DTYPE)
    nested_nmatch_list = [[len(fm) for fm in fms] for fms in nested_fm_iter]
    nested_daid_iter = (
        [
            [daid] * nMatch
            for nMatch, daid in zip(nMatch_list, daids)
        ]
        for nMatch_list, daids in zip(nested_nmatch_list, out_daids)
    )
    nested_score_iter = (
        [
            [score / nMatch] * nMatch
            for nMatch, score in zip(nMatch_list, scores)
        ]
        for nMatch_list, scores in zip(nested_nmatch_list, out_scores)
    )
    all_daids_ = np.array(list(utool.iflatten(utool.iflatten(nested_daid_iter))), dtype=hstypes.INDEX_TYPE)
    all_fss = np.array(list(utool.iflatten(utool.iflatten(nested_score_iter))), dtype=hstypes.FS_DTYPE)

    # Filter out 0 scores
    keep_xs = np.where(all_fss > 0)[0]
    all_fss = all_fss.take(keep_xs)
    all_fms = all_fms.take(keep_xs, axis=0)
    all_daids_ = all_daids_.take(keep_xs)

    daid_keys, groupxs = clustertool.group_indices(all_daids_)
    fs_list = clustertool.apply_grouping(all_fss, groupxs)
    fm_list = clustertool.apply_grouping(all_fms, groupxs)
    daid2_fm = {daid: fm for daid, fm in zip(daid_keys, fm_list)}
    daid2_fs = {daid: fs * daid2_sccw[daid] for daid, fs in zip(daid_keys, fs_list)}
    # FIXME: generalize to when nAssign > 1
    daid2_fk = {daid: np.ones(fs.size, dtype=hstypes.FK_DTYPE) for daid, fs in zip(daid_keys, fs_list)}
    daid2_chipmatch = (daid2_fm, daid2_fs, daid2_fk)

    return daid2_chipmatch
Example 9
def compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf,
                       wx2_dmaws, smk_alpha, smk_thresh, verbose=False):
    """
    Computes sccw normalization scalar for the database annotations.
    This is gamma from the SMK paper.
    sccw is a self consistency criterion weight --- a scalar which ensures
    that the score of K(X, X) = 1

    Args:
        idx2_daid ():
        wx2_drvecs ():
        wx2_dflags ():
        wx2_aids ():
        wx2_idf ():
        wx2_dmaws ():
        smk_alpha ():
        smk_thresh ():
        verbose (bool):

    Returns:
        daid2_sccw

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_index
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> #tup = smk_debug.testdata_compute_data_sccw(db='testdb1')
        >>> tup = smk_debug.testdata_compute_data_sccw(db='PZ_MTEST')
        >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_drvecs, wx2_aids, qparams = tup
        >>> wx2_dflags = invindex.wx2_dflags
        >>> wx2_idxs = invindex.wx2_idxs
        >>> wx2_dmaws  = invindex.wx2_dmaws
        >>> idx2_daid  = invindex.idx2_daid
        >>> daids      = invindex.daids
        >>> smk_alpha  = qparams.smk_alpha
        >>> smk_thresh = qparams.smk_thresh
        >>> wx2_idf    = wx2_idf
        >>> verbose = True
        >>> invindex.invindex_dbgstr()
        >>> invindex.report_memory()
        >>> invindex.report_memsize()
        >>> daid2_sccw = smk_index.compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf, wx2_dmaws, smk_alpha, smk_thresh, verbose)
    """

    #for wx in wx_sublist:
    #    print(len(wx2_dmaws

    verbose_ = ut.VERBOSE or verbose

    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)
    if not ut.QUIET:
        print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
    if verbose_:
        print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' % (smk_alpha, smk_thresh))
        mark1, end1_ = ut.log_progress(
            '[smk_index.sccw] SCCW group (by present words): ', len(wx2_drvecs),
            freq=100, with_time=WITH_TOTALTIME)

    # Group by daids first and then by word index
    # Get list of aids and rvecs w.r.t. words (i.e. one item per word)
    wx_sublist = np.array(list(wx2_drvecs.keys()))
    aids_perword  = [wx2_aids[wx] for wx in wx_sublist]

    # wx_list1: Lays out word indexes for each annotation
    # tx_list1: Temporary within-annotation subindex; together with wx it uniquely
    # identifies an item in wx2_drvecs, wx2_dflags, and wx2_dmaws

    # Flatten out indexes to perform grouping
    flat_aids = np.hstack(aids_perword)
    count = len(flat_aids)
    txs_perword = [np.arange(aids.size) for aids in aids_perword]
    flat_txs  = np.hstack(txs_perword)
    # fromiter is faster for flat_wxs because it is not a list of numpy arrays
    wxs_perword = ([wx] * len(aids) for wx, aids in zip(wx_sublist, aids_perword))
    flat_wxs  = np.fromiter(ut.iflatten(wxs_perword), hstypes.INDEX_TYPE, count)

    # Group flat indexes by annotation id
    unique_aids, annot_groupxs = clustertool.group_indices(flat_aids)

    # Wxs and Txs grouped by annotation id
    wxs_perannot = clustertool.apply_grouping_iter(flat_wxs, annot_groupxs)
    txs_perannot = clustertool.apply_grouping_iter(flat_txs, annot_groupxs)

    # Group by word inside each annotation group
    wxsubgrouping_perannot = [clustertool.group_indices(wxs)
                              for wxs in wxs_perannot]
    word_groupxs_perannot = (groupxs for wxs, groupxs in wxsubgrouping_perannot)
    txs_perword_perannot = [clustertool.apply_grouping(txs, groupxs)
                            for txs, groupxs in
                            zip(txs_perannot, word_groupxs_perannot)]
    wxs_perword_perannot = [wxs for wxs, groupxs in wxsubgrouping_perannot]

    # Group relevant data for the sccw measure by word for each annotation grouping

    def _vector_subgroup_by_wx(wx2_arr, wxs_perword_perannot, txs_perword_perannot):
        return [[wx2_arr[wx].take(txs, axis=0)
                 for wx, txs in zip(wx_perword_, txs_perword_)]
                for wx_perword_, txs_perword_ in
                zip(wxs_perword_perannot, txs_perword_perannot)]

    def _scalar_subgroup_by_wx(wx2_scalar, wxs_perword_perannot):
        return [[wx2_scalar[wx] for wx in wxs] for wxs in wxs_perword_perannot]

    subgrouped_drvecs = _vector_subgroup_by_wx(wx2_drvecs, wxs_perword_perannot, txs_perword_perannot)
    subgrouped_dmaws  = _vector_subgroup_by_wx(wx2_dmaws,  wxs_perword_perannot, txs_perword_perannot)
    # If we aren't using dmaws replace it with an infinite None iterator
    #subgrouped_dmaws  = iter(lambda: None, 1)
    subgrouped_dflags = _vector_subgroup_by_wx(wx2_dflags, wxs_perword_perannot, txs_perword_perannot)
    #subgrouped_dflags  = iter(lambda: None, 1)
    subgrouped_idfs   = _scalar_subgroup_by_wx(wx2_idf, wxs_perword_perannot)

    if verbose_:
        end1_()
        mark2, end2_ = ut.log_progress(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
                                        total=len(unique_aids), freq=100, with_time=WITH_TOTALTIME)
        progiter = ut.ProgressIter(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
                                   total=len(unique_aids), freq=10, with_time=WITH_TOTALTIME)
    else:
        progiter = ut.identity

    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_data_smksumm(subgrouped_idfs, subgrouped_drvecs)

    sccw_list = [
        smk_scoring.sccw_summation(rvecs_list, flags_list, idf_list, maws_list, smk_alpha, smk_thresh)
        for rvecs_list, flags_list, maws_list, idf_list in
        progiter(zip(subgrouped_drvecs, subgrouped_dflags, subgrouped_dmaws, subgrouped_idfs))
    ]
    daid2_sccw = dict(zip(unique_aids, sccw_list))

    if verbose_:
        end2_()
        print('[smk_index.sccw] L___ End Compute Data SCCW\n')

    return daid2_sccw
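
Conceptually the normalizer behaves like the sketch below (an assumption based on the docstring, not the exact smk_scoring.sccw_summation code): if selfscore is the un-normalized kernel value accumulated over an annotation's own words, sccw is chosen so that sccw**2 * selfscore == 1.

import numpy as np

def sccw_from_selfscore(selfscore):
    # gamma(X) = 1 / sqrt(K_unnormalized(X, X)); guard against empty annotations
    return 1.0 / np.sqrt(selfscore) if selfscore > 0 else 1.0

print(sccw_from_selfscore(4.0))   # 0.5
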
Example 10
def compute_negentropy_names(aids_list, daid2_label):
    r"""
    One of our idf extensions
    Word weighting based on the negative entropy over all names of p(n_i | word)

    Args:
        aids_list (list of aids):
        daid2_label (dict from daid to label):

    Returns:
        negentropy_list (ndarray[float32]): idf-like weighting for each word based on the negative entropy

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _

    Math::
        p(n_i | \word) = \sum_{\lbl \in L_i} p(\lbl | \word)

        p(\lbl | \word) = \frac{p(\word | \lbl) p(\lbl)}{p(\word)}

        p(\word) = \sum_{\lbl' \in L} p(\word | \lbl') p(\lbl')

        p(\word | \lbl) = NumAnnotOfLabelWithWord / NumAnnotWithLabel =
        \frac{\sum_{\X \in \DB_\lbl} b(\word, \X)}{\card{\DB_\lbl}}

        h(n_i | word) = -\sum_{i=1}^N p(n_i | \word) \log p(n_i | \word)

        word_weight = log(N) - h(n | word)

    CommandLine:
        python dev.py -t smk2 --allgt --db GZ_ALL
        python dev.py -t smk5 --allgt --db GZ_ALL

    Auto:
        python -c "import utool as ut; ut.print_auto_docstr('ibeis.algo.hots.smk.smk_index', 'compute_negentropy_names')"
    """
    nWords = len(aids_list)
    # --- LABEL MEMBERS w.r.t daids ---
    # compute mapping from label to daids
    # Translate tuples into scalars for efficiency
    label_list = list(daid2_label.values())
    lblindex_list = np.array(ut.tuples_to_unique_scalars(label_list))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]

    # --- DAID MEMBERS w.r.t. words ---
    # compute mapping from daid to word indexes
    # finds all the words that belong to an annotation
    daid2_wxs = ut.ddict(list)
    for wx, _daids in enumerate(aids_list):
        for daid in _daids:
            daid2_wxs[daid].append(wx)

    # --- \Pr(\word \given \lbl) for each label ---
    # Compute the number of annotations in a label with the word vs
    # the number of annotations in the label
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    # Get num times word appears for each label
    probWordGivenLabel_list = []
    for lblindex, _daids in lblindex2_daids:
        nAnnotOfLabelWithWord = np.zeros(nWords, dtype=np.int32)
        for daid in _daids:
            wxs = np.unique(daid2_wxs[daid])
            nAnnotOfLabelWithWord[wxs] += 1
        probWordGivenLabel = nAnnotOfLabelWithWord.astype(np.float64) / len(_daids)
        probWordGivenLabel_list.append(probWordGivenLabel)
    # (nLabels, nWords)
    probWordGivenLabel_arr = np.array(probWordGivenLabel_list)
    # --- \Pr(\lbl \given \word) ---
    # compute partition function that approximates probability of a word
    # (1, nWords)
    probWord = probWordGivenLabel_arr.sum(axis=0)
    probWord.shape = (1, probWord.size)
    # (nLabels, nWords)
    probLabelGivenWord_arr = (probWordGivenLabel_arr / probWord)
    # --- \Pr(\name \given \lbl) ---
    # get names for each unique label
    nid_list = np.array([label_list[xs[0]][0] for xs in groupxs])
    unique_nids, groupxs_ = clustertool.group_indices(nid_list)
    # (nNames, nWords)
    # add a little wiggle room
    eps = 1E-9
    # http://stackoverflow.com/questions/872544/precision-of-floating-point
    #epsilon = 2^(E-52)    # For a 64-bit float (double precision)
    #epsilon = 2^(E-23)    # For a 32-bit float (single precision)
    #epsilon = 2^(E-10)    # For a 16-bit float (half precision)
    probNameGivenWord = eps + (1.0 - eps) * np.array([probLabelGivenWord_arr.take(xs, axis=0).sum(axis=0) for xs in groupxs_])
    logProbNameGivenWord = np.log(probNameGivenWord)
    wordNameEntropy = -(probNameGivenWord * logProbNameGivenWord).sum(0)
    # Compute negative entropy for weights
    nNames = len(nid_list)
    negentropy_list = np.log(nNames) - wordNameEntropy
    return negentropy_list
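
A small numeric check of the weight formula from the Math block, using a made-up p(name | word) table: a word owned by a single name gets a weight near log(N), while a word spread uniformly over all names gets a weight near zero.

import numpy as np

# rows are names, columns are words (made-up probabilities)
probNameGivenWord = np.array([[1.0, 0.5],
                              [0.0, 0.5]])
eps = 1e-9
p = eps + (1.0 - eps) * probNameGivenWord
wordNameEntropy = -(p * np.log(p)).sum(axis=0)
negentropy_list = np.log(p.shape[0]) - wordNameEntropy
print(negentropy_list)            # approx [log(2), 0.0]
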
Example 11
def compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf,
                       wx2_dmaws, smk_alpha, smk_thresh, verbose=False):
    """
    Computes sccw normalization scalar for the database annotations.
    This is gamma from the SMK paper.
    sccw is a self consistency criterion weight --- a scalar which ensures
    that the score of K(X, X) = 1

    Args:
        idx2_daid ():
        wx2_drvecs ():
        wx2_dflags ():
        wx2_aids ():
        wx2_idf ():
        wx2_dmaws ():
        smk_alpha ():
        smk_thresh ():
        verbose (bool):

    Returns:
        daid2_sccw

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_index
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> #tup = smk_debug.testdata_compute_data_sccw(db='testdb1')
        >>> tup = smk_debug.testdata_compute_data_sccw(db='PZ_MTEST')
        >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_drvecs, wx2_aids, qparams = tup
        >>> wx2_dflags = invindex.wx2_dflags
        >>> wx2_idxs = invindex.wx2_idxs
        >>> wx2_dmaws  = invindex.wx2_dmaws
        >>> idx2_daid  = invindex.idx2_daid
        >>> daids      = invindex.daids
        >>> smk_alpha  = qparams.smk_alpha
        >>> smk_thresh = qparams.smk_thresh
        >>> wx2_idf    = wx2_idf
        >>> verbose = True
        >>> invindex.invindex_dbgstr()
        >>> invindex.report_memory()
        >>> invindex.report_memsize()
        >>> daid2_sccw = smk_index.compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf, wx2_dmaws, smk_alpha, smk_thresh, verbose)
    """

    #for wx in wx_sublist:
    #    print(len(wx2_dmaws

    verbose_ = ut.VERBOSE or verbose

    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)
    if not ut.QUIET:
        print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
    if verbose_:
        print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' % (smk_alpha, smk_thresh))

    # Group by daids first and then by word index
    # Get list of aids and rvecs w.r.t. words (i.e. one item per word)
    wx_sublist = np.array(list(wx2_drvecs.keys()))
    aids_perword  = [wx2_aids[wx] for wx in wx_sublist]

    # wx_list1: Lays out word indexes for each annotation
    # tx_list1: Temporary within-annotation subindex; together with wx it uniquely
    # identifies an item in wx2_drvecs, wx2_dflags, and wx2_dmaws

    # Flatten out indexes to perform grouping
    flat_aids = np.hstack(aids_perword)
    count = len(flat_aids)
    txs_perword = [np.arange(aids.size) for aids in aids_perword]
    flat_txs  = np.hstack(txs_perword)
    # fromiter is faster for flat_wxs because it is not a list of numpy arrays
    wxs_perword = ([wx] * len(aids) for wx, aids in zip(wx_sublist, aids_perword))
    flat_wxs  = np.fromiter(ut.iflatten(wxs_perword), hstypes.INDEX_TYPE, count)

    # Group flat indexes by annotation id
    unique_aids, annot_groupxs = clustertool.group_indices(flat_aids)

    # Wxs and Txs grouped by annotation id
    wxs_perannot = clustertool.apply_grouping_iter(flat_wxs, annot_groupxs)
    txs_perannot = clustertool.apply_grouping_iter(flat_txs, annot_groupxs)

    # Group by word inside each annotation group
    wxsubgrouping_perannot = [clustertool.group_indices(wxs)
                              for wxs in wxs_perannot]
    word_groupxs_perannot = (groupxs for wxs, groupxs in wxsubgrouping_perannot)
    txs_perword_perannot = [clustertool.apply_grouping(txs, groupxs)
                            for txs, groupxs in
                            zip(txs_perannot, word_groupxs_perannot)]
    wxs_perword_perannot = [wxs for wxs, groupxs in wxsubgrouping_perannot]

    # Group relevant data for the sccw measure by word for each annotation grouping

    def _vector_subgroup_by_wx(wx2_arr, wxs_perword_perannot, txs_perword_perannot):
        return [[wx2_arr[wx].take(txs, axis=0)
                 for wx, txs in zip(wx_perword_, txs_perword_)]
                for wx_perword_, txs_perword_ in
                zip(wxs_perword_perannot, txs_perword_perannot)]

    def _scalar_subgroup_by_wx(wx2_scalar, wxs_perword_perannot):
        return [[wx2_scalar[wx] for wx in wxs] for wxs in wxs_perword_perannot]

    subgrouped_drvecs = _vector_subgroup_by_wx(wx2_drvecs, wxs_perword_perannot, txs_perword_perannot)
    subgrouped_dmaws  = _vector_subgroup_by_wx(wx2_dmaws,  wxs_perword_perannot, txs_perword_perannot)
    # If we aren't using dmaws replace it with an infinite None iterator
    #subgrouped_dmaws  = iter(lambda: None, 1)
    subgrouped_dflags = _vector_subgroup_by_wx(wx2_dflags, wxs_perword_perannot, txs_perword_perannot)
    #subgrouped_dflags  = iter(lambda: None, 1)
    subgrouped_idfs   = _scalar_subgroup_by_wx(wx2_idf, wxs_perword_perannot)

    if verbose_:
        progiter = ut.ProgressIter(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
                                   total=len(unique_aids), freq=10, with_time=WITH_TOTALTIME)
    else:
        progiter = ut.identity

    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_data_smksumm(subgrouped_idfs, subgrouped_drvecs)

    sccw_list = [
        smk_scoring.sccw_summation(rvecs_list, flags_list, idf_list, maws_list, smk_alpha, smk_thresh)
        for rvecs_list, flags_list, maws_list, idf_list in
        progiter(zip(subgrouped_drvecs, subgrouped_dflags, subgrouped_dmaws, subgrouped_idfs))
    ]
    daid2_sccw = dict(zip(unique_aids, sccw_list))

    if verbose_:
        print('[smk_index.sccw] L___ End Compute Data SCCW\n')

    return daid2_sccw
Example 12
def compute_negentropy_names(aids_list, daid2_label):
    r"""
    One of our idf extensions
    Word weighting based on the negative entropy over all names of p(n_i | word)

    Args:
        aids_list (list of aids):
        daid2_label (dict from daid to label):

    Returns:
        negentropy_list (ndarray[float32]): idf-like weighting for each word based on the negative entropy

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _

    Math::
        p(n_i | \word) = \sum_{\lbl \in L_i} p(\lbl | \word)

        p(\lbl | \word) = \frac{p(\word | \lbl) p(\lbl)}{p(\word)}

        p(\word) = \sum_{\lbl' \in L} p(\word | \lbl') p(\lbl')

        p(\word | \lbl) = NumAnnotOfLabelWithWord / NumAnnotWithLabel =
        \frac{\sum_{\X \in \DB_\lbl} b(\word, \X)}{\card{\DB_\lbl}}

        h(n_i | word) = -\sum_{i=1}^N p(n_i | \word) \log p(n_i | \word)

        word_weight = log(N) - h(n | word)

    CommandLine:
        python dev.py -t smk2 --allgt --db GZ_ALL
        python dev.py -t smk5 --allgt --db GZ_ALL

    Auto:
        python -c "import utool as ut; ut.print_auto_docstr('ibeis.algo.hots.smk.smk_index', 'compute_negentropy_names')"
    """
    nWords = len(aids_list)
    # --- LABEL MEMBERS w.r.t daids ---
    # compute mapping from label to daids
    # Translate tuples into scalars for efficiency
    label_list = list(daid2_label.values())
    lblindex_list = np.array(ut.tuples_to_unique_scalars(label_list))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]

    # --- DAID MEMBERS w.r.t. words ---
    # compute mapping from daid to word indexes
    # finds all the words that belong to an annotation
    daid2_wxs = ut.ddict(list)
    for wx, _daids in enumerate(aids_list):
        for daid in _daids:
            daid2_wxs[daid].append(wx)

    # --- \Pr(\word \given \lbl) for each label ---
    # Compute the number of annotations in a label with the word vs
    # the number of annotations in the label
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    # Get num times word appears for each label
    probWordGivenLabel_list = []
    for lblindex, _daids in lblindex2_daids:
        nAnnotOfLabelWithWord = np.zeros(nWords, dtype=np.int32)
        for daid in _daids:
            wxs = np.unique(daid2_wxs[daid])
            nAnnotOfLabelWithWord[wxs] += 1
        probWordGivenLabel = nAnnotOfLabelWithWord.astype(np.float64) / len(_daids)
        probWordGivenLabel_list.append(probWordGivenLabel)
    # (nLabels, nWords)
    probWordGivenLabel_arr = np.array(probWordGivenLabel_list)
    # --- \Pr(\lbl \given \word) ---
    # compute partition function that approximates probability of a word
    # (1, nWords)
    probWord = probWordGivenLabel_arr.sum(axis=0)
    probWord.shape = (1, probWord.size)
    # (nLabels, nWords)
    probLabelGivenWord_arr = (probWordGivenLabel_arr / probWord)
    # --- \Pr(\name \given \lbl) ---
    # get names for each unique label
    nid_list = np.array([label_list[xs[0]][0] for xs in groupxs])
    unique_nids, groupxs_ = clustertool.group_indices(nid_list)
    # (nNames, nWords)
    # add a little wiggle room
    eps = 1E-9
    # http://stackoverflow.com/questions/872544/precision-of-floating-point
    #epsilon = 2^(E-52)    # For a 64-bit float (double precision)
    #epsilon = 2^(E-23)    # For a 32-bit float (single precision)
    #epsilon = 2^(E-10)    # For a 16-bit float (half precision)
    probNameGivenWord = eps + (1.0 - eps) * np.array([probLabelGivenWord_arr.take(xs, axis=0).sum(axis=0) for xs in groupxs_])
    logProbNameGivenWord = np.log(probNameGivenWord)
    wordNameEntropy = -(probNameGivenWord * logProbNameGivenWord).sum(0)
    # Compute negative entropy for weights
    nNames = len(nid_list)
    negentropy_list = np.log(nNames) - wordNameEntropy
    return negentropy_list