def assign_to_words_(wordflann, words, idx2_vec, idx_name='idx', dense=True,
                     nAssign=1, with_pandas=WITH_PANDAS):
    """
    Assigns descriptor-vectors to the nearest word(s).
    Returns forward and inverted index.

    Args:
        wordflann: FLANN index built over the word (vocabulary) vectors
        words: vocabulary vectors (used for sizing / pandas indexing)
        idx2_vec: descriptor vectors to assign
        idx_name (str): label used when building pandas series names
        dense (bool): if True every word gets a (possibly empty) entry
        nAssign (int): number of words each vector is assigned to
        with_pandas (bool): if True wrap the outputs in pandas structures

    Returns:
        tuple: (wx2_idxs, idx2_wx) inverted and forward assignments

    NOTE(review): this definition is shadowed by a later duplicate of the
    same name in this file; only the last definition takes effect.

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex = smk_debug.testdata_raw_internals0()
    >>> words = invindex.words
    >>> wordflann = invindex.wordflann
    >>> idx2_vec = invindex.idx2_dvec
    >>> dense = True
    >>> nAssign = ibs.cfg.query_cfg.smk_cfg.nAssign
    >>> idx_name, series_name = 'idx', 'wx2_idxs'
    >>> _dbargs = (wordflann, words, idx2_vec, idx_name, dense, nAssign)
    >>> wx2_idxs, idx2_wx = assign_to_words_(*_dbargs)
    """
    idx2_vec_values = pdh.ensure_values(idx2_vec)
    # Find each vector's nearest word(s)
    _idx2_wx, _idx2_wdist = wordflann.nn_index(idx2_vec_values, nAssign)
    if nAssign > 1:
        # Multi-assignment filtering as in
        # http://lear.inrialpes.fr/pubs/2010/JDS10a/jegou_improvingbof_preprint.pdf
        # Assignments with distance >= alpha * d_0 are invalid (alpha = 1.2,
        # d_0 = distance to the nearest word).
        alpha = 1.2
        thresh = alpha * _idx2_wdist.T[0:1].T
        invalid = _idx2_wdist >= thresh
        # Weighting as in Lost in Quantization:
        # weight = exp(-(d ** 2) / (2 * sigma ** 2)), normalized per vector
        sigma = 80
        unnorm_weight = np.exp(np.divide(-_idx2_wdist.astype(np.float64),
                                         2 * (sigma ** 2)))
        masked_weight = np.ma.masked_array(unnorm_weight, mask=invalid)
        weight = masked_weight / masked_weight.sum(axis=1)[:, np.newaxis]
        masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
        # FIXME(review): these multi-assignment results are computed but never
        # used -- the inverted index below is still built from the raw
        # _idx2_wx.  (list() wrappers added so the maps actually evaluate
        # under Python 3; the bare Py2-style map objects were never consumed.)
        idx2_wxs = list(map(utool.filter_Nones, masked_wxs.tolist()))  # NOQA
        idx2_wx_weights = list(map(utool.filter_Nones, weight.tolist()))  # NOQA
    # Compute inverted index.  Pandas grouping measured faster here, so it is
    # used unconditionally (the original `True or with_pandas` was always True).
    PANDAS_GROUP = True
    if PANDAS_GROUP:
        word_assignments = pd.DataFrame(_idx2_wx, columns=['wx'])  # 141 us
        word_group = word_assignments.groupby('wx')                # 34.5 us
        _wx2_idxs = word_group['wx'].indices                       # 8.6 us
    else:
        idx2_idx = np.arange(len(idx2_vec))
        wx_list, groupxs = smk_speed.group_indicies(_idx2_wx)     # 5.52 ms
        idxs_list = smk_speed.apply_grouping(idx2_idx, groupxs)   # 2.9 ms
        _wx2_idxs = dict(zip(wx_list, idxs_list))                 # 753 us
    if with_pandas:
        idx_series = pdh.ensure_index(idx2_vec)
        wx_series = pdh.ensure_index(words)
        wx2_idxs = pdh.pandasify_dict1d(
            _wx2_idxs, wx_series, idx_name, ('wx2_' + idx_name + 's'),
            dense=dense)  # 274 ms 97.4 %
        idx2_wx = pdh.IntSeries(_idx2_wx, index=idx_series, name='wx')
    else:
        if dense:
            # Give every word an entry; unassigned words map to an empty array
            wx2_idxs = {
                wx: (_wx2_idxs[wx].astype(INDEX_TYPE)
                     if wx in _wx2_idxs else
                     np.empty(0, dtype=INDEX_TYPE))
                for wx in range(len(words))
            }
        else:
            wx2_idxs = _wx2_idxs
        idx2_wx = _idx2_wx
    return wx2_idxs, idx2_wx
def assign_to_words_(wordflann, words, idx2_vec, idx_name='idx', dense=True,
                     nAssign=1, massign_alpha=1.2, massign_sigma=80):
    """
    Assigns descriptor-vectors to the nearest word(s).
    Returns inverted index, multi-assigned weights, and forward index.

    wx2_idxs - word index -> vector indexes
    wx2_maws - word index -> multi-assignment weights
    idx2_wxs - vector index -> assigned word indexes

    Args:
        wordflann: FLANN index built over the word (vocabulary) vectors
        words: vocabulary vectors
        idx2_vec: descriptor vectors to assign
        idx_name (str): label used when building pandas series names
        dense (bool): passed through to the pandasify step
        nAssign (int): number of words each vector is assigned to
        massign_alpha (float): multi-assignment distance-filter factor
        massign_sigma (float): Gaussian sigma for multi-assignment weighting

    Returns:
        tuple: (wx2_idxs, wx2_maws, idx2_wxs)

    NOTE(review): this definition is shadowed by a later duplicate of the
    same name in this file; only the last definition takes effect.

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex = smk_debug.testdata_raw_internals0()
    >>> words = invindex.words
    >>> wordflann = invindex.wordflann
    >>> idx2_vec = invindex.idx2_dvec
    >>> dense = True
    >>> nAssign = ibs.cfg.query_cfg.smk_cfg.nAssign
    >>> _dbargs = (wordflann, words, idx2_vec, idx_name, dense, nAssign)
    >>> wx2_idxs, wx2_maws, idx2_wxs = assign_to_words_(*_dbargs)
    """
    idx2_vec_values = pdh.ensure_values(idx2_vec)
    # Assign each vector to its nAssign nearest visual words
    _idx2_wx, _idx2_wdist = wordflann.nn_index(idx2_vec_values, nAssign)
    if nAssign > 1:
        # MultiAssignment filtering from Improving Bag of Features
        # http://lear.inrialpes.fr/pubs/2010/JDS10a/jegou_improvingbof_preprint.pdf
        # Invalidate assignments with distance >= alpha * d_0
        thresh = np.multiply(massign_alpha, _idx2_wdist.T[0:1].T)
        invalid = np.greater_equal(_idx2_wdist, thresh)
        # Weighting as in Lost in Quantization:
        # maw = exp(-d / (2 * sigma ** 2)), normalized per vector
        gauss_numer = -_idx2_wdist.astype(np.float64)
        gauss_denom = 2 * (massign_sigma ** 2)
        gauss_exp = np.divide(gauss_numer, gauss_denom)
        unnorm_maw = np.exp(gauss_exp)
        # Mask invalid multiassignment weights
        masked_unorm_maw = np.ma.masked_array(unnorm_maw, mask=invalid)
        # Normalize multiassignment weights from 0 to 1
        masked_norm = masked_unorm_maw.sum(axis=1)[:, np.newaxis]
        masked_maw = np.divide(masked_unorm_maw, masked_norm)
        masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
        # Remove masked weights and word indexes (jagged lists per vector)
        idx2_wxs = list(map(utool.filter_Nones, masked_wxs.tolist()))
        idx2_maws = list(map(utool.filter_Nones, masked_maw.tolist()))
    else:
        # FIX(review): keep the jagged (list-of-lists) structure that the
        # grouping below expects.  FLANN returns a 1-d array for a single
        # neighbor, so the original flat .tolist() yielded scalar entries and
        # a flat [1.0] weight list, breaking len(wxs) in the inversion step.
        _idx2_wx_2d = _idx2_wx.reshape(len(_idx2_wx), -1)
        idx2_wxs = _idx2_wx_2d.tolist()
        idx2_maws = [[1.0] * len(wxs) for wxs in idx2_wxs]
    # Invert mapping -- group vector indexes (and weights) by word index
    jagged_idxs = ([idx] * len(wxs) for idx, wxs in enumerate(idx2_wxs))
    wx_keys, groupxs = clustertool.jagged_group(idx2_wxs)
    idxs_list = clustertool.apply_jagged_grouping(jagged_idxs, groupxs)
    maws_list = clustertool.apply_jagged_grouping(idx2_maws, groupxs)
    wx2_idxs = dict(zip(wx_keys, idxs_list))
    wx2_maws = dict(zip(wx_keys, maws_list))
    if WITH_PANDAS:
        idx_series = pdh.ensure_index(idx2_vec)
        wx_series = pdh.ensure_index(words)
        wx2_idxs = pdh.pandasify_dict1d(
            wx2_idxs, wx_series, idx_name, ('wx2_' + idx_name + 's'),
            dense=dense)
        idx2_wxs = pdh.IntSeries(idx2_wxs, index=idx_series, name='wx')
    return wx2_idxs, wx2_maws, idx2_wxs
def assign_to_words_(wordflann, words, idx2_vec, idx_name='idx', dense=True,
                     nAssign=1, with_pandas=WITH_PANDAS):
    """
    Assigns descriptor-vectors to the nearest word(s).
    Returns forward and inverted index.

    Args:
        wordflann: FLANN index built over the word (vocabulary) vectors
        words: vocabulary vectors (used for sizing / pandas indexing)
        idx2_vec: descriptor vectors to assign
        idx_name (str): label used when building pandas series names
        dense (bool): if True every word gets a (possibly empty) entry
        nAssign (int): number of words each vector is assigned to
        with_pandas (bool): if True wrap the outputs in pandas structures

    Returns:
        tuple: (wx2_idxs, idx2_wx) inverted and forward assignments

    NOTE(review): this file defines `assign_to_words_` multiple times; a
    later duplicate shadows this one, so this body is effectively dead.

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex = smk_debug.testdata_raw_internals0()
    >>> words = invindex.words
    >>> wordflann = invindex.wordflann
    >>> idx2_vec = invindex.idx2_dvec
    >>> dense = True
    >>> nAssign = ibs.cfg.query_cfg.smk_cfg.nAssign
    >>> idx_name, series_name = 'idx', 'wx2_idxs'
    >>> _dbargs = (wordflann, words, idx2_vec, idx_name, dense, nAssign)
    >>> wx2_idxs, idx2_wx = assign_to_words_(*_dbargs)
    """
    idx2_vec_values = pdh.ensure_values(idx2_vec)
    # Nearest-word lookup for every descriptor
    _idx2_wx, _idx2_wdist = wordflann.nn_index(idx2_vec_values, nAssign)
    if nAssign > 1:
        # Multi-assignment filtering as in
        # http://lear.inrialpes.fr/pubs/2010/JDS10a/jegou_improvingbof_preprint.pdf
        # Reject assignments with distance >= alpha * d_0 (d_0 = nearest dist)
        alpha = 1.2
        thresh = alpha * _idx2_wdist.T[0:1].T
        invalid = _idx2_wdist >= thresh
        # Weighting as in Lost in Quantization:
        # weight = exp(-(d ** 2) / (2 * sigma ** 2)), normalized per vector
        sigma = 80
        unnorm_weight = np.exp(np.divide(-_idx2_wdist.astype(np.float64),
                                         2 * (sigma ** 2)))
        masked_weight = np.ma.masked_array(unnorm_weight, mask=invalid)
        weight = masked_weight / masked_weight.sum(axis=1)[:, np.newaxis]
        masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
        # FIXME(review): computed but unused -- the inverted index below still
        # uses the raw _idx2_wx.  list() added so the maps evaluate on Py3
        # (the bare Py2-style map objects were never consumed).
        idx2_wxs = list(map(utool.filter_Nones, masked_wxs.tolist()))  # NOQA
        idx2_wx_weights = list(map(utool.filter_Nones, weight.tolist()))  # NOQA
    # Build the inverted index.  Pandas grouping measured faster here, so it
    # is used unconditionally (the original `True or with_pandas` was a
    # constant-True expression).
    PANDAS_GROUP = True
    if PANDAS_GROUP:
        word_assignments = pd.DataFrame(_idx2_wx, columns=['wx'])  # 141 us
        word_group = word_assignments.groupby('wx')                # 34.5 us
        _wx2_idxs = word_group['wx'].indices                       # 8.6 us
    else:
        idx2_idx = np.arange(len(idx2_vec))
        wx_list, groupxs = smk_speed.group_indicies(_idx2_wx)     # 5.52 ms
        idxs_list = smk_speed.apply_grouping(idx2_idx, groupxs)   # 2.9 ms
        _wx2_idxs = dict(zip(wx_list, idxs_list))                 # 753 us
    if with_pandas:
        idx_series = pdh.ensure_index(idx2_vec)
        wx_series = pdh.ensure_index(words)
        wx2_idxs = pdh.pandasify_dict1d(
            _wx2_idxs, wx_series, idx_name, ('wx2_' + idx_name + 's'),
            dense=dense)  # 274 ms 97.4 %
        idx2_wx = pdh.IntSeries(_idx2_wx, index=idx_series, name='wx')
    else:
        if dense:
            # Every word gets an entry; unassigned words get an empty array
            wx2_idxs = {
                wx: (_wx2_idxs[wx].astype(INDEX_TYPE)
                     if wx in _wx2_idxs else
                     np.empty(0, dtype=INDEX_TYPE))
                for wx in range(len(words))
            }
        else:
            wx2_idxs = _wx2_idxs
        idx2_wx = _idx2_wx
    return wx2_idxs, idx2_wx
def assign_to_words_(wordflann, words, idx2_vec, idx_name='idx', dense=True,
                     nAssign=1, massign_alpha=1.2, massign_sigma=80):
    """
    Assigns descriptor-vectors to the nearest word(s).
    Returns inverted index, multi-assigned weights, and forward index.

    wx2_idxs - word index -> vector indexes
    wx2_maws - word index -> multi-assignment weights
    idx2_wxs - vector index -> assigned word indexes

    Args:
        wordflann: FLANN index built over the word (vocabulary) vectors
        words: vocabulary vectors
        idx2_vec: descriptor vectors to assign
        idx_name (str): label used when building pandas series names
        dense (bool): passed through to the pandasify step
        nAssign (int): number of words each vector is assigned to
        massign_alpha (float): multi-assignment distance-filter factor
        massign_sigma (float): Gaussian sigma for multi-assignment weighting

    Returns:
        tuple: (wx2_idxs, wx2_maws, idx2_wxs)

    NOTE(review): this file defines `assign_to_words_` multiple times; this
    final definition is the one that takes effect at import time.

    >>> from ibeis.model.hots.smk.smk_index import *  # NOQA
    >>> from ibeis.model.hots.smk import smk_debug
    >>> ibs, annots_df, daids, qaids, invindex = smk_debug.testdata_raw_internals0()
    >>> words = invindex.words
    >>> wordflann = invindex.wordflann
    >>> idx2_vec = invindex.idx2_dvec
    >>> dense = True
    >>> nAssign = ibs.cfg.query_cfg.smk_cfg.nAssign
    >>> _dbargs = (wordflann, words, idx2_vec, idx_name, dense, nAssign)
    >>> wx2_idxs, wx2_maws, idx2_wxs = assign_to_words_(*_dbargs)
    """
    idx2_vec_values = pdh.ensure_values(idx2_vec)
    # Assign each vector to its nAssign nearest visual words
    _idx2_wx, _idx2_wdist = wordflann.nn_index(idx2_vec_values, nAssign)
    if nAssign > 1:
        # MultiAssignment filtering from Improving Bag of Features
        # http://lear.inrialpes.fr/pubs/2010/JDS10a/jegou_improvingbof_preprint.pdf
        # Reject assignments with distance >= alpha * d_0
        thresh = np.multiply(massign_alpha, _idx2_wdist.T[0:1].T)
        invalid = np.greater_equal(_idx2_wdist, thresh)
        # Weighting as in Lost in Quantization:
        # maw = exp(-d / (2 * sigma ** 2)), normalized per vector
        gauss_numer = -_idx2_wdist.astype(np.float64)
        gauss_denom = 2 * (massign_sigma ** 2)
        gauss_exp = np.divide(gauss_numer, gauss_denom)
        unnorm_maw = np.exp(gauss_exp)
        # Mask invalid multiassignment weights
        masked_unorm_maw = np.ma.masked_array(unnorm_maw, mask=invalid)
        # Normalize multiassignment weights from 0 to 1
        masked_norm = masked_unorm_maw.sum(axis=1)[:, np.newaxis]
        masked_maw = np.divide(masked_unorm_maw, masked_norm)
        masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
        # Remove masked weights and word indexes (jagged lists per vector)
        idx2_wxs = list(map(utool.filter_Nones, masked_wxs.tolist()))
        idx2_maws = list(map(utool.filter_Nones, masked_maw.tolist()))
    else:
        # FIX(review): preserve the jagged (list-of-lists) structure the
        # inversion below relies on.  With one neighbor FLANN returns a 1-d
        # array, so the original flat .tolist() produced scalar entries and a
        # flat [1.0] weight list, breaking len(wxs) during grouping.
        _idx2_wx_2d = _idx2_wx.reshape(len(_idx2_wx), -1)
        idx2_wxs = _idx2_wx_2d.tolist()
        idx2_maws = [[1.0] * len(wxs) for wxs in idx2_wxs]
    # Invert mapping -- group vector indexes (and weights) by word index
    jagged_idxs = ([idx] * len(wxs) for idx, wxs in enumerate(idx2_wxs))
    wx_keys, groupxs = clustertool.jagged_group(idx2_wxs)
    idxs_list = clustertool.apply_jagged_grouping(jagged_idxs, groupxs)
    maws_list = clustertool.apply_jagged_grouping(idx2_maws, groupxs)
    wx2_idxs = dict(zip(wx_keys, idxs_list))
    wx2_maws = dict(zip(wx_keys, maws_list))
    if WITH_PANDAS:
        idx_series = pdh.ensure_index(idx2_vec)
        wx_series = pdh.ensure_index(words)
        wx2_idxs = pdh.pandasify_dict1d(
            wx2_idxs, wx_series, idx_name, ('wx2_' + idx_name + 's'),
            dense=dense)
        idx2_wxs = pdh.IntSeries(idx2_wxs, index=idx_series, name='wx')
    return wx2_idxs, wx2_maws, idx2_wxs