def compute_idf_label1(aids_list, daid2_label):
    """
    One of our idf extensions

    Example:
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _
        >>> wx2_idf = compute_idf_label1(aids_list, daid2_label)
    """
    nWords = len(aids_list)
    # Computes our novel label idf weight
    lblindex_list = np.array(ut.tuples_to_unique_scalars(list(daid2_label.values())))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]
    daid2_wxs = ut.ddict(list)
    for wx, daids in enumerate(aids_list):
        for daid in daids:
            daid2_wxs[daid].append(wx)
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    nLabels = len(unique_lblindexes)
    pcntLblsWithWord = np.zeros(nWords, np.float64)
    # Get num times word appears for each label
    for lblindex, daids in lblindex2_daids:
        nWordsWithLabel = np.zeros(nWords)
        for daid in daids:
            wxs = daid2_wxs[daid]
            nWordsWithLabel[wxs] += 1
        pcntLblsWithWord += (1 - nWordsWithLabel.astype(np.float64) / len(daids))
    # Labels for each word
    idf_list = np.log(np.divide(nLabels, np.add(pcntLblsWithWord, 1),
                                dtype=hstypes.FLOAT_TYPE),
                      dtype=hstypes.FLOAT_TYPE)
    return idf_list
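
# A minimal, self-contained sketch of the label-idf weighting above on toy
# data. This helper is hypothetical (not part of the original module); the toy
# inputs stand in for aids_list / daid2_label, and only numpy is assumed. It
# mirrors the arithmetic of compute_idf_label1 without the utool/vtool
# grouping helpers.
def _demo_idf_label1_toy():
    # Three words; toy_aids_list[wx] lists the annotation ids containing word wx
    toy_aids_list = [[1, 2], [3], [1, 2, 3, 4]]
    # Labels are (name, viewpoint) tuples, as in daid2_label
    toy_daid2_label = {1: ('n1', 'L'), 2: ('n1', 'L'), 3: ('n2', 'R'), 4: ('n2', 'R')}
    nWords = len(toy_aids_list)
    # Group annotation ids by label
    lbl2_daids = {}
    for daid, lbl in toy_daid2_label.items():
        lbl2_daids.setdefault(lbl, []).append(daid)
    nLabels = len(lbl2_daids)
    # Accumulate, per word, the fraction of each label's annotations that
    # do not contain the word (as in the loop above)
    pcntLblsWithWord = np.zeros(nWords)
    for lbl, daids in lbl2_daids.items():
        nWordsWithLabel = np.zeros(nWords)
        for wx, aids in enumerate(toy_aids_list):
            nWordsWithLabel[wx] = sum(daid in daids for daid in aids)
        pcntLblsWithWord += (1 - nWordsWithLabel / len(daids))
    # Same weighting formula as compute_idf_label1
    idf_list = np.log(nLabels / (pcntLblsWithWord + 1))
    return idf_list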
def compute_negentropy_names(aids_list, daid2_label):
    r"""
    One of our idf extensions

    Word weighting based on the negative entropy over all names of p(n_i | word)

    Args:
        aids_list (list of aids):
        daid2_label (dict from daid to label):

    Returns:
        negentropy_list (ndarray[float32]): idf-like weighting for each word
            based on the negative entropy

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _

    Math::
        p(n_i | \word) = \sum_{\lbl \in L_i} p(\lbl | \word)

        p(\lbl | \word) = \frac{p(\word | \lbl) p(\lbl)}{p(\word)}

        p(\word) = \sum_{\lbl' \in L} p(\word | \lbl') p(\lbl')

        p(\word | \lbl) = NumAnnotOfLabelWithWord / NumAnnotWithLabel
                        = \frac{\sum_{\X \in \DB_\lbl} b(\word, \X)}{\card{\DB_\lbl}}

        h(n_i | word) = -\sum_{i=1}^N p(n_i | \word) \log p(n_i | \word)

        word_weight = log(N) - h(n | word)

    CommandLine:
        python dev.py -t smk2 --allgt --db GZ_ALL
        python dev.py -t smk5 --allgt --db GZ_ALL

    Auto:
        python -c "import utool as ut; ut.print_auto_docstr('ibeis.algo.hots.smk.smk_index', 'compute_negentropy_names')"
    """
    nWords = len(aids_list)
    # --- LABEL MEMBERS w.r.t daids ---
    # compute mapping from label to daids
    # Translate tuples into scalars for efficiency
    label_list = list(daid2_label.values())
    lblindex_list = np.array(ut.tuples_to_unique_scalars(label_list))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]

    # --- DAID MEMBERS w.r.t. words ---
    # compute mapping from daid to word indexes
    # finds all the words that belong to an annotation
    daid2_wxs = ut.ddict(list)
    for wx, _daids in enumerate(aids_list):
        for daid in _daids:
            daid2_wxs[daid].append(wx)

    # --- \Pr(\word \given \lbl) for each label ---
    # Compute the number of annotations in a label with the word vs
    # the number of annotations in the label
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    # Get num times word appears for each label
    probWordGivenLabel_list = []
    for lblindex, _daids in lblindex2_daids:
        nAnnotOfLabelWithWord = np.zeros(nWords, dtype=np.int32)
        for daid in _daids:
            wxs = np.unique(daid2_wxs[daid])
            nAnnotOfLabelWithWord[wxs] += 1
        probWordGivenLabel = nAnnotOfLabelWithWord.astype(np.float64) / len(_daids)
        probWordGivenLabel_list.append(probWordGivenLabel)
    # (nLabels, nWords)
    probWordGivenLabel_arr = np.array(probWordGivenLabel_list)

    # --- \Pr(\lbl \given \word) ---
    # compute partition function that approximates probability of a word
    # (1, nWords)
    probWord = probWordGivenLabel_arr.sum(axis=0)
    probWord.shape = (1, probWord.size)
    # (nLabels, nWords)
    probLabelGivenWord_arr = (probWordGivenLabel_arr / probWord)

    # --- \Pr(\name \given \word) ---
    # get names for each unique label
    nid_list = np.array([label_list[xs[0]][0] for xs in groupxs])
    unique_nids, groupxs_ = clustertool.group_indices(nid_list)
    # (nNames, nWords)
    # add a little wiggle room
    eps = 1E-9
    # http://stackoverflow.com/questions/872544/precision-of-floating-point
    #epsilon = 2^(E-52)  # For a 64-bit float (double precision)
    #epsilon = 2^(E-23)  # For a 32-bit float (single precision)
    #epsilon = 2^(E-10)  # For a 16-bit float (half precision)
    probNameGivenWord = eps + (1.0 - eps) * np.array(
        [probLabelGivenWord_arr.take(xs, axis=0).sum(axis=0) for xs in groupxs_])
    logProbNameGivenWord = np.log(probNameGivenWord)
    wordNameEntropy = -(probNameGivenWord * logProbNameGivenWord).sum(0)
    # Compute negative entropy for weights
    nNames = len(nid_list)
    negentropy_list = np.log(nNames) - wordNameEntropy
    return negentropy_list
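
# A minimal, self-contained sketch of the entropy-based weighting above on toy
# probabilities. This helper is hypothetical (not part of the original module);
# it starts from a hand-made p(name | word) table rather than aids/labels, and
# only numpy is assumed. Words whose p(name | word) mass is concentrated on one
# name score near log(nNames); words spread evenly over names score near 0.
def _demo_negentropy_names_toy():
    # (nNames, nWords): p(name | word) for 2 names and 3 words
    probNameGivenWord = np.array([[0.99, 0.5, 0.01],
                                  [0.01, 0.5, 0.99]])
    eps = 1E-9
    probNameGivenWord = eps + (1.0 - eps) * probNameGivenWord
    wordNameEntropy = -(probNameGivenWord * np.log(probNameGivenWord)).sum(axis=0)
    nNames = probNameGivenWord.shape[0]
    # word_weight = log(N) - h(n | word), as in the Math block above
    negentropy_list = np.log(nNames) - wordNameEntropy
    return negentropy_list  # approx [0.637, 0.0, 0.637]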