def find_top_word(self, _class, topk=None):
    '''Score every indexed term against ``_class`` and return the best ones.

    Each term from ``self.ciindex.get_terms()`` is scored with
    ``self.estimate_mi(_class, term)`` and paired up as ``(score, term)``.

    Args:
        _class: the class whose keywords are wanted.
        topk: how many pairs to keep. Any falsy value (``None`` or ``0``)
            means "keep every term".

    Returns:
        A list of ``(score, term)`` tuples. When ``topk`` is falsy the list
        holds every term, ordered from highest to lowest tuple. Otherwise
        it is whatever ``TopkHeap(topk).topk()`` yields after all pairs
        have been pushed (ordering is decided by ``TopkHeap``).
    '''
    if not topk:
        # No limit requested: rank the complete vocabulary.
        scored = [
            (self.estimate_mi(_class, word), word)
            for word in self.ciindex.get_terms()
        ]
        scored.sort()
        scored.reverse()  # ascending sort flipped -> best score first
        return scored

    # Bounded case: let the heap retain only the ``topk`` best pairs.
    best = TopkHeap(topk)
    for word in self.ciindex.get_terms():
        best.push((self.estimate_mi(_class, word), word))
    return best.topk()
def find_top_word(self, _class, topk=None):
    '''Find the keywords of the given ``_class``.

    Every term reported by ``self.ciindex.get_terms()`` is paired with the
    score ``self.estimate_mi(_class, term)``.

    Args:
        _class: the class to find keywords for.
        topk: number of keywords to keep; ``None`` (or any other falsy
            value, such as ``0``) means all words.

    Returns:
        A list of ``(score, term)`` tuples. With a truthy ``topk`` this is
        the result of ``TopkHeap(topk).topk()``; otherwise it is the full
        list of pairs in descending tuple order.
    '''
    if topk:
        collector = TopkHeap(topk)
        for term in self.ciindex.get_terms():
            score = self.estimate_mi(_class, term)
            collector.push((score, term))
        return collector.topk()

    # Unbounded request: sort all pairs ascending, then flip to descending.
    ranked = sorted(
        (self.estimate_mi(_class, term), term)
        for term in self.ciindex.get_terms()
    )
    ranked.reverse()
    return ranked
def get_class_keywords(self, _class, top_k):
    '''Return the terms that rank highest for ``_class``.

    Iterates over ``self.term_num_docs`` and ranks each term by the value
    stored at ``self.term_cdf[term][_class]`` (presumably a per-class
    count for the term; confirm against where ``term_cdf`` is built).

    Args:
        _class: key used to look up each term's value in ``self.term_cdf``.
        top_k: capacity handed to ``TopkHeap``.

    Returns:
        A list of terms only (the ranking value is dropped), in the order
        produced by ``TopkHeap.topk()``.
    '''
    ranking = TopkHeap(top_k)
    for word in self.term_num_docs:
        weight = self.term_cdf[word][_class]
        ranking.push((weight, word))

    # Each retained entry is a (weight, term) pair; keep just the term.
    keywords = []
    for entry in ranking.topk():
        keywords.append(entry[1])
    return keywords
def top_k_appear(self, k):
    '''Return the ``k`` best terms ranked by ``self.get_word_appear``.

    Args:
        k: capacity handed to ``TopkHeap``.

    Returns:
        The result of ``TopkHeap.topk()`` after a ``(value, term)`` pair
        has been pushed for every term in ``self.get_terms()``, where
        ``value`` is ``self.get_word_appear(term)``.
    '''
    leaders = TopkHeap(k)
    for word in self.get_terms():
        appearances = self.get_word_appear(word)
        leaders.push((appearances, word))
    return leaders.topk()