Example #1
0
    def find_top_word(self, _class, topk=None):
        '''Score every indexed term against _class and return the ranking.

        Each term from self.ciindex.get_terms() is paired with its
        self.estimate_mi(_class, term) score as a (score, term) tuple.

        Args:
          _class: the class whose keywords are wanted
          topk: how many keywords to keep; any falsy value (None or 0)
            means every term is ranked

        Returns:
          If topk is truthy, the result of TopkHeap(topk).topk() after every
          (score, term) tuple has been pushed (presumably the topk best
          tuples -- verify against TopkHeap). Otherwise a list of all
          (score, term) tuples sorted in descending order.
        '''
        if not topk:
            # No limit requested: rank the complete vocabulary.
            scored = [(self.estimate_mi(_class, word), word)
                      for word in self.ciindex.get_terms()]
            scored.sort()
            scored.reverse()
            return scored

        best = TopkHeap(topk)
        for word in self.ciindex.get_terms():
            best.push((self.estimate_mi(_class, word), word))
        return best.topk()
Example #2
0
  def find_top_word(self, _class, topk = None):
    '''Rank the indexed terms by their estimate_mi score for _class.

    Args:
      _class: the class whose keywords are wanted
      topk: number of keywords to keep; a falsy value (None or 0) ranks
        every term returned by self.ciindex.get_terms()

    Returns:
      With a truthy topk: whatever TopkHeap(topk).topk() yields once all
      (score, term) tuples were pushed (presumably the topk highest ones --
      confirm against TopkHeap). Otherwise: a list of every (score, term)
      tuple, highest first.
    '''
    vocabulary = self.ciindex.get_terms()
    if topk:
      collector = TopkHeap(topk)
      for candidate in vocabulary:
        score = self.estimate_mi(_class, candidate)
        collector.push((score, candidate))
      return collector.topk()

    # Unlimited case: sort ascending, then flip to get the best first.
    ascending = sorted(
      (self.estimate_mi(_class, candidate), candidate)
      for candidate in vocabulary
    )
    return list(reversed(ascending))
Example #3
0
    def build(self):
        '''Fill self.term_pmi with a TopkHeap of PMI partners for each term.

        The terms come from self.iindex.get_terms(). Every term first gets
        a fresh TopkHeap(self.top); then each unordered pair of terms is
        scored once with self.compute_pmi and the result is pushed, as a
        PMIElement, onto the heaps of both members of the pair.
        '''
        vocabulary = self.iindex.get_terms()
        size = len(vocabulary)

        for word in vocabulary:
            self.term_pmi[word] = TopkHeap(self.top)

        # Visit each pair (left < right) exactly once; the score is shared
        # by both directions, so it is computed a single time.
        for left in range(size - 1):
            first = vocabulary[left]
            for right in range(left + 1, size):
                second = vocabulary[right]
                score = self.compute_pmi(first, second)
                self.term_pmi[first].push(PMIElement(second, score))
                self.term_pmi[second].push(PMIElement(first, score))
Example #4
0
 def get_class_keywords(self, _class, top_k):
   '''Return the terms selected by a TopkHeap of per-class term values.

   For every term in self.term_num_docs the tuple
   (self.term_cdf[term][_class], term) is pushed onto TopkHeap(top_k).

   Args:
     _class: key used to look up each term's value in self.term_cdf[term]
     top_k: size passed to TopkHeap

   Returns:
     a list holding the term (second element) of every item returned by
     the heap's topk(), in the order topk() yields them
   '''
   ranking = TopkHeap(top_k)
   for word in self.term_num_docs:
     weight = self.term_cdf[word][_class]
     ranking.push((weight, word))

   keywords = []
   for entry in ranking.topk():
     # Drop the score and keep only the term.
     keywords.append(entry[1])
   return keywords
Example #5
0
 def top_k_appear(self, k):
   '''Push every term with its get_word_appear value onto a TopkHeap.

   Args:
     k: size passed to TopkHeap

   Returns:
     the result of the heap's topk() after a (self.get_word_appear(term),
     term) tuple has been pushed for each term of self.get_terms()
   '''
   ranking = TopkHeap(k)
   for word in self.get_terms():
     appearances = self.get_word_appear(word)
     ranking.push((appearances, word))
   selected = ranking.topk()
   return selected