def __init__(self, bad_files_num, good_files_num, ngram_size, top_ngrams_size):
    """
    stores the per-class file counts and their share of the whole corpus,
    and creates the (initially empty) sorted collection of top ngrams.
    raises ZeroDivisionError when both file counts are 0.
    """
    self._bad_files_num = bad_files_num
    self._good_files_num = good_files_num
    self.top_ngrams_size = top_ngrams_size
    self.ngram_size = ngram_size

    # total number of files, reused for both class proportions below
    total_files = bad_files_num + good_files_num
    self._files_num = total_files
    self._good_files_proportion = good_files_num / total_files
    self._bad_files_proportion = bad_files_num / total_files

    # entries are keyed on their second element (index 1)
    self._most_informative_ngrams = SortedCollection(key=itemgetter(1))
class Classifier(object):
    """
    calculates the information gain (ig) of ngrams, and holds a sorted
    collection that keeps only the ngrams with the highest igs
    """
    GOOD_FILES_PROP = 0
    BAD_FILES_PROP = 1

    def __init__(self, bad_files_num, good_files_num, ngram_size, top_ngrams_size):
        """
        stores the corpus sizes and creates an empty collection of top ngrams.

        :param bad_files_num: number of files in the "bad" class
        :param good_files_num: number of files in the "good" class
        :param ngram_size: stored on the instance; not read by any method here
        :param top_ngrams_size: maximum number of ngrams kept in the collection
        :raises ZeroDivisionError: when both file counts are 0
        """
        self._bad_files_num = bad_files_num
        self._good_files_num = good_files_num
        self.top_ngrams_size = top_ngrams_size
        self.ngram_size = ngram_size
        self._files_num = bad_files_num + good_files_num
        self._good_files_proportion = good_files_num / self._files_num
        self._bad_files_proportion = bad_files_num / self._files_num
        # items are (ngram_str, 1 - ig) tuples, keyed on the second element
        self._most_informative_ngrams = SortedCollection(key=itemgetter(1))

    @staticmethod
    def _proportion(count, total):
        """
        returns count / total, or 0.0 when total is 0 (a class with no files)
        """
        if total == 0:
            return 0.0
        return count / total

    @staticmethod
    def _calculate_element(p_vj_ci, p_vj, p_ci):
        """
        calculates one element of the sum that makes up the total ig of a ngram:
        p_vj_ci * log10(p_vj_ci / (p_vj * p_ci)).
        returns 0 when any of the three arguments is 0, so neither the
        division nor the logarithm is evaluated on a zero.
        """
        if p_ci == 0 or p_vj == 0 or p_vj_ci == 0:
            return 0
        return p_vj_ci * log((p_vj_ci / (p_vj * p_ci)), 10)

    def _get_ngram_ig(self, ngram):
        """
        calculates the ig of given ngram according to ig formula.

        :param ngram: object exposing good_prop, bad_prop and proportion
        :return: sum of the four elements (appearance / absence of the ngram
                 in good / bad files)
        """
        # NOTE(review): good_prop / bad_prop are per-class proportions
        # (appearances divided by the files of that class, see add_new_ngram),
        # while the textbook mutual-information term uses the joint
        # probability -- confirm this scoring is the intended one.
        app_in_good_files = self._calculate_element(ngram.good_prop, ngram.proportion, self._good_files_proportion)
        app_in_bad_files = self._calculate_element(ngram.bad_prop, ngram.proportion, self._bad_files_proportion)
        absence_from_good_files = self._calculate_element((1 - ngram.good_prop), (1 - ngram.proportion),
                                                          self._good_files_proportion)
        absence_from_bad_files = self._calculate_element((1 - ngram.bad_prop), (1 - ngram.proportion),
                                                         self._bad_files_proportion)
        return app_in_bad_files + app_in_good_files + absence_from_bad_files + absence_from_good_files

    def _add_ngram_to_collection(self, ngram_item):
        """
        adds given (ngram_str, 1 - ig) item to the sorted collection, and
        evicts one item when the collection grows past top_ngrams_size
        """
        self._most_informative_ngrams.insert(ngram_item)
        size = self._most_informative_ngrams.get_size()

        if size > self.top_ngrams_size:
            # remove the ngram with worst ig from collection: items are keyed
            # on 1 - ig, so the item just past the allowed size is presumably
            # the one with the lowest ig (assumes ascending order -- TODO confirm)
            self._most_informative_ngrams.remove(self._most_informative_ngrams[self.top_ngrams_size])

    def add_new_ngram(self, ngram_str, good_appearances, bad_appearances):
        """
        creates a new ngram, scores it and adds it to the collection.

        :param ngram_str: the ngram itself
        :param good_appearances: number of good files the ngram appears in
        :param bad_appearances: number of bad files the ngram appears in
        """
        # a class with no files contributes a proportion of 0 instead of
        # raising ZeroDivisionError; _calculate_element then zeroes its terms
        good_prop = self._proportion(good_appearances, self._good_files_num)
        bad_prop = self._proportion(bad_appearances, self._bad_files_num)
        prop = (good_appearances + bad_appearances) / self._files_num
        ngram = Ngram(ngram_str, prop, good_prop, bad_prop)
        ngram.ig = self._get_ngram_ig(ngram)
        # store 1 - ig so that a higher ig yields a smaller sort key
        ngram_item = (ngram.ngram_str, 1 - ngram.ig)
        self._add_ngram_to_collection(ngram_item)

    def get_most_informative_ngrams(self):
        """
        when we finish - return ngrams with best igs

        :return: list of the ngram strings, in the collection's order
        """
        return [ngram_item[0] for ngram_item in self._most_informative_ngrams]