def decrease_duplicate_count(self, all_words):
    """Remove n-grams that are fully accounted for by a containing word.

    Walks the ``(word, entry)`` pairs produced by
    ``self.sort_by_keys_length(all_words)`` and, for every group of
    n-grams that ``ngram.to_ngrams(word, len(word))`` produces for that
    word, looks at each distinct n-gram that is currently a key of
    ``all_words``. When such an n-gram is not the word itself and its
    entry has the same ``count`` as the word's entry, it is removed via
    ``self.del_word``.

    :param all_words: mapping from word to an entry exposing ``count``.
    :returns: the mapping as returned by the last ``self.del_word`` call,
        or ``all_words`` itself if nothing was removed.
    """
    # NOTE(review): the visiting order comes from sort_by_keys_length;
    # presumably ordered by key length -- confirm against that helper.
    for word, entry in self.sort_by_keys_length(all_words):
        for gram_group in ngram.to_ngrams(word, len(word)):
            # The filter/lambda form is kept deliberately: the lambda looks
            # up ``all_words`` at call time, so the membership test follows
            # any rebinding performed further down in this loop.
            known_grams = filter(lambda x: x in all_words, set(gram_group))
            for candidate in known_grams:
                if word == candidate:
                    # A word is trivially its own full-length n-gram.
                    continue
                if entry.count != all_words[candidate].count:
                    # NOTE: a previously disabled branch subtracted the
                    # n-gram's count from the word's count here; entries
                    # with differing counts are currently left untouched.
                    continue
                all_words = self.del_word(candidate, all_words)
    return all_words
def test_to_ngrams():
    """Check the n-gram groups ``ngram.to_ngrams`` produces for a 4-char word.

    The earlier form, ``assert got, [...]``, only checked that ``got`` was
    truthy and used the list as the failure message, so it could never
    fail on a wrong result. The result is now compared to the expected
    groups explicitly.
    """
    got = ngram.to_ngrams(u'おまんこ', 4)
    expected = [
        [u'おま', u'まん', u'んこ'],
        [u'おまん', u'まんこ'],
        [u'おまんこ'],
    ]
    # list() so the comparison also works if to_ngrams returns a lazy
    # iterable rather than a list -- TODO confirm its return type.
    assert list(got) == expected