def setUp(self):
    """Prepare the fixtures used by the tests.

    Sets three attributes:
      * ``self.itemlist`` - a fixed sequence of the labels "A" to "D".
      * ``self.sentence`` - the lines of ``strarf_serif.txt`` (decoded as
        UTF-8), each followed by the marker ``u"EOS"``.
      * ``self.itemlistsentence`` - whatever ``mecabCaller.parse`` returns
        for ``self.sentence``.
    """
    self.sentence = []
    self.itemlist = [
        "A", "B", "C", "C", "D", "A", "B", "B", "A", "D", "C", "B",
        "C", "A", "D", "A", "C", "D", "B", "D", "A", "A", "D",
    ]
    with codecs.open("strarf_serif.txt", 'rb', 'utf-8') as serif_file:
        for raw_line in serif_file:
            # NOTE(review): the marker is added after every line read from
            # the file - confirm this is the intended placement.
            self.sentence.extend((raw_line, u"EOS"))
    self.itemlistsentence = mecabCaller.parse(self.sentence)
def judge(self, text):
    """Decide whether *text* scores highly enough against ``self.freq``.

    The text is tokenised with ``mecabCaller.parse``.  Every token that is
    not one of the punctuation marks in ``esc`` and that is a key of
    ``self.freq`` adds its stored value to a running total.  The total is
    then divided by the number of tokens (punctuation included) and
    compared with a threshold of 8.

    Args:
        text: the text to evaluate; handed unchanged to
            ``mecabCaller.parse``.

    Returns:
        True when the per-token score is at least 8, otherwise False.
        False is also returned when tokenisation produces no tokens.
    """
    esc = [u'…', u'・', u'.', u',', u'、', u'。', u'!', u'?', u'!', u'?']
    words = mecabCaller.parse(text)

    # Accumulate the frequencies of all known, non-punctuation tokens.
    # (Previously the total was overwritten on every match, so only the
    # last matching token counted.)
    total = 0
    for word in words:
        if word not in esc and word in self.freq:
            total += self.freq[word]

    # Single-argument print(...) emits the same text whether it is parsed
    # as a Python 2 print statement or a Python 3 function call.
    print(words)
    print('sum=' + str(total))

    # Nothing to average over: avoid dividing by zero.
    if not words:
        return False

    # Under Python 2 this is integer division for integer counts; that does
    # not change the outcome of the ">= 8" comparison below.
    weightedsum = total / len(words)
    print('weightedsum=' + str(weightedsum))
    return weightedsum >= 8
def generate_unigram_model(favs):
    """Update and persist the unigram frequency model built from *favs*.

    An existing model is loaded with ``cpickler.frompickle`` from
    ``favs_model_unigram.dump``; if loading fails, an empty model is used.
    Each ``fav.text`` is tokenised with ``mecabCaller.parse``, the
    punctuation characters listed in ``esc`` are removed from every token,
    and the count of each remaining non-empty token is incremented.  The
    updated model is written back with ``cpickler.topickle``.

    Args:
        favs: an iterable of iterables whose elements expose a ``text``
            attribute.

    Returns:
        The updated mapping of token to occurrence count.
    """
    # NOTE(review): the model is unpickled from a local file; make sure that
    # file can never come from an untrusted source.
    try:
        freqfav = cpickler.frompickle(filename='favs_model_unigram.dump')
    except Exception:
        # Best effort: start from an empty model when none can be loaded,
        # without swallowing KeyboardInterrupt / SystemExit.
        freqfav = {}

    esc = [u'…', u'・', u'.', u',', u'、', u'。', u'!', u'?', u'!', u'?']
    for favlist in favs:
        for fav in favlist:
            for itemraw in mecabCaller.parse(fav.text):
                # Drop punctuation characters and rebuild a string so the
                # token is hashable.  (Previously this produced a list that
                # kept only the punctuation, which cannot be a dict key.)
                # Presumably tokens are unicode strings - TODO confirm.
                item = u''.join(x for x in itemraw if x not in esc)
                if not item:
                    # The token consisted solely of punctuation.
                    continue
                freqfav[item] = freqfav.get(item, 0) + 1

    cpickler.topickle(freqfav, filename='favs_model_unigram.dump')
    return freqfav