def setUp(self):
		"""Prepare the fixture data used by the tests.

		Sets ``self.itemlist`` to a fixed sequence of labels, fills
		``self.sentence`` with every line of ``strarf_serif.txt`` (decoded
		as UTF-8, kept exactly as read) each followed by a ``u"EOS"``
		marker, and stores the result of ``mecabCaller.parse`` on that
		list in ``self.itemlistsentence``.
		"""
		self.itemlist = [
			"A", "B", "C", "C", "D", "A", "B", "B", "A", "D", "C", "B",
			"C", "A", "D", "A", "C", "D", "B", "D", "A", "A", "D",
		]
		self.sentence = []
		with codecs.open("strarf_serif.txt", 'rb', 'utf-8') as source:
			for raw_line in source:
				# Each line read from the file is followed by its own marker.
				self.sentence.extend([raw_line, u"EOS"])
		self.itemlistsentence = mecabCaller.parse(self.sentence)
# Example #2
# 0
 def judge(self, text):
     """Decide whether ``text`` scores at least 8 against ``self.freq``.

     ``text`` is split with ``mecabCaller.parse``. Every resulting word
     that is not one of the punctuation marks in ``esc`` and that is a key
     of ``self.freq`` adds its ``self.freq`` value to a running total. The
     total is then divided by the number of words the parser returned
     (punctuation included) and compared with the threshold 8.

     The word list, the total and the average are printed as debug output.

     Args:
         text: the text handed to ``mecabCaller.parse``.

     Returns:
         True when the average is at least 8, otherwise False. False is
         also returned when the parser yields no words at all.
     """
     # NOTE(review): u'!' and u'?' each appear twice; presumably one of each
     # pair was meant to be a full-width variant -- confirm against the
     # original file encoding.
     esc = [u'…', u'・', u'.', u',', u'、', u'。', u'!', u'?', u'!', u'?']
     words = mecabCaller.parse(text)

     # Accumulate the values of all known, non-punctuation words. The
     # previous code assigned instead of adding, so only the last matching
     # word ever counted.
     total = sum(
         self.freq[word]
         for word in words
         if word not in esc and word in self.freq
     )
     print(words)
     print('sum=' + str(total))

     if not words:
         # Nothing to average over; avoid dividing by zero.
         return False

     # Under Python 2 this floors for integer totals; the outcome of the
     # ">= 8" check is the same either way because the threshold is a
     # whole number.
     weightedsum = total / len(words)
     print('weightedsum=' + str(weightedsum))
     return weightedsum >= 8
def generate_unigram_model(favs):
    """Update the stored unigram frequency table from ``favs`` and return it.

    The current table is loaded from ``favs_model_unigram.dump`` through
    ``cpickler.frompickle``; when loading fails an empty table is used
    instead. The ``text`` attribute of every entry in every list of ``favs``
    is passed to ``mecabCaller.parse`` and each returned item that is not
    one of the punctuation marks in ``esc`` has its count incremented. The
    updated table is written back with ``cpickler.topickle``.

    Args:
        favs: iterable of iterables whose elements expose a ``text``
            attribute.

    Returns:
        The frequency table (a dict when started fresh) mapping each
        counted item to the number of times it has been seen.
    """
    try:
        freqfav = cpickler.frompickle(filename='favs_model_unigram.dump')
    except Exception:
        # Best effort: any load failure (presumably a missing dump on the
        # first run) starts from an empty table. KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        freqfav = {}

    # NOTE(review): u'!' and u'?' each appear twice; presumably one of each
    # pair was meant to be a full-width variant -- confirm.
    esc = (u'…', u'・', u'.', u',', u'、', u'。', u'!', u'?', u'!', u'?')
    for favlist in favs:
        for fav in favlist:
            for item in mecabCaller.parse(fav.text):
                # Punctuation is not counted. The previous code built a
                # list here and used it as a dict key, which raises
                # TypeError because lists are unhashable.
                if item in esc:
                    continue
                freqfav[item] = freqfav.get(item, 0) + 1
    cpickler.topickle(freqfav, filename='favs_model_unigram.dump')

    return freqfav