Beispiel #1
0
class Guesser(object):
    """Per-user, per-feed spam/ham classifier backed by a ``Bayes`` trainer.

    The trained state is stored in
    ``<config['bayes_dir']>/users/<user.id>/feed_<feed.id>.bayes``. If that
    file already exists it is loaded on construction, otherwise the trainer
    starts with two empty pools named ``'ham'`` and ``'spam'``.

    NOTE(review): ``Bayes``, ``my_tokenize``, ``meta``, ``model`` and ``log``
    are module-level names defined outside this class; ``Bayes`` looks like
    Reverend's ``thomas.Bayes`` -- confirm.
    """

    def __init__(self, feed, user, config):
        """Set up the trainer for ``user`` on ``feed``.

        :param feed: object with an ``id`` attribute; selects the stopwords
            and the trainer file.
        :param user: object with an ``id`` attribute; selects the per-user
            directory and is used later for classification lookups.
        :param config: mapping providing the ``'bayes_dir'`` base directory.
        """
        import os.path

        self.user = user
        self.filename = config['bayes_dir']
        self.filename += "/users/%s" % user.id
        # Make sure the per-user directory exists before anything is saved.
        if not os.path.exists(self.filename):
            os.makedirs(self.filename)
        self.filename += '/feed_%s.bayes' % str(feed.id)
        log.debug("filename:%s" % self.filename)

        stopword_rows = meta.Session\
                .query(model.Stopword)\
                .filter_by(feed_id=feed.id).all()
        # A real list (not a lazy ``map`` object): the tokenizer consults the
        # stopwords on every call, and a one-shot iterator would be empty
        # after the first use on Python 3.
        self.stopwords = [row.word for row in stopword_rows]

        self.trainer = self._new_trainer()
        if os.path.exists(self.filename):
            self.trainer.load(self.filename)
        else:
            self._add_default_pools()

    def _new_trainer(self):
        """Return a fresh ``Bayes`` whose tokenizer honours ``self.stopwords``."""
        trainer = Bayes()
        # The lambda reads ``self.stopwords`` at call time, so later changes
        # to the attribute are picked up by the tokenizer.
        trainer.getTokens = lambda x: my_tokenize(x, self.stopwords)
        return trainer

    def _add_default_pools(self):
        """Create the empty ``'ham'`` and ``'spam'`` pools on the trainer."""
        self.trainer.newPool('ham')
        self.trainer.newPool('spam')

    def save(self):
        """Write the trainer state to ``self.filename``."""
        self.trainer.save(self.filename)

    def clear(self):
        """Replace the trainer with a fresh one holding empty pools.

        Only the in-memory trainer is reset; nothing is written to
        ``self.filename`` until :meth:`save` is called.
        """
        self.trainer = self._new_trainer()
        self._add_default_pools()

    def is_spam(self, entry, use_classified=True):
        """Return ``True`` if ``entry`` is considered spam, else ``False``.

        :param entry: the entry to judge; must have an ``id`` attribute when
            ``use_classified`` is true.
        :param use_classified: when true, a stored ``model.Classification``
            for this user and entry takes precedence over the trainer.
        :raises ValueError: if a stored classification names a pool other
            than ``'spam'`` or ``'ham'``.
        """
        if use_classified:
            classy = meta.Session\
                    .query(model.Classification)\
                    .filter_by(user_id=self.user.id, entry_id=entry.id).first()
            if classy:
                if classy.pool == 'spam':
                    return True
                if classy.pool == 'ham':
                    return False
                # The original raised a plain string, which is not a valid
                # exception and surfaced as an unrelated TypeError.
                raise ValueError("bad pool: %r" % (classy.pool,))

        scores = self.guess(entry)
        spam = scores['spam']
        ham = scores['ham']

        # Only one pool produced a (truthy) score: that pool wins outright.
        if spam and not ham:
            return True
        if ham and not spam:
            return False

        # ``guess`` stores None for a pool without a score. Ordering None
        # against a number (or None) raises TypeError on Python 3, so those
        # cases are decided explicitly, keeping the Python 2 results where
        # None sorts below every number.
        if spam is None:
            return False
        if ham is None:
            return True
        return spam > ham

    def guess(self, entry):
        """Return the trainer's scores for ``entry`` as a dict.

        The result always contains the keys ``'spam'`` and ``'ham'``; a pool
        the trainer returned no score for is mapped to ``None``.
        """
        from rssmonster.controllers.bayes import __relevant__

        # Extract the text and run the classifier once, then reuse both
        # values for logging and for the result.
        relevant = __relevant__(entry)
        raw_guess = self.trainer.guess(relevant)

        log.debug("__relevant__(entry) %s" % relevant)
        log.debug("__relevant__(entry) %s" % raw_guess)
        log.debug('self.filename: %s' % self.filename)

        ret = dict(raw_guess)
        log.debug("ret: %s" % ret)
        ret.setdefault('spam', None)
        ret.setdefault('ham', None)

        return ret