Example #1
0
class Guesser(object):
    """Classify a project's sentences with a naive Bayes guesser.

    The guesser is trained once, at construction time, on every sentence the
    project reports as already classified.
    """

    def __init__(self, project):
        """Store *project*, create the ``Bayes`` instance and train it."""
        self.project = project
        self.bayes = Bayes()
        self.data = []
        self.best = []
        self._train()

    def _train(self):
        """Feed every already-classified sentence of the project to Bayes."""
        for sentence in self.project.classified():
            self.bayes.train(sentence.get_classification(), sentence.sentence)

    def guess(self):
        """Guess a classification for every sentence still to classify.

        Returns a list of ``{'sentence_id': ..., 'guesses': ...}`` dicts,
        which is also kept on ``self.data``.  The list is rebuilt on every
        call so that repeated calls do not accumulate duplicate entries.
        """
        collected = []
        for sentence in self.project.to_classify():
            collected.append({
                'sentence_id': sentence.id,
                'guesses': self.bayes.guess(sentence.sentence),
            })
        self.data = collected
        return self.data

    def best_matches(self):
        """Return the highest-scoring guess for each entry of ``self.data``.

        Each element of the returned list is a dict with the keys ``id``,
        ``guess`` and ``certainty``; ``guess`` and ``certainty`` are ``None``
        when an entry has no usable guesses.  Returns ``[]`` when ``guess()``
        has not produced any data yet.

        ``self.data`` is left untouched and ``self.best`` is rebuilt on each
        call, so calling this method repeatedly gives the same answer.
        """
        if not self.data:
            return []
        best = []
        for entry in self.data:
            try:
                # max() keeps the first of several equally-scored guesses,
                # exactly like a stable descending sort followed by [0].
                top = max(entry['guesses'], key=lambda pair: pair[1])
            except (TypeError, ValueError, IndexError):
                # Empty or malformed guess list: report "no guess".
                top = (None, None)
            best.append({
                'id': entry['sentence_id'],
                'guess': top[0],
                'certainty': top[1],
            })
        self.best = best
        return self.best
Example #2
0
def treinar():
    """Train a Bayes guesser from the category folders and save it.

    Every entry of ``./data`` (except ``.svn``) is taken as a category name.
    Each file found under ``CAMINHO_CATEGORIAS/<category>`` is read and used
    as one training text for that category.  The trained guesser is saved to
    ``conhecimento.bay``.  Failures during training or saving are reported on
    stdout instead of being raised.
    """
    print(""">>> Carregando categorias...""")
    # NOTE(review): categories are listed from './data' but their files are
    # read from CAMINHO_CATEGORIAS -- confirm both name the same directory.
    categorias = os.listdir('./data')
    if '.svn' in categorias:
        categorias.remove('.svn')

    print(">>> Instanciando treinador\n")
    guesser = Bayes()
    try:
        for categoria in categorias:
            print(">>> Treinando categoria %s" % categoria)
            pasta = "%s/%s" % (CAMINHO_CATEGORIAS, categoria)
            nomes = os.listdir(pasta)
            if '.svn' in nomes:
                nomes.remove('.svn')
            for nome in nomes:
                arquivo = open('%s/%s' % (pasta, nome), 'r')
                try:
                    texto = arquivo.read()
                finally:
                    # Always release the handle, even if the read fails.
                    arquivo.close()
                guesser.train(categoria, texto)
        print("\n>>> Salvando base de conhecimento...\n")
        guesser.save("conhecimento.bay")
        print("Voil?!\n")
    except Exception:
        # Best-effort run: report the failure but let KeyboardInterrupt and
        # SystemExit propagate.
        print("N?o foi poss?vel treinar a base")
Example #3
0
 def test_untrainedGuess(self):
     """
     The C{guess} method of a L{Bayes} instance with no training data returns
     an empty list.
     """
     untrained = Bayes()
     # assertEqual is the canonical spelling; assertEquals is a deprecated
     # alias.
     self.assertEqual(untrained.guess("hello, world"), [])
Example #4
0
 def __init__(self, non_spam_train_dir, spam_train_dir):
     """Remember both training directories and start with zeroed counters."""
     # Bookkeeping counters all start from zero.
     self.total_num_train_files = self.total_num_test_files = 0
     self.num_misclass = 0
     # Fresh, untrained classifier.
     self.naive_bayes_classifier = Bayes()
     self.spam_train_dir = spam_train_dir
     self.non_spam_train_dir = non_spam_train_dir
Example #5
0
 def __init__(self, parent, guesser=None, itemClass=None):
     """Build the status bar, the frame and its item rows inside *parent*.

     guesser   -- classifier to use; a fresh ``reverend.thomas.Bayes`` is
                  created when omitted.
     itemClass -- class stored as ``self.itemClass``; defaults to
                  ``TextItem``.
     """
     # The status bar is packed at the bottom before the frame itself.
     self.status = StatusBar(parent)
     self.status.pack(side=BOTTOM, fill=X)
     Frame.__init__(self, parent)
     self.pack(side=TOP, fill=BOTH)

     self.itemsPerPage = 20
     self.rows = [ItemRow() for _ in range(self.itemsPerPage)]
     self.items = []
     self.files = []
     self.cursor = 0
     self.dirty = False

     if guesser is not None:
         self.guesser = guesser
     else:
         # Imported lazily so the dependency is only needed for the default.
         from reverend.thomas import Bayes
         self.guesser = Bayes()

     if itemClass is not None:
         self.itemClass = itemClass
     else:
         self.itemClass = TextItem

     # Every row summary starts out with the placeholder text.
     for row in self.rows:
         row.summary.set('foo')
     self.initViews()
Example #6
0
 def _load_guesser(self):
     """Return a Bayes guesser loaded from 'commands.bays'.

     Returns None when the module-level ``Bayes`` name is None.  The new
     guesser and its attribute list are printed before loading.
     """
     if Bayes is not None:
         loaded = Bayes()
         print(loaded)
         print(dir(loaded))
         loaded.load('commands.bays')
         return loaded
     return None
Example #7
0
 def _load_guesser(self):
     """Return a Bayes guesser loaded from "commands.bays".

     Returns None when the module-level ``Bayes`` name is None.  The new
     guesser and its attribute list are passed to ``self.display`` before
     loading.
     """
     if Bayes is not None:
         loaded = Bayes()
         self.display(loaded)
         self.display(dir(loaded))
         loaded.load("commands.bays")
         return loaded
     return None
Example #8
0
def get_bayes(id=GLOBAL):
    """Return the cached Bayes classifier for *id*, creating it on first use.

    A new classifier uses the ``statustok`` tokenizer and is loaded from the
    file named by ``filename(id=id)`` when that file exists.
    """
    if id in guessers:
        return guessers[id]

    classifier = Bayes(tokenizer=statustok)
    path = filename(id=id)
    if os.path.exists(path):
        classifier.load(path)
    log.debug("Created classifier for '%s' at '%s'" % (id, path))
    guessers[id] = classifier
    return guessers[id]
Example #9
0
 def train(self, bucket, words):
     """
     Nominate a bucket to which the words apply, and train accordingly.

     Nothing happens when either argument is an empty string.  After a
     training pass the brain is saved to ``self.brain``.  Any failure while
     training or saving is reported on stdout instead of being raised.
     Always returns None.
     """
     if bucket == "" or words == "":
         return None
     try:
         Bayes.train(self, bucket, words)
         Bayes.save(self, self.brain)
     except Exception:
         # Best-effort: keep going, but let KeyboardInterrupt and SystemExit
         # propagate instead of swallowing them.
         print("Failed to learn")
Example #10
0
 def train(self,bucket,words):
   """
   Nominate a bucket to which the words apply, and train accordingly.

   Nothing happens when either argument is an empty string.  After a
   training pass the brain is saved to ``self.brain``.  Any failure while
   training or saving is reported on stdout instead of being raised.
   Always returns None.
   """
   if bucket == "" or words == "":
     return None
   try:
     Bayes.train(self, bucket, words)
     Bayes.save(self, self.brain)
   except Exception:
     # Best-effort: keep going, but let KeyboardInterrupt and SystemExit
     # propagate instead of swallowing them.
     print("Failed to learn")
Example #11
0
def check_junk(phrase):
    """Return the integer label of the best guess for *phrase*, or -1.

    A Bayes guesser is loaded from ``config/kikoo.bot`` on every call and the
    raw guess list is printed.  -1 is returned when there is no guess or when
    anything goes wrong (missing library, missing data file, label that is
    not an integer, ...).
    """
    try:
        from reverend.thomas import Bayes
        guesser = Bayes()
        guesser.load("config/kikoo.bot")
        result = guesser.guess(phrase)
        print(result)
        if result:
            return int(result[0][0])
        return -1
    except Exception:
        # Deliberately best-effort, but KeyboardInterrupt and SystemExit are
        # no longer swallowed as they were with a bare ``except``.
        return -1
Example #12
0
 def __init__(self, non_spam_train_dir, spam_train_dir):
     """Remember both training directories and start with zeroed counters."""
     # Bookkeeping counters all start from zero.
     self.total_num_train_files = self.total_num_test_files = 0
     self.num_misclass = 0
     # Fresh, untrained classifier.
     self.naive_bayes_classifier = Bayes()
     self.spam_train_dir = spam_train_dir
     self.non_spam_train_dir = non_spam_train_dir
Example #13
0
def get_db(private_path, username):
  """Load the user's spam database, creating an empty one if needed.

  The database lives at ``<private_path>/<username>/spam.bayes``.  Returns
  the tuple ``(guesser, path)``.
  """
  path = os.path.join(private_path, username, 'spam.bayes')
  guesser = Bayes()

  try:
    guesser.load(path)
  except IOError:
    # The database could not be read: start a new one on disk.
    print("Creating a new spam filter database")

    folder = os.path.dirname(path)
    if not os.path.isdir(folder):
      os.makedirs(folder)

    guesser.save(path)

  return guesser, path
 def untrained(self, cr, uid, ids, context=None):
     """
     Remove each record's description from its category's training data.

     For every id in *ids* whose record has a description, the training data
     stored on the category's group is loaded into a Bayes guesser, the
     description is untrained from the category, and the result is written
     back to the group.  The category's ``train_messages`` counter is
     decremented, the group's ``automate_test`` percentage is recomputed and
     the record's ``state_bayes`` is set to 'untrained'.  Records without a
     description are skipped.  Returns True.
     """
     group_obj = self.pool.get('crm.bayes.group')
     cat_obj = self.pool.get('crm.bayes.categories')
     bay_file = file_path + "crm_bayes.bay"
     for rec_id in ids:
         record = self.read(cr, uid, rec_id, ['category_id', 'description'])
         if not record['description']:
             continue
         category_id = record['category_id'][0]
         cat_rec = cat_obj.read(cr, uid, category_id, [])
         group_id = cat_rec['group_id'][0]
         guesser = Bayes()
         data = "".join([rec['train_data']
                         for rec in group_obj.browse(cr, uid, [group_id])
                         if rec['train_data']])
         if data:
             # Bayes can only load from a file, so round-trip through disk.
             myfile = open(bay_file, 'w')
             try:
                 myfile.write(data)
             finally:
                 myfile.close()
             guesser.load(bay_file)
         guesser.untrain(cat_rec['name'], record['description'])
         guesser.save(bay_file)
         myfile = open(bay_file, 'r')
         try:
             data = myfile.read()
         finally:
             # The original never closed this handle.
             myfile.close()
         group_obj.write(cr, uid, group_id, {'train_data': data})
         cat_obj.write(cr, uid, category_id,
                       {'train_messages': int(cat_rec['train_messages']) - 1})
         # Let the database driver bind the parameter instead of building the
         # SQL text by hand.
         cr.execute("select sum(train_messages) as tot_train,sum(guess_messages) as tot_guess from crm_bayes_categories where group_id=%s",
                    (group_id,))
         totals = cr.dictfetchall()[0]
         tot_guess = totals['tot_guess']
         if tot_guess:
             # sum() yields NULL/None when there is nothing to add up.
             tot_train = totals['tot_train'] or 0
             percantage = float(tot_guess * 100) / float(tot_guess + tot_train)
         else:
             percantage = 0.0
         group_obj.write(cr, uid, group_id,
                         {'train_data': data, 'automate_test': percantage})
         self.write(cr, uid, rec_id, {'state_bayes': 'untrained'})
     return True
Example #15
0
    def getLanguageGuesses(self, stopWords, corpus, languages):
        """Guess the language of *corpus* from per-language stop words.

        A Bayes guesser is trained with the stop-word string of every entry
        of *languages* except 'automatic'.  Both the training strings and the
        corpus are encoded to us-ascii with 'replace' error handling.
        Returns the guesser's guesses for the corpus.
        """
        from reverend.thomas import Bayes

        encoding = 'us-ascii'
        classifier = Bayes()

        # one training pass per selectable language
        for language in languages:
            if language == 'automatic':
                continue
            words = stopWords.getStopWordString(language)
            classifier.train(language, words.encode(encoding, 'replace'))

        return classifier.guess(corpus.encode(encoding, 'replace'))
Example #16
0
    def __init__(self, name):
        """Load the brain stored in ``<name>.bay``, creating it if needed.

        When loading fails for any reason, the (empty) brain is saved to that
        file and loaded back from it.
        """
        Bayes.__init__(self)

        self.brain = name + '.bay'

        try:
            Bayes.load(self, self.brain)
        except Exception:
            # Any load failure means "start a fresh brain"; KeyboardInterrupt
            # and SystemExit still propagate.
            print("[Alert] Failed to load bayesian brain - %s, creating it now" % self.brain)
            Bayes.save(self, self.brain)
            Bayes.load(self, self.brain)
        else:
            print("[Bayes] Brain loaded ok")
Example #17
0
  def __init__(self,name):
    """Load the brain stored in ``<name>.bay``, creating it if needed.

    When loading fails for any reason, the (empty) brain is saved to that
    file and loaded back from it.
    """
    Bayes.__init__(self)

    self.brain = name + '.bay'

    try:
      Bayes.load(self, self.brain)
    except Exception:
      # Any load failure means "start a fresh brain"; KeyboardInterrupt and
      # SystemExit still propagate.
      print("[Alert] Failed to load bayesian brain - %s, creating it now" % self.brain)
      Bayes.save(self, self.brain)
      Bayes.load(self, self.brain)
    else:
      print("[Bayes] Brain loaded ok")
 def guess_message(self, cr, uid, ids, context=None):
     """
     Guess categories for a case description.

     The training data of every active ``crm.bayes.group`` is concatenated,
     loaded into a Bayes guesser and used to classify the description.
     Returns a list of ``(category, percentage)`` tuples; the list is empty
     when there is no description or no training data.
     """
     # NOTE(review): ``.description`` is read straight off the browse()
     # result, so *ids* is presumably a single id -- confirm with callers.
     cases = self.browse(cr, uid, ids)
     result_lang = []
     if cases.description:
         guesser = Bayes()
         group_obj = self.pool.get('crm.bayes.group')
         active_ids = group_obj.search(cr, uid, [('active', '=', True)])
         data = "".join([rec['train_data']
                         for rec in group_obj.browse(cr, uid, active_ids)
                         if rec['train_data']])
         if data:
             # NOTE(review): fixed, shared path in /tmp; concurrent calls can
             # overwrite each other's file -- consider the tempfile module.
             myfile = open("/tmp/crm_bayes.bay", 'w')
             try:
                 myfile.write(data)
             finally:
                 myfile.close()
             guesser.load('/tmp/crm_bayes.bay')
             result_lang = guesser.guess(cases.description)
     # Scores are scaled from fractions to percentages.
     return [(le[0], le[1] * 100) for le in result_lang]
Example #19
0
    def getLanguageGuesses(self, stopWords, corpus, languages):
        """Guess the language of *corpus* from per-language stop words.

        A Bayes guesser is trained with the stop-word string of every entry
        of *languages* except 'automatic'.  Both the training strings and the
        corpus are encoded to us-ascii with 'replace' error handling.
        Returns the guesser's guesses for the corpus.
        """
        from reverend.thomas import Bayes

        encoding = 'us-ascii'
        classifier = Bayes()

        # one training pass per selectable language
        for language in languages:
            if language == 'automatic':
                continue
            words = stopWords.getStopWordString(language)
            classifier.train(language, words.encode(encoding, 'replace'))

        return classifier.guess(corpus.encode(encoding, 'replace'))
Example #20
0
    def action_guess(self, cr, uid, ids, context=None):
        """Guess the category of a test message and open the train wizard.

        The training data of the groups listed in ``context['active_ids']``
        is loaded into a Bayes guesser, which classifies the ``name`` of the
        first record of *ids*.  The module-level ``result`` list is emptied
        and refilled with one entry per ``crm.bayes.categories`` record: the
        matching guess when there is one, otherwise ``(name, 0)``.

        Returns the act_window description that opens the
        ``crm.bayes.test.train`` form.
        """
        if context is None:
            # The default used to crash on the first context lookup.
            context = {}
        guesser = Bayes()
        group_obj = self.pool.get('crm.bayes.group')
        # Empty the shared list in place; popping from the front one element
        # at a time was quadratic.
        del result[:]
        groups = group_obj.browse(cr, uid, context.get('active_ids', []))
        data = "".join([rec['train_data'] for rec in groups
                        if rec['train_data']])
        result_lang = []
        if data:
            # NOTE(review): fixed, shared path in /tmp; concurrent calls can
            # overwrite each other's file -- consider the tempfile module.
            myfile = open("/tmp/crm_bayes.bay", 'w')
            try:
                myfile.write(data)
            finally:
                myfile.close()
            guesser.load('/tmp/crm_bayes.bay')
            message = self.read(cr, uid, ids, ['name'])
            result_lang = guesser.guess(message[0]['name'])

        cat_obj = self.pool.get('crm.bayes.categories')
        cat_ids = cat_obj.search(cr, uid, [])
        for category in cat_obj.read(cr, uid, cat_ids, ['name']):
            for guess in result_lang:
                if guess[0] == category['name']:
                    result.append(guess)
                    break
            else:
                # No guess mentions this category: report a zero score.
                result.append((category['name'], 0))
        context_new = {
            'from_wiz': True,
            'group_id': context.get('active_id', False),
        }
        return {
            'context': context_new,
            'view_type': 'form',
            "view_mode": 'form',
            'res_model': 'crm.bayes.test.train',
            'type': 'ir.actions.act_window',
            'target': 'new',
        }
 def action_guess(self, cr, uid, ids, context=None):
     """
     Guess the category of a test message and open the train wizard.

     The training data of the groups listed in ``context['active_ids']`` is
     loaded into a Bayes guesser, which classifies the ``name`` of the first
     record of *ids*.  The module-level ``result`` list is emptied and
     refilled with one entry per ``crm.bayes.categories`` record: the
     matching guess when there is one, otherwise ``(name, 0)``.

     Returns the act_window description that opens the
     ``crm.bayes.test.train`` form.
     """
     if context is None:
         # The default used to crash on the first context lookup.
         context = {}
     guesser = Bayes()
     group_obj = self.pool.get('crm.bayes.group')
     # Empty the shared list in place; popping from the front one element at
     # a time was quadratic.
     del result[:]
     groups = group_obj.browse(cr, uid, context.get('active_ids', []))
     data = "".join([rec['train_data'] for rec in groups
                     if rec['train_data']])
     result_lang = []
     if data:
         # NOTE(review): fixed, shared path in /tmp; concurrent calls can
         # overwrite each other's file -- consider the tempfile module.
         myfile = open("/tmp/crm_bayes.bay", 'w')
         try:
             myfile.write(data)
         finally:
             myfile.close()
         guesser.load('/tmp/crm_bayes.bay')
         message = self.read(cr, uid, ids, ['name'])
         result_lang = guesser.guess(message[0]['name'])

     cat_obj = self.pool.get('crm.bayes.categories')
     cat_ids = cat_obj.search(cr, uid, [])
     for category in cat_obj.read(cr, uid, cat_ids, ['name']):
         for guess in result_lang:
             if guess[0] == category['name']:
                 result.append(guess)
                 break
         else:
             # No guess mentions this category: report a zero score.
             result.append((category['name'], 0))
     context_new = {
         'from_wiz': True,
         'group_id': context.get('active_id', False),
     }
     return {
         'context': context_new,
         'view_type': 'form',
         "view_mode": 'form',
         'res_model': 'crm.bayes.test.train',
         'type': 'ir.actions.act_window',
         'target': 'new',
     }
Example #22
0
def retrain(request):
    """
    Rebuild the requesting user's Bayes brain from the tags on their posts.

    A fresh ``Bayes`` instance is trained once per (post, tag) pair with the
    post's title, author and summary.  The result is saved to ``<user>.db``
    and stored base64-encoded in ``data`` on the user's ``Brain`` row.  The
    index template is then rendered with a message giving the number of
    training calls made.
    """
    # Retrain your brain
    user = User.objects.get(user=request.user)
    posts = Post.objects.filter(user=user)
    bayes = Brain.objects.get(user=user)
    brain = Bayes()
    # The previously stored brain is deliberately not loaded: training starts
    # from an empty classifier.
    #brain.loads(base64.decodestring(bayes.data))

    tagcount = 0
    # retrain the brain based on existing tags
    for post in posts:
        # Progress trace on stdout: the post title, then each of its tags on
        # the same line.
        print post.title, "::",
        for tag in post.tags.all():
            text = "%s %s %s" % (post.title, post.author, post.summary)
            # NOTE(review): the tag object itself is used as the pool name
            # here, while mark() in this file trains with ``t1.name`` --
            # confirm which one is intended.
            brain.train(tag, text)
            tagcount += 1
            print tag,
        print
    brain.save('%s.db' % user)
    bayes.data = base64.encodestring(brain.saves())
    bayes.save()

    message = 'Found %s tags' % tagcount
    params = {'Messages': [message,]}
    return response(request, 'mainapp/index.html', params)
Example #23
0
def main():
    """
    Build aggregator report pages with Bayes rating links.

    Loads the saved Bayes data (if any), fetches new entries for the
    subscribed feeds, scores them with the guesser, writes the report page
    and finally saves the guesser's state.
    """
    # Create a new Bayes guesser
    guesser = Bayes()

    # Attempt to load Bayes data, ignoring IOError on first run.
    try:
        guesser.load(BAYES_DATA_FN)
    except IOError:
        pass

    # Open up the databases, load the subscriptions, get new entries.
    feed_db, entry_db = openDBs(FEED_DB_FN, ENTRY_DB_FN)
    feeds_file = open(FEEDS_FN, "r")
    try:
        feeds = [line.strip() for line in feeds_file]
    finally:
        # The subscription list used to be left open.
        feeds_file.close()
    entries = getNewFeedEntries(feeds, feed_db, entry_db)

    # Score the new entries using the Bayesian guesser
    entries = scoreEntries(guesser, entries)

    # Write out the current run's aggregator report.
    out_fn = time.strftime(HTML_FN)
    writeAggregatorPage(entries, out_fn, DATE_HDR_TMPL, FEED_HDR_TMPL,
                        ENTRY_TMPL, PAGE_TMPL)

    # Close the databases and save the current guesser's state to disk.
    closeDBs(feed_db, entry_db)
    guesser.save(BAYES_DATA_FN)
Example #24
0
def classificar():
    """Classify every file found in the CAMINHO_TOCLASSIFY directory.

    The knowledge base is loaded from CAMINHO_CONHECIMENTO; the process exits
    with status 1 when it cannot be read.  Each file (``.svn`` excluded) is
    read and its guesses are printed.  Failures while classifying are
    reported on stdout instead of being raised.
    """
    print(">>> Instanciando classificador")
    guesser = Bayes()
    print(">>> Carregando base de conhecimento")
    try:
        guesser.load(CAMINHO_CONHECIMENTO)
    except IOError:
        print("Erro. Não foi possível carregar a base. Certifique-se de que existe o arquivo %s." % CAMINHO_CONHECIMENTO)
        sys.exit(1)

    try:
        arquivos = os.listdir("%s/" % CAMINHO_TOCLASSIFY)
        if '.svn' in arquivos:
            arquivos.remove('.svn')
        for nome_arquivo in arquivos:
            arquivo = open('%s/%s' % (CAMINHO_TOCLASSIFY, nome_arquivo), 'r')
            try:
                texto = arquivo.read()
            finally:
                # Always release the handle, even if the read fails.
                arquivo.close()
            guess = guesser.guess(texto)
            print("\n>>> Arquivo %s:\n %s\n" % (nome_arquivo, guess))
    except Exception:
        # Best-effort run: report the failure but let KeyboardInterrupt and
        # SystemExit propagate.
        print("Erro. Não foi possível classificar.")
Example #25
0
 def guess_message(self, cr, uid, ids, context=None):
     """
     Guess categories for a case description.

     The training data of every active ``crm.bayes.group`` is concatenated,
     loaded into a Bayes guesser and used to classify the description.
     Returns a list of ``(category, percentage)`` tuples; the list is empty
     when there is no description or no training data.
     """
     # NOTE(review): ``.description`` is read straight off the browse()
     # result, so *ids* is presumably a single id -- confirm with callers.
     cases = self.browse(cr, uid, ids)
     result_lang = []
     if cases.description:
         guesser = Bayes()
         group_obj = self.pool.get('crm.bayes.group')
         active_ids = group_obj.search(cr, uid, [('active', '=', True)])
         data = "".join([rec['train_data']
                         for rec in group_obj.browse(cr, uid, active_ids)
                         if rec['train_data']])
         if data:
             # NOTE(review): fixed, shared path in /tmp; concurrent calls can
             # overwrite each other's file -- consider the tempfile module.
             myfile = open("/tmp/crm_bayes.bay", 'w')
             try:
                 myfile.write(data)
             finally:
                 myfile.close()
             guesser.load('/tmp/crm_bayes.bay')
             result_lang = guesser.guess(cases.description)
     # Scores are scaled from fractions to percentages.
     return [(le[0], le[1] * 100) for le in result_lang]
 def action_train(self, cr, uid, ids, context=None):
     """
     Train the Bayes data of a category with each test message in *ids*.

     When the module-level ``result`` list holds guesses, the chosen
     category's ``train_messages`` counter is incremented; if no category was
     chosen and the best guess has a positive score, that guessed category is
     used instead, its ``guess_messages`` counter is incremented and it is
     written to the records.  The message text is then trained into the
     category's group data and the group's ``automate_test`` percentage is
     recomputed.

     Raises osv.except_osv when no category is available for a record.
     Returns the act_window description that opens the
     ``crm.bayes.train.message`` form.
     """
     cat_obj = self.pool.get('crm.bayes.categories')
     group_obj = self.pool.get('crm.bayes.group')
     message_obj = self.pool.get('crm.bayes.test.guess')
     bay_file = file_path + "crm_bayes.bay"

     for rec_id in ids:
         cat_id = self.read(cr, uid, rec_id, ['category_id', 'name'])
         cat_id = cat_id[0]['category_id']
         if result:
             max_list = max(result, key=lambda k: k[1])
             if cat_id:
                 cat_guess_msg = cat_obj.read(cr, uid, cat_id, ['train_messages'])
                 cat_obj.write(cr, uid, cat_id, {'train_messages': cat_guess_msg['train_messages'] + 1})
             if max_list[1] > 0 and not cat_id:
                 # Fall back to the best-scoring guessed category.
                 cat_id = cat_obj.search(cr, uid, [('name', '=', max_list[0])])[0]
                 cat_guess_msg = cat_obj.read(cr, uid, cat_id, ['guess_messages'])
                 cat_obj.write(cr, uid, cat_id, {'guess_messages': cat_guess_msg['guess_messages'] + 1})
                 self.write(cr, uid, ids, {'category_id': cat_id})
         if not cat_id:
             raise osv.except_osv(_('Error !'),_('Please Select Category! '))

         cat_rec = cat_obj.read(cr, uid, cat_id, [])
         group_id = cat_rec['group_id'][0]
         guesser = Bayes()
         data = "".join([rec['train_data']
                         for rec in group_obj.browse(cr, uid, [group_id])
                         if rec['train_data']])
         if data:
             # Bayes can only load from a file, so round-trip through disk.
             myfile = open(bay_file, 'w')
             try:
                 myfile.write(data)
             finally:
                 myfile.close()
             guesser.load(bay_file)

         guesser.train(cat_rec['name'], message_obj.read(cr, uid, rec_id)[0]['name'])
         guesser.save(bay_file)
         myfile = open(bay_file, 'r')
         try:
             data = myfile.read()
         finally:
             # The original never closed this handle.
             myfile.close()
         # Let the database driver bind the parameter instead of building the
         # SQL text by hand.
         cr.execute("select sum(train_messages) as tot_train,sum(guess_messages) as tot_guess from crm_bayes_categories where group_id=%s",
                    (group_id,))
         totals = cr.dictfetchall()[0]
         # sum() yields NULL/None when there is nothing to add up.
         tot_guess = totals['tot_guess'] or 0
         tot_train = totals['tot_train'] or 0
         if tot_guess + tot_train:
             percantage = float(tot_guess * 100) / float(tot_guess + tot_train)
         else:
             # Avoid dividing by zero when both counters are empty.
             percantage = 0.0
         group_obj.write(cr, uid, group_id, {'train_data': data, 'automate_test': percantage})
     return {
         'view_type': 'form',
         "view_mode": 'form',
         'res_model': 'crm.bayes.train.message',
         'type': 'ir.actions.act_window',
         'target': 'new',
      }
Example #27
0
def main():
    """
    Perform a test run of the FeedFilter using defaults.

    Loads the saved Bayes data (if any), fetches new entries for the
    subscribed feeds and writes the filtered feed out as RSS and as Atom.
    """
    # Create a new Bayes guesser, attempt to load data; a missing data file
    # (first run) is not an error.
    guesser = Bayes()
    try:
        guesser.load(BAYES_DATA_FN)
    except IOError:
        pass

    # Open up the databases, load the subscriptions, get new entries.
    feed_db, entry_db = openDBs(FEED_DB_FN, ENTRY_DB_FN)
    feeds_file = open(FEEDS_FN, "r")
    try:
        feeds = [line.strip() for line in feeds_file]
    finally:
        feeds_file.close()
    entries = getNewFeedEntries(feeds, feed_db, entry_db)

    # Build the feed filter.
    f = BayesFilter(guesser, entries)
    f.FEED_META['feed.title'] = FEED_TITLE
    f.FEED_META['feed.tagline'] = FEED_TAGLINE

    # Output the feed as both RSS and Atom, closing each file once written.
    for flavor, render in (('rss', f.scrape_rss), ('atom', f.scrape_atom)):
        out = open(FEED_NAME_FN % flavor, 'w')
        try:
            out.write(render())
        finally:
            out.close()

    # Close the databases (the guesser is not modified, so nothing to save).
    closeDBs(feed_db, entry_db)
Example #28
0
    def trained(self, cr, uid, ids, context=None):
        """Train each record's description into its category's Bayes data.

        For every id in *ids*, the training data stored on the category's
        group is loaded into a Bayes guesser, the description is trained into
        the category, and the result is written back to the group.  The
        category's ``train_messages`` counter is incremented, the group's
        ``automate_test`` percentage is recomputed and the record's
        ``state_bayes`` is set to 'trained'.

        Raises osv.except_osv when a record has no description or no
        category.  Returns True.
        """
        group_obj = self.pool.get('crm.bayes.group')
        cat_obj = self.pool.get('crm.bayes.categories')
        bay_file = file_path + "crm_bayes.bay"
        for rec_id in ids:
            record = self.read(cr, uid, rec_id, ['category_id', 'description'])
            if not record['description']:
                raise osv.except_osv(_('Error!'), _("Description Not Define!"))
            if not record['category_id']:
                raise osv.except_osv(_('Error!'),
                                     _("Statistics Category Not Define!"))
            category_id = record['category_id'][0]
            cat_rec = cat_obj.read(cr, uid, category_id, [])
            group_id = cat_rec['group_id'][0]
            guesser = Bayes()
            data = "".join([rec['train_data']
                            for rec in group_obj.browse(cr, uid, [group_id])
                            if rec['train_data']])
            if data:
                # Bayes can only load from a file, so round-trip via disk.
                myfile = open(bay_file, 'w')
                try:
                    myfile.write(data)
                finally:
                    myfile.close()
                guesser.load(bay_file)
            guesser.train(cat_rec['name'], record['description'])
            guesser.save(bay_file)
            myfile = open(bay_file, 'r')
            try:
                data = myfile.read()
            finally:
                # The original never closed this handle.
                myfile.close()
            cat_obj.write(
                cr, uid, category_id,
                {'train_messages': int(cat_rec['train_messages']) + 1})
            # Let the database driver bind the parameter instead of building
            # the SQL text by hand.
            cr.execute(
                "select sum(train_messages) as tot_train,sum(guess_messages) as tot_guess from crm_bayes_categories where group_id=%s",
                (group_id,))
            totals = cr.dictfetchall()[0]
            # sum() yields NULL/None when there is nothing to add up.
            tot_guess = totals['tot_guess'] or 0
            tot_train = totals['tot_train'] or 0
            if tot_guess + tot_train:
                percantage = (float(tot_guess * 100) /
                              float(tot_guess + tot_train))
            else:
                # Avoid dividing by zero when both counters are empty.
                percantage = 0.0
            group_obj.write(cr, uid, group_id, {
                'train_data': data,
                'automate_test': percantage
            })

            self.write(cr, uid, rec_id, {'state_bayes': 'trained'})
        return True
def main():
    """
    Perform a test run of the FeedFilter using defaults.

    Loads the saved Bayes data (if any), fetches new entries for the
    subscribed feeds and writes the filtered feed out as RSS and as Atom.
    """
    # Create a new Bayes guesser, attempt to load data; a missing data file
    # (first run) is not an error.
    guesser = Bayes()
    try:
        guesser.load(BAYES_DATA_FN)
    except IOError:
        pass

    # Open up the databases, load the subscriptions, get new entries.
    feed_db, entry_db = openDBs(FEED_DB_FN, ENTRY_DB_FN)
    feeds_file = open(FEEDS_FN, "r")
    try:
        feeds = [line.strip() for line in feeds_file]
    finally:
        feeds_file.close()
    entries = getNewFeedEntries(feeds, feed_db, entry_db)

    # Build the feed filter.
    f = BayesFilter(guesser, entries)
    f.FEED_META['feed.title'] = FEED_TITLE
    f.FEED_META['feed.tagline'] = FEED_TAGLINE

    # Output the feed as both RSS and Atom, closing each file once written.
    for flavor, render in (('rss', f.scrape_rss), ('atom', f.scrape_atom)):
        out = open(FEED_NAME_FN % flavor, 'w')
        try:
            out.write(render())
        finally:
            out.close()

    # Close the databases (the guesser is not modified, so nothing to save).
    closeDBs(feed_db, entry_db)
Example #30
0
    def getCategoryGuesses(self, corpus1, corpus2, corpus3):
        """Classify *corpus3* against two reference texts.

        *corpus1* is trained as 'first reference text' and *corpus2* as
        'second reference text'.  Returns the guesser's guesses for
        *corpus3*.
        """
        from reverend.thomas import Bayes

        classifier = Bayes()

        # one training pass per reference corpus
        references = (
            ('first reference text', corpus1),
            ('second reference text', corpus2),
        )
        for label, reference in references:
            classifier.train(label, reference)

        # score the third corpus against both references
        return classifier.guess(corpus3)
Example #31
0
def mark(request, flag):
    """Mark posts as read/unread, or toggle a tag on them and train the brain.

    The posts are those of the ``feed`` query parameter when it is given,
    otherwise the single post named by the ``post`` parameter.

    *flag* is either 'read' / 'unread', which updates the posts' ``read``
    field, or the id of a Tag.  For a tag, a single post that already carries
    it gets the tag removed and untrained; in every other case the tag is
    added and trained.  The user's brain is stored back afterwards.

    Redirects to the category, feed or tag listing named in the query string,
    falling back to '/'.
    """
    post_id = request.GET.get('post', None)
    feed = request.GET.get('feed', None)
    category = request.GET.get('category')
    tag = request.GET.get('tag') or None

    # filter() returns a (possibly empty) queryset and never raises
    # DoesNotExist, so the former try/except around it was dead code.
    if feed:
        posts = Post.objects.filter(feed=feed)
    else:
        posts = Post.objects.filter(id=post_id)

    bayes = Brain.objects.get(user=request.user)  # login required
    brain = Bayes()
    brain.loads(base64.decodestring(bayes.data))

    if flag in ('read', 'unread'):
        posts.update(read=(flag == 'read'))
    else:
        # Looked up once, on the first post, instead of once per post; with
        # no posts the tag is never queried, as before.
        t1 = None
        for post in posts:
            if t1 is None:
                t1 = Tag.objects.get(id=flag)
            text = "%s %s %s" % (post.title, post.author, post.summary)
            # Checking ``feed`` first skips the tags query for feed-wide
            # marking.
            if not feed and t1 in post.tags.all():
                post.tags.remove(t1)
                brain.untrain(t1.name, text)
            else:
                post.tags.add(t1)
                brain.train(t1.name, text)
            post.save()

    bayes.data = base64.encodestring(brain.saves())
    bayes.save()

    if category:
        return HttpResponseRedirect('/?category=%s' % category)
    elif feed:
        return HttpResponseRedirect('/?feed=%s' % feed)
    elif tag:
        return HttpResponseRedirect('/?tag=%s' % tag)
    else:
        return HttpResponseRedirect('/')
Example #32
0
    def getCategoryGuesses(self, corpus1, corpus2, corpus3):
        """Classify *corpus3* against two reference texts.

        *corpus1* is trained as 'first reference text' and *corpus2* as
        'second reference text'.  Returns the guesser's guesses for
        *corpus3*.
        """
        from reverend.thomas import Bayes

        classifier = Bayes()

        # one training pass per reference corpus
        references = (
            ('first reference text', corpus1),
            ('second reference text', corpus2),
        )
        for label, reference in references:
            classifier.train(label, reference)

        # score the third corpus against both references
        return classifier.guess(corpus3)
Example #33
0
def get_db(private_path, username):
    """Load the user's spam database, creating an empty one if needed.

    The database lives at ``<private_path>/<username>/spam.bayes``.  Returns
    the tuple ``(guesser, path)``.
    """
    path = os.path.join(private_path, username, 'spam.bayes')
    guesser = Bayes()

    try:
        guesser.load(path)
    except IOError:
        # The database could not be read: start a new one on disk.
        print("Creating a new spam filter database")

        folder = os.path.dirname(path)
        if not os.path.isdir(folder):
            os.makedirs(folder)

        guesser.save(path)

    return guesser, path
Example #34
0
    def __init__(self, feed, user, config):
        """Set up the per-user, per-feed Bayes trainer.

        The trainer's data file is
        ``<config['bayes_dir']>/users/<user.id>/feed_<feed.id>.bayes``; the
        user directory is created when missing.  The feed's stop words are
        read from the database and handed to the tokenizer.  An existing data
        file is loaded, otherwise the 'ham' and 'spam' pools are created.
        """
        import os.path

        self.user = user

        user_dir = config['bayes_dir'] + ("/users/%s" % user.id)
        if not os.path.exists(user_dir):
            os.makedirs(user_dir)
        self.filename = user_dir + ('/feed_%s.bayes' % str(feed.id))
        log.debug("filename:%s" % self.filename)

        rows = meta.Session.query(model.Stopword).filter_by(feed_id=feed.id).all()
        self.stopwords = [row.word for row in rows]

        self.trainer = Bayes()
        # Tokenize with the feed's stop words taken into account.
        self.trainer.getTokens = lambda x: my_tokenize(x, self.stopwords)
        if not os.path.exists(self.filename):
            self.trainer.newPool('ham')
            self.trainer.newPool('spam')
        else:
            self.trainer.load(self.filename)
Example #35
0
def read(request, id):
    try:
        post = Post.objects.get(id=id)
        post.read = True
        post.save()

        try:
            bayes = Brain.objects.get(user=request.user) #login required
            brain = Bayes()
            brain.loads(base64.decodestring(bayes.data))
            text = post.title + ' ' + post.author + post.summary
            brain.train('Interesting', text)        
            bayes.data = base64.encodestring(brain.saves())
            bayes.save()
        except Exception, e:
            print "Couldn't train %s because %s" % (post.title, e)

        return HttpResponseRedirect(post.link)
Example #36
0
====== RESTART: /Users/raymond/Dropbox/Public/army2/decorator_school.py ======
>>> 
>>> y = big_func(10)
Doing hard work
INFO:root:Called big_func() with (10,) giving 11 in 1.074376 seconds
>>> y = big_func(20)
Doing hard work
INFO:root:Called big_func() with (20,) giving 21 in 1.100503 seconds
>>> show_cache(big_func)
{10: 11, 20: 21}
SyntaxError: invalid syntax
>>> 

>>> 
>>> from reverend.thomas import Bayes
>>> gender = Bayes()
>>> gender.train('male', 'bill hank chris mark martin pat adam hank chris zack sean')
>>> gender.train('female', 'mindy shelly pat mary daisy amber chris pat becky sue')
>>> gender.guess('hank')
[('male', 0.9999)]
>>> gender.guess('mindy')
[('female', 0.9999)]
>>> gender.guess('pat')
[('female', 0.6451612903225806), ('male', 0.35483870967741926)]
>>> gender.guess('chris')
[('male', 0.6875000000000001), ('female', 0.3125)]
>>> gender.train('male', 'red red orange yellow red orange blue black brown blue red yellow')
>>> gender.train('female', 'pink red green green blue blue chartreuse green blue yellow orange blue green')
>>> gender.guess('red')
[('male', 0.8), ('female', 0.19999999999999996)]
>>> gender.guess('pink')
Example #37
0
def brainit():
    """Return the base64-encoded serialization of a new, untrained Bayes."""
    return base64.encodestring(Bayes().saves())
Example #38
0
)



# Load the negative and positive polarity corpora, one example per line.
_neg_handle = open(BASE_DIR+"/data/rt-polarity.neg")
try:
    neg_file = _neg_handle.read()
finally:
    _neg_handle.close()
_pos_handle = open(BASE_DIR+"/data/rt-polarity.pos")
try:
    pos_file = _pos_handle.read()
finally:
    _pos_handle.close()
neg_tweets_list = neg_file.split('\n')
pos_tweets_list = pos_file.split('\n')

# The first three quarters of each corpus are for training, the rest for
# testing.
neg_cutoff = int(len(neg_tweets_list) * 3 / 4)
pos_cutoff = int(len(pos_tweets_list) * 3 / 4)

neg_train = neg_tweets_list[:neg_cutoff]
# Was sliced with neg_cutoff, which made the positive train and test sets
# overlap or leave a gap whenever the two corpora differ in length.
pos_train = pos_tweets_list[:pos_cutoff]

neg_test = neg_tweets_list[neg_cutoff:]
pos_test = pos_tweets_list[pos_cutoff:]
tweet_data = {
    'neg_train': neg_train,
    'pos_train': pos_train,
    'neg_test': neg_test,
    'pos_test': pos_test,
}

# Load the saved classifiers, each with its own tokenizer.
bestwords = get_best_words(pos_train, neg_train)
single_classifier = Bayes()
single_classifier.load(fname=BASE_DIR+"/data/rt_polarity_classifiers/single_classifier.dat")
non_stop_classifier = Bayes(tokenizer=non_stop_tokenizer())
non_stop_classifier.load(fname=BASE_DIR+"/data/rt_polarity_classifiers/single_stop_classifier.dat")
best_classifier = Bayes(tokenizer=best_tokenizer(best_words=bestwords))
best_classifier.load(fname=BASE_DIR+"/data/rt_polarity_classifiers/single_best_classifier.dat")
bigram_best_classifier = Bayes(tokenizer=best_bigram_tokenizer(best_words=bestwords))
bigram_best_classifier.load(fname=BASE_DIR+"/data/rt_polarity_classifiers/single_bi_classifier.dat")
Example #39
0
 def __init__(self):
     """Create the instance with a new, untrained Bayes guesser."""
     fresh_guesser = Bayes()
     self.guesser = fresh_guesser
Example #40
0
class BayesianClassifier:
    """Sentiment classifier built on a ``Bayes`` guesser.

    ``train`` feeds the guesser caller-supplied examples followed by a fixed
    list of seed phrases; ``classify`` turns the guesser's answer for a
    sentence into one of the module's sentiment labels.
    """

    # Re-export the module-level sentiment labels on the class.
    POSITIVE = POSITIVE
    NEGATIVE = NEGATIVE
    NEUTRAL = NEUTRAL

    # Minimum gap between the first two guesses' scores for ``classify`` to
    # return the first guess instead of NEUTRAL.
    THRESHHOLD = 0.1
    guesser = None

    def __init__(self):
        """Create the underlying ``Bayes`` guesser."""
        self.guesser = Bayes()

    def train(self, example_tweets):
        """Train on ``example_tweets``, then on the built-in seed phrases.

        Each example must expose ``sentiment`` and ``text`` attributes. The
        seed phrases are trained in the order listed; repeated entries are
        trained once per occurrence.
        """
        # NOTE(review): "fail" appears in both lists and "Thanks" is listed
        # as negative; kept exactly as found -- confirm whether intentional.
        positive_phrases = (
            "cool", "Woo", "quite amazing", "thks", "looking forward to",
            "damn good", "frickin ruled", "frickin rules", "Way to go", "cute",
            "comeback", "not suck", "prop", "kinda impressed", "props",
            "come on", "congratulation", "gtd", "proud", "thanks",
            "can help", "thanks!", "pumped", "integrate", "really like",
            "loves it", "yay", "amazing", "epic flail", "flail",
            "good luck", "fail", "life saver", "piece of cake", "good thing",
            "hawt", "hawtness", "highly positive", "my hero", "yummy",
            "awesome", "congrats", "would recommend", "intellectual vigor", "really neat",
            "yay", "ftw", "I want", "best looking", "imrpessive",
            "positive", "thx", "thanks", "thank you", "endorse",
            "clearly superior", "superior", "really love", "woot", "w00t",
            "super", "wonderful", "leaning towards", "rally", "incredible",
            "the best", "is the best", "strong", "would love", "rally",
            "very quickly", "very cool", "absolutely love", "very exceptional", "so proud",
            "funny", "recommend", "so proud", "so great", "so cool",
            "cool", "wowsers", "plus", "liked it", "make a difference",
            "moves me", "inspired", "OK", "love it", "LOL",
            ":)", ";)", ":-)", ";-)", ":D",
            ";]", ":]", ":p", ";p", "voting for",
            "great", "agreeable", "amused", "brave", "calm",
            "charming", "cheerful", "comfortable", "cooperative", "courageous",
            "delightful", "determined", "eager", "elated", "enchanting",
            "encouraging", "energetic", "enthusiastic", "excited", "exuberant",
            "excellent", "I like", "fine", "fair", "faithful",
            "fantastic", "fine", "friendly", "fun ", "funny",
            "gentle", "glorious", "good", "pretty good", "happy",
            "healthy", "helpful", "high", "agile", "responsive",
            "hilarious", "jolly", "joyous", "kind", "lively",
            "lovely", "lucky", "nice", "nicely", "obedient",
            "perfect", "pleasant", "proud", "relieved", "silly",
            "smiling", "splendid", "successful", "thankful", "thoughtful",
            "victorious", "vivacious", "witty", "wonderful", "zealous",
            "zany", "rocks", "comeback", "pleasantly surprised", "pleasantly",
            "surprised", "love", "glad", "yum", "interesting",
        )
        negative_phrases = (
            "FTL", "irritating", "not that good", "suck", "lying",
            "duplicity", "angered", "dumbfounding", "dumbifying", "not as good",
            "not impressed", "stomach it", "pw", "pwns", "pwnd",
            "pwning", "in a bad way", "horrifying", "wrong", "flailing",
            "failing", "fallen way behind", "fallen behind", "lose", "fallen",
            "self-deprecating", "hunker down", "duh", "get killed by", "got killed by",
            "hated us", "only works in safari", "must have ie", "fuming and frothing", "heavy",
            "buggy", "unusable", "nothing is", "is great until", "don't support",
            "despise", "pos", "hindrance", "sucks", "problems",
            "not working", "fuming", "annoying", "frothing", "poorly",
            "headache", "completely wrong", "sad news", "didn't last", "lame",
            "pet peeves", "pet peeve", "can't send", "bullshit", "fail",
            "so terrible", "negative", "anooying", "an issue", "drop dead",
            "trouble", "brainwashed", "smear", "commie", "communist",
            "anti-women", "WTF", "anxiety", "STING", "nobody spoke",
            "yell", "Damn", "aren't", "anti", "i hate",
            "hate", "dissapointing", "doesn't recommend", "the worst", "worst",
            "expensive", "crap", "socialist", "won't", "wont",
            ":(", ":-(", "Thanks", "smartass", "don't like",
            "too bad", "frickin", "snooty", "knee jerk", "jerk",
            "reactionist", "MUST DIE", "no more", "hypocrisy", "ugly",
            "too slow", "not reliable", "noise", "crappy", "horrible",
            "bad quality", "angry", "annoyed", "anxious", "arrogant",
            "ashamed", "awful", "bad", "bewildered", "blues",
            "bored", "clumsy", "combative", "condemned", "confused",
            "crazy", "flipped-out", "creepy", "cruel", "dangerous",
            "defeated", "defiant", "depressed", "disgusted", "disturbed",
            "dizzy", "dull", "embarrassed", "envious", "evil",
            "fierce", "foolish", "frantic", "frightened", "grieving",
            "grumpy", "helpless", "homeless", "hungry", "hurt",
            "ill", "itchy", "jealous", "jittery", "lazy",
            "lonely", "mysterious", "nasty", "rape", "naughty",
            "nervous", "nutty", "obnoxious", "outrageous", "panicky",
            "f*****g up", "repulsive", "scary", "selfish", "sore",
            "tense", "terrible", "testy", "thoughtless", "tired",
            "troubled", "upset", "uptight", "weary", "wicked",
            "worried", "is a fool", "painful", "pain", "gross",
        )

        for tweet in example_tweets:
            self.guesser.train(tweet.sentiment, tweet.text)

        for phrase in positive_phrases:
            self.guesser.train(POSITIVE, phrase)

        for phrase in negative_phrases:
            self.guesser.train(NEGATIVE, phrase)

    def classify(self, sentence):
        """Return the sentiment label chosen for ``sentence``.

        No guesses gives NEUTRAL; a single guess gives that guess's label.
        With two or more guesses, the first guess's label is returned only
        when its score exceeds the second guess's score by more than
        ``THRESHHOLD``; otherwise the result is NEUTRAL.
        """
        # Presumably guess() yields (label, score) pairs ordered best-first;
        # verify against the Bayes implementation in use.
        ranked = self.guesser.guess(sentence)
        candidates = len(ranked)
        if candidates == 0:
            return NEUTRAL

        if candidates == 1:
            only_label, _ = ranked[0]
            return only_label

        (lead_label, lead_score), (_, next_score) = ranked[0], ranked[1]
        margin = lead_score - next_score
        return lead_label if margin > self.THRESHHOLD else NEUTRAL
Example #41
0
class BayesianClassifier:
  """Sentiment classifier built on a ``Bayes`` guesser.

  ``train`` feeds the guesser caller-supplied examples followed by a fixed
  list of seed phrases; ``classify`` turns the guesser's answer for a
  sentence into one of the module's sentiment labels.
  """

  # Re-export the module-level sentiment labels on the class.
  POSITIVE = POSITIVE
  NEGATIVE = NEGATIVE
  NEUTRAL  = NEUTRAL

  # Minimum gap between the first two guesses' scores for ``classify`` to
  # return the first guess instead of NEUTRAL.
  THRESHHOLD = 0.1
  guesser = None

  def __init__(self):
    """Create the underlying ``Bayes`` guesser."""
    self.guesser = Bayes()

  def train(self, example_tweets):
    """Train on ``example_tweets``, then on the built-in seed phrases.

    Each example must expose ``sentiment`` and ``message`` attributes. The
    seed phrases are trained in the order listed; repeated entries are
    trained once per occurrence.
    """
    # NOTE(review): "fail" appears in both lists and "Thanks" is listed
    # as negative; kept exactly as found -- confirm whether intentional.
    positive_phrases = (
      "cool", "Woo", "quite amazing", "thks", "looking forward to",
      "damn good", "frickin ruled", "frickin rules", "Way to go", "cute",
      "comeback", "not suck", "prop", "kinda impressed", "props",
      "come on", "congratulation", "gtd", "proud", "thanks",
      "can help", "thanks!", "pumped", "integrate", "really like",
      "loves it", "yay", "amazing", "epic flail", "flail",
      "good luck", "fail", "life saver", "piece of cake", "good thing",
      "hawt", "hawtness", "highly positive", "my hero", "yummy",
      "awesome", "congrats", "would recommend", "intellectual vigor", "really neat",
      "yay", "ftw", "I want", "best looking", "imrpessive",
      "positive", "thx", "thanks", "thank you", "endorse",
      "clearly superior", "superior", "really love", "woot", "w00t",
      "super", "wonderful", "leaning towards", "rally", "incredible",
      "the best", "is the best", "strong", "would love", "rally",
      "very quickly", "very cool", "absolutely love", "very exceptional", "so proud",
      "funny", "recommend", "so proud", "so great", "so cool",
      "cool", "wowsers", "plus", "liked it", "make a difference",
      "moves me", "inspired", "OK", "love it", "LOL",
      ":)", ";)", ":-)", ";-)", ":D",
      ";]", ":]", ":p", ";p", "voting for",
      "great", "agreeable", "amused", "brave", "calm",
      "charming", "cheerful", "comfortable", "cooperative", "courageous",
      "delightful", "determined", "eager", "elated", "enchanting",
      "encouraging", "energetic", "enthusiastic", "excited", "exuberant",
      "excellent", "I like", "fine", "fair", "faithful",
      "fantastic", "fine", "friendly", "fun ", "funny",
      "gentle", "glorious", "good", "pretty good", "happy",
      "healthy", "helpful", "high", "agile", "responsive",
      "hilarious", "jolly", "joyous", "kind", "lively",
      "lovely", "lucky", "nice", "nicely", "obedient",
      "perfect", "pleasant", "proud", "relieved", "silly",
      "smiling", "splendid", "successful", "thankful", "thoughtful",
      "victorious", "vivacious", "witty", "wonderful", "zealous",
      "zany", "rocks", "comeback", "pleasantly surprised", "pleasantly",
      "surprised", "love", "glad", "yum", "interesting",
    )
    negative_phrases = (
      "FTL", "f**k",
      "irritating", "not that good", "suck", "lying", "duplicity",
      "angered", "dumbfounding", "dumbifying", "not as good", "not impressed",
      "stomach it", "pw", "pwns", "pwnd", "pwning",
      "in a bad way", "horrifying", "wrong", "flailing", "failing",
      "fallen way behind", "fallen behind", "lose", "fallen", "self-deprecating",
      "hunker down", "duh", "get killed by", "got killed by", "hated us",
      "only works in safari", "must have ie", "fuming and frothing", "heavy", "buggy",
      "unusable", "nothing is", "is great until", "don't support", "despise",
      "pos", "hindrance", "sucks", "problems", "not working",
      "fuming", "annoying", "frothing", "poorly", "headache",
      "completely wrong", "sad news", "didn't last", "lame", "pet peeves",
      "pet peeve", "can't send", "bullshit", "fail", "so terrible",
      "negative", "anooying", "an issue", "drop dead", "trouble",
      "brainwashed", "smear", "commie", "communist", "anti-women",
      "WTF", "anxiety", "STING", "nobody spoke", "yell",
      "Damn", "aren't", "anti", "i hate", "hate",
      "dissapointing", "doesn't recommend", "the worst", "worst", "expensive",
      "crap", "socialist", "won't", "wont", ":(",
      ":-(", "Thanks", "smartass", "don't like", "too bad",
      "frickin", "snooty", "knee jerk", "jerk", "reactionist",
      "MUST DIE", "no more", "hypocrisy", "ugly", "too slow",
      "not reliable", "noise", "crappy", "horrible", "bad quality",
      "angry", "annoyed", "anxious", "arrogant", "ashamed",
      "awful", "bad", "bewildered", "blues", "bored",
      "clumsy", "combative", "condemned", "confused", "crazy",
      "flipped-out", "creepy", "cruel", "dangerous", "defeated",
      "defiant", "depressed", "disgusted", "disturbed", "dizzy",
      "dull", "embarrassed", "envious", "evil", "fierce",
      "foolish", "frantic", "frightened", "grieving", "grumpy",
      "helpless", "homeless", "hungry", "hurt", "ill",
      "itchy", "jealous", "jittery", "lazy", "lonely",
      "mysterious", "nasty", "rape", "naughty", "nervous",
      "nutty", "obnoxious", "outrageous", "panicky", "f*****g up",
      "repulsive", "scary", "selfish", "sore", "tense",
      "terrible", "testy", "thoughtless", "tired", "troubled",
      "upset", "uptight", "weary", "wicked", "worried",
      "is a fool", "painful", "pain", "gross",
    )

    for tweet in example_tweets:
      self.guesser.train(tweet.sentiment, tweet.message)

    for phrase in positive_phrases:
      self.guesser.train(POSITIVE, phrase)

    for phrase in negative_phrases:
      self.guesser.train(NEGATIVE, phrase)

  def classify(self, sentence):
    """Return the sentiment label chosen for ``sentence``.

    No guesses gives NEUTRAL; a single guess gives that guess's label.
    With two or more guesses, the first guess's label is returned only
    when its score exceeds the second guess's score by more than
    ``THRESHHOLD``; otherwise the result is NEUTRAL.
    """
    # Presumably guess() yields (label, score) pairs ordered best-first;
    # verify against the Bayes implementation in use.
    ranked = self.guesser.guess(sentence)
    candidates = len(ranked)
    if candidates == 0:
      return NEUTRAL

    if candidates == 1:
      only_label, _ = ranked[0]
      return only_label

    (lead_label, lead_score), (_, next_score) = ranked[0], ranked[1]
    margin = lead_score - next_score
    return lead_label if margin > self.THRESHHOLD else NEUTRAL
Example #42
0
class NaiveBayesClassifier(object):
    """Two-class ('nonspam' / 'spam') text classifier wrapping a ``Bayes`` guesser.

    Training data can come from directory trees (one document per file) or
    from single files (one document per line). The counters
    ``total_num_train_files``, ``total_num_test_files`` and ``num_misclass``
    accumulate over the lifetime of the instance.
    """

    def __init__(self, non_spam_train_dir, spam_train_dir):
        """Remember the default training locations and create an empty guesser."""
        self.non_spam_train_dir = non_spam_train_dir
        self.spam_train_dir = spam_train_dir
        self.naive_bayes_classifier = Bayes()
        self.total_num_train_files = 0
        self.total_num_test_files = 0
        self.num_misclass = 0

    def make_single_line_from_body_of_file(self, filename):
        """Return the content of *filename* as one string of stripped lines.

        NOTE(review): lines are joined without a separator, so the last word
        of one line is fused with the first word of the next. This matches
        the previous behaviour; confirm whether a space was intended.
        """
        # The context manager closes the handle; it used to be leaked.
        with open(filename) as fd:
            return ''.join(line.strip() for line in fd)

    def _iter_file_texts(self, directory):
        """Yield the single-line text of every file below *directory*."""
        for subdir, _dirs, files in os.walk(directory):
            for file_i in files:
                filename = os.path.join(subdir, file_i)
                yield self.make_single_line_from_body_of_file(filename)

    def _train_on_dir(self, directory, label):
        """Train *label* with every file below *directory*, counting each file."""
        for full_text_line in self._iter_file_texts(directory):
            self.total_num_train_files += 1
            self.naive_bayes_classifier.train(label, full_text_line)

    def _train_on_lines(self, path, label):
        """Train *label* with every line of the file at *path*."""
        with open(path, 'r') as fd:
            for line in fd:
                self.naive_bayes_classifier.train(label, line)

    def _test_on_dir(self, directory, expected_label, echo=False):
        """Classify every file below *directory* and count the misses.

        When *echo* is true the raw guess vector is printed for each file.
        """
        for full_text_line in self._iter_file_texts(directory):
            class_prob_vec = self.naive_bayes_classifier.guess(full_text_line)
            self.total_num_test_files += 1
            if echo:
                print(class_prob_vec)
            # An empty guess falls back to 'nonspam', exactly as classify()
            # does, instead of raising IndexError.
            y_hat = class_prob_vec[0][0] if class_prob_vec else 'nonspam'
            if y_hat != expected_label:
                self.num_misclass += 1

    def train(self):
        """Train from the directories given to the constructor."""
        self._train_on_dir(self.non_spam_train_dir, 'nonspam')
        self._train_on_dir(self.spam_train_dir, 'spam')

    def train_for_given_dirs(self, non_spam_train_dir, spam_train_dir):
        """Train from the two directory trees passed as arguments."""
        self._train_on_dir(non_spam_train_dir, 'nonspam')
        self._train_on_dir(spam_train_dir, 'spam')

    def train_two_files(self):
        """Train line by line, treating the constructor paths as plain files."""
        self._train_on_lines(self.non_spam_train_dir, 'nonspam')
        self._train_on_lines(self.spam_train_dir, 'spam')

    def train_for_two_exogenous_files(self, non_spam_train_file, spam_train_file):
        """Train line by line from the given files; an empty path is skipped."""
        if non_spam_train_file != '':
            self._train_on_lines(non_spam_train_file, 'nonspam')
        if spam_train_file != '':
            self._train_on_lines(spam_train_file, 'spam')

    def test(self, non_spam_test_dir, spam_test_dir):
        """Classify every file in both test trees and print the error rates.

        Updates ``total_num_test_files`` and ``num_misclass``. The training
        counter is left alone (it used to be incremented here by mistake).
        Prints a notice instead of dividing by zero when no test file has
        been seen.
        """
        self._test_on_dir(non_spam_test_dir, 'nonspam')
        # The guess vector was only ever echoed for the spam set.
        self._test_on_dir(spam_test_dir, 'spam', echo=True)

        if not self.total_num_test_files:
            print('No test files found; misclassification rate is undefined')
            return

        misclass_rate = self.num_misclass / float(self.total_num_test_files)
        accuracy = 1 - misclass_rate
        print('Misclassification rate is %f' % misclass_rate)
        print('Accuracy is %f' % accuracy)

    def make_single_line_from_body(self, text_body):
        """Join the stripped lines of *text_body* with single spaces."""
        return ' '.join(line.strip() for line in text_body)

    def make_single_line_from_body2(self, text_body):
        """Concatenate the lines of *text_body*, each followed by one space."""
        return ''.join(line + ' ' for line in text_body)

    def create_nonspam_spam_datasets(self, text_body):
        """Return *text_body* unchanged."""
        return text_body

    def classify(self, text_body):
        """Return the top-ranked label for *text_body*, or 'nonspam' if none."""
        class_prob_vec = self.naive_bayes_classifier.guess(text_body)
        if len(class_prob_vec) != 0:
            return class_prob_vec[0][0]
        return 'nonspam'
Example #43
0
 def save(self):
     """
     Save the brain to disk.

     Delegates to ``Bayes.save`` with ``self.brain`` as the target.
     """
     Bayes.save(self, self.brain)
Example #44
0
'Goal:  Build a language recognizer using a naive bayesian classifier'

# Make a 50 language reconizer trained on 10 books per language at:
# http://www.gutenberg.org/browse/languages/en
# http://www.gutenberg.org/files/1342/1342-0.txt

from reverend.thomas import Bayes

# Train the classifier
language_sniffer = Bayes()
for lang in ['en', 'es', 'fr', 'de', 'it']:
    filename = 'notes/proverbs_%s.txt' % lang
    with open(filename) as f:
        data = f.read().decode('utf-8')
        language_sniffer.train(lang, data)

# Apply the classifier
phrases = u'''\
All the leaves are brown and the sky is gray.  I've been for a walk on a winter's day.
De colores, todos los colores. De colores se visten los campos en la primavera.
Jingle bells, jingle all the way. Oh what fun it is to ride in a one horse open sleigh.
Casca belles, hoy es navidad.  Es un dia, de allegria y felicidad.
'''.splitlines()

for phrase in phrases:
    best_guess = language_sniffer.guess(phrase)[0][0]
    print best_guess, '<--', phrase[:30]
Example #45
0
 def untrain(self, bucket, words):
     """
     Remove nominated words from the relevant bucket.

     After delegating to ``Bayes.untrain`` the brain is written back to
     ``self.brain`` through ``Bayes.save``.
     """
     Bayes.untrain(self, bucket, words)
     Bayes.save(self, self.brain)
from reverend.thomas import Bayes

# Minimal train / guess / untrain round trip with two pools.
guesser = Bayes()
for pool, words in (
    ('fish', 'salmon trout cod carp'),
    ('fowl', 'hen chicken duck goose'),
):
    guesser.train(pool, words)

# The result is discarded; at a prompt it would be echoed.
guesser.guess('chicken tikka marsala')

# Take two of the trained words back out of the 'fish' pool.
guesser.untrain('fish', 'salmon carp')
Example #47
0
    def action_train(self, cr, uid, ids, context=None):
        """Train the group's stored Bayes data with each record in *ids*.

        For every id the record's category is read, the category counters
        are updated, the group's serialized training data is loaded into a
        ``Bayes`` guesser, the guesser is trained with the message text and
        the result is written back to the group together with a percentage
        of guessed versus trained messages.

        Raises ``osv.except_osv`` when a record ends up without a category.
        Returns an ``ir.actions.act_window`` description opening
        ``crm.bayes.train.message`` in a new form.

        NOTE(review): ``result`` and ``file_path`` are not defined anywhere
        in this method; they must come from an enclosing scope or this
        excerpt is missing the code that computes them -- confirm.
        """
        cat_obj = self.pool.get('crm.bayes.categories')
        group_obj = self.pool.get('crm.bayes.group')
        message_obj = self.pool.get('crm.bayes.test.guess')

        # NOTE(review): ``id`` shadows the builtin of the same name.
        for id in ids:
            cat_id = self.read(cr, uid, id, ['category_id', 'name'])
            cat_id = cat_id[0]['category_id']
            if result:
                # Entry with the highest second element (presumably a
                # (category name, score) pair -- verify against the producer
                # of ``result``).
                max_list = max(result, key=lambda k: k[1])
                if cat_id:
                    # A category is already set: count one more trained message.
                    cat_guess_msg = cat_obj.read(cr, uid, cat_id,
                                                 ['train_messages'])
                    cat_obj.write(cr, uid, cat_id, {
                        'train_messages':
                        cat_guess_msg['train_messages'] + 1
                    })
                if max_list[1] > 0 and not cat_id:
                    # No category yet: adopt the best-scoring one by name and
                    # count it as a guessed message.
                    cat_id = cat_obj.search(cr, uid,
                                            [('name', '=', max_list[0])])[0]
                    cat_guess_msg = cat_obj.read(cr, uid, cat_id,
                                                 ['guess_messages'])
                    cat_obj.write(cr, uid, cat_id, {
                        'guess_messages':
                        cat_guess_msg['guess_messages'] + 1
                    })
                    # NOTE(review): writes to every record in ``ids``, not
                    # only the current ``id`` -- confirm this is intended.
                    self.write(cr, uid, ids, {'category_id': cat_id})
            if cat_id:
                cat_rec = cat_obj.read(cr, uid, cat_id, [])
                guesser = Bayes()
                # Collect the serialized training data stored on the group.
                data = ""
                for rec in group_obj.browse(cr, uid, [cat_rec['group_id'][0]]):
                    if rec['train_data']:
                        data += rec['train_data']
                if data:
                    # Round-trip through a file because the guesser is loaded
                    # from a path.
                    myfile = file(file_path + "crm_bayes.bay", 'w')
                    myfile.write(data)
                    myfile.close()
                    guesser.load(file_path + "crm_bayes.bay")

                guesser.train(cat_rec['name'],
                              message_obj.read(cr, uid, id)[0]['name'])
                guesser.save(file_path + "crm_bayes.bay")
                # Read the freshly saved data back so it can be stored on the group.
                # NOTE(review): this handle is never closed.
                myfile = file(file_path + "crm_bayes.bay", 'r')
                data = ""
                for fi in myfile.readlines():
                    data += fi
                # Totals of trained and guessed messages over the whole group.
                cr.execute(
                    "select sum(train_messages) as tot_train,sum(guess_messages) as tot_guess from crm_bayes_categories where group_id=%d"
                    % cat_rec['group_id'][0])
                rec = cr.dictfetchall()
                if not rec[0]['tot_guess']:
                    rec[0]['tot_guess'] = 0
                # NOTE(review): ``tot_train`` is not defaulted like
                # ``tot_guess``; a missing value or a zero total would make
                # this expression fail -- confirm the data guarantees.
                percantage = float(
                    rec[0]['tot_guess'] * 100) / float(rec[0]['tot_guess'] +
                                                       rec[0]['tot_train'])
                group_obj.write(cr, uid, cat_rec['group_id'][0], {
                    'train_data': data,
                    'automate_test': percantage
                })
            else:
                raise osv.except_osv(_('Error !'),
                                     _('Please Select Category! '))
        return {
            'view_type': 'form',
            "view_mode": 'form',
            'res_model': 'crm.bayes.train.message',
            'type': 'ir.actions.act_window',
            'target': 'new',
        }
Example #48
0
from reverend.thomas import Bayes
# Seed one pool per language with a handful of very common words.
guesser = Bayes()
for language, sample in (
    ('french', 'le la les du un une je il elle de en'),
    ('german', 'der die das ein eine'),
    ('spanish', 'el uno una las de la en'),
    ('english', 'the it she he they them are were to'),
):
    guesser.train(language, sample)

# Results are discarded here; at a prompt they would be echoed.
guesser.guess('they went to el cantina')
guesser.guess('they were flying planes')

# Add more English material, then persist the trained state.
guesser.train('english', 'the rain in spain falls mainly on the plain')
guesser.save('my_guesser.bay')
"""
pip install reverend
pip install sets
Source Code :https://laslabs.github.io/python-reverend/_modules/reverend/thomas.html
Overview of Bayes Rule: https://towardsdatascience.com/bayes-rule-with-a-simple-and-practical-example-2bce3d0f4ad0
"""
from reverend.thomas import Bayes
g = Bayes()    # guesser

# Two sentences per language, trained in the order listed.
for language, sentence in (
    ('french', 'La souris est rentre dans son trou.'),
    ('english', 'my tailor is rich.'),
    ('french', 'Je ne sais pas si je viendrai demain.'),
    ('english', 'I do not plan to update my website soon and I would really like some help from the rest of you idiots.'),
):
    g.train(language, sentence)

print(g.guess('Jumping out of cliffs it not a good idea.'))

# print(g.guess('Demain il fera trs probablement chaud.'))
Example #50
0
 def guess(self, line):
     """
     Guess what category these words apply to.

     Returns whatever ``Bayes.guess`` returns for *line*.
     """
     # Debugging aid, left disabled:
     #print Bayes.guess(self,line)
     return Bayes.guess(self, line)
Example #51
0
class Trainer(Frame):
    def __init__(self, parent, guesser=None, itemClass=None):
        self.status = StatusBar(parent)
        self.status.pack(side=BOTTOM, fill=X)
        Frame.__init__(self, parent)
        self.pack(side=TOP, fill=BOTH)
        self.itemsPerPage = 20
        self.rows = []
        for i in range(self.itemsPerPage):
            self.rows.append(ItemRow())
        self.items = []
        self.files = []
        self.cursor = 0
        self.dirty = False
        if guesser is None:
            from reverend.thomas import Bayes
            self.guesser = Bayes()
        else:
            self.guesser = guesser
        if itemClass is None:
            self.itemClass = TextItem
        else:
            self.itemClass = itemClass
        for row in self.rows:
            row.summary.set('foo')
        self.initViews()

    def initViews(self):
        self.nb = Notebook(self)
##        frame1 = Frame(self.nb())
##        self.poolView = PoolView(frame1, guesser=self.guesser, app=self)
##        self.poolView.pack(side=TOP)
        frame2 = Frame(self.nb())
        self.poolView = PoolView(frame2, guesser=self.guesser, app=self)
        self.poolView.pack(side=TOP)
        self.listView = Canvas(frame2, relief=GROOVE)
        self.listView.pack(padx=3)
        bn = Button(self.listView, text="Load training", command=self.loadCorpus)
        bn.pack(side=RIGHT, anchor=NE, fill=X)
        self.columnHeadings()
        self.addNextPrev()
        
        frame3 = Frame(self.nb())
        self.testView = TestView(frame3, guesser=self.guesser, app=self)
        self.testView.pack()

        frame4 = Frame(self.nb())
        bp = Button(frame4, text="Quit", command=self.quitNow)
        bp.pack(side=BOTTOM)
        
        #self.nb.add_screen(frame1, 'Reverend')
        self.nb.add_screen(frame2, 'Training')
        self.nb.add_screen(frame3, 'Testing')
        self.nb.add_screen(frame4, 'Quit')
        

    def addNextPrev(self):
        npFrame = Frame(self.listView)
        npFrame.pack(side=BOTTOM, fill=X)
        bn = Button(npFrame, text="Prev Page", command=self.prevPage)
        bn.grid(row=0, column=0)
        bn = Button(npFrame, text="Next Page", command=self.nextPage)
        bn.grid(row=0, column=1)


    def loadCorpus(self):
        path = tkFileDialog.askdirectory()
        if not path:
            return
        self.loadFileList(path)
        self.displayItems()
        self.displayRows()

    def bulkTest(self):
        dirs = []
        for pool in self.guesser.poolNames():
            path = tkFileDialog.askdirectory()
            dirs.append((pool, path))
        for pool, path in dirs:
            print pool, path
            

    def displayList(self):
        for item in self.items:
            self.itemRow(item)
            
    def displayRows(self):
        for row in self.rows:
            self.displayRow(row)

    def loadFileList(self, path):
        listing = os.listdir(path)
        self.files = [os.path.join(path, file) for file in listing]
        self.cursor = 0

    def prevPage(self):
        self.cursor = max(0, self.cursor - self.itemsPerPage)
        self.displayItems()

    def nextPage(self):
        self.cursor = min(len(self.files), self.cursor + self.itemsPerPage)
        self.displayItems()
        
    def displayItems(self):
        theseFiles = self.files[self.cursor:self.cursor + self.itemsPerPage]
        items = []
        for file, row in zip(theseFiles, self.rows):
            fp = open(file, 'rb')
            try:
                item = self.itemClass.fromFile(fp)
            finally:
                fp.close()
            if item is None:
                continue
            items.append(item)
            guesses = self.guesser.guess(item)
            summary = item.summary()
            cols = item.columnDefs()
            s = ''
            for c, ignore in cols:
                s += summary[c] + ' '
            row.initialize(item, s, guesses, self.guesser.poolNames())
        self.items = items
        
    def quitNow(self):
        if self.dirty:
            if tkMessageBox.askyesno("You have unsaved changes!", "Quit without saving?"):
                self.quit()
        self.quit()

    def columnHeadings(self):
        # FIXME - Something better for columns and rows in general
        line = Frame(self.listView, relief=RAISED, borderwidth=1)
        line.pack(side=TOP, padx=2, pady=1)
        colHeadings = self.itemClass.columnDefs()
        currCol = 0
        for cHdr, width in colHeadings:
            l = Label(line, text=cHdr, width=width, bg='lightblue')
            l.grid(row=0, column=currCol)
            currCol += 1
        line = Frame(self)
        line.pack(fill=X)

    def training(self, row):
        sel = row.selection.get()
        self.guesser.train(sel, row.original)
        row.current = sel
        self.guessAll()

    def guessAll(self):
        self.poolView.refresh()
        pools = self.guesser.poolNames()
        for row in self.rows:
            row.setGuess(self.guesser.guess(row.original), pools)
            
    def displayRow(self, row, bgc=None):
        # UGH - REWRITE!
        line = Frame(self.listView, bg=bgc)
        line.pack(pady=1)
        row.line = line
        self.insertRadios(row)
        Label(line, text=row.summary.get(), textvariable=row.summary, width=60, bg=bgc,
              anchor=W).grid(row=0, column=2)
        #Label(line, text=row.guess, width=7, bg=bgc, anchor=W).grid(row=0, column=1)
        colourStripe = Label(line, text=' ', width=1, bg=bgc, anchor=W, relief=GROOVE)
        colourStripe.grid(row=0, column=1)
        line.colourStripe = colourStripe
        pools = self.guesser.poolNames()
        row.refreshColour(pools)

    def poolAdded(self):
        if not self.items:
            return
        pools = self.guesser.poolNames()
        for row in self.rows:
            for r in row.radios:
                r.destroy()
            self.insertRadios(row)
            row.refreshColour(pools)
        self.dirty = True

    def insertRadios(self, row):
        radioFrame = Frame(row.line)
        radioFrame.grid(row=0, column=0)
        currCol = 0
        radios = []
        v = row.selection
        ci = 0
        colours = row.defaultColours()
        pools = self.guesser.poolNames()
        for pool in pools:
            rb = Radiobutton(radioFrame, text=pool, variable=v, value=pool, command=Command(self.training, row), bg=None)
            rb.grid(row=0, column=currCol)
            radios.append(rb)
            currCol += 1
            ci += 1
        row.radios = radios
Example #52
0
# instantiate URL retriever class
urlRetriever = retriever.URLRetriever()

# try retrieval of url
try:
    corpusSet = urlRetriever.retrieveURL(url)
    corpus = corpusSet['corpus']
    charset = corpusSet['charset']
except IOError:
    error = 1
    errorMessage = 'URL could not be retrieved'

# stop word object
stopWords = retriever.StopWords()

# guess language
guesser = Bayes()
for selectLanguage in languages:
    if selectLanguage != 'automatic':
        stopWordString = stopWords.getStopWordString(selectLanguage)
        guesser.train(selectLanguage, stopWordString)
        language = guesser.guess(corpus)

        # print stopword string
        print stopWordString

# print language
# print language.pop(0)[0]
print language
Example #53
0
    def local_search(self, cid, term_unstemmed, recent):
        """Search posts related to a term, ranked by a per-query Bayes model.

        The term is stemmed and its exemplar posts are looked up. With fewer
        than four exemplars the search is delegated to ``self.fulltext``.
        Otherwise a guesser is trained on the exemplars ("relevant") and on an
        equal number of random posts ("random"); the tokens that are much more
        frequent in the relevant pool drive a xapian query, and each hit is
        scored by its "relevant" probability times ``post.broad_support``
        (times ``_post_age_score(post)`` when *recent* is true).

        Returns at most ten ``SearchResult`` objects, best score first.
        """
        term = self.stem(term_unstemmed)
        exemplar_pids = self.get_term_exemplars(cid, term)
        if len(exemplar_pids) < 4:
            return self.fulltext(cid, term, recent)

        log_tmp("SEARCH: %s exemplars" % len(exemplar_pids))

        guesser = Bayes()

        for ex_pid in exemplar_pids:
            ex = state.the.get_post(ex_pid, content=True)
            log_tmp("SEARCH: exemplar tokens: [%s]" % ex.tokens())
            guesser.train("relevant", ex.tokens())  # get normalized content from p.
            # TODO Toss in other factors, if possible.

        for neg_ex_pid in state.the.get_random_pids(len(exemplar_pids)):  # probably cacheable, if we use a bigger pool
            guesser.train("random", state.the.get_post(neg_ex_pid, content=True).tokens())

        log_tmp("SEARCH: trained")

        # Smoothed ratio of each token's count in the relevant pool to its
        # count in the random pool.
        random_pool = guesser.pools["random"]
        proportions = [
            (tok, (count + 1) / (1.0 * random_pool.get(tok, 0) + 1))
            for (tok, count) in guesser.poolData("relevant")
        ]

        # Knock out the weak and irrelevant ones before sorting.
        proportions = [(tok, prop) for (tok, prop) in proportions if prop > 2]

        fulltext_fallback = len(proportions) < 3

        if fulltext_fallback:
            query = xapian.Query(xapian.Query.OP_AND, [term])
        else:
            proportions.sort(key=operator.itemgetter(1), reverse=True)
            log_tmp("SEARCH: proportions: " + str(proportions))
            # search for the twelve best words
            query = xapian.Query(xapian.Query.OP_OR, [tok for (tok, prop) in proportions[:12]])

        log_tmp("SEARCH: query: " + str(query))

        enq = xapian.Enquire(self.mainabase)
        enq.set_query(
            #            xapian.Query(xapian.Query.OP_AND,
            query
            #                , ##Something scoring for BROAD_SUPPORT##)
            ##Something scoring for recency, if appropriate
        )
        mset = enq.get_mset(0, 25)

        results = []
        for m in mset:
            doc = m.get_document()
            post = state.the.get_post(int(doc.get_data()), True)

            # Reset per document: without this a post with no "relevant"
            # guess either raised NameError (first hit) or silently reused
            # the previous post's probability.
            rel_prob = 0.0
            for (pool, prob) in guesser.guess(post.tokens()):
                if pool == "relevant":
                    rel_prob = prob
            score = rel_prob
            score *= post.broad_support
            if recent:
                score *= _post_age_score(post)
            results.append(SearchResult(post, term, score))
            # results.append( (post, score, "rel: %f  b_s: %f  root age: %f" %
            #                 (rel_prob, post.broad_support, sqrt(age_days)) ) )
        # Key-based sort gives the same ordering as the old cmp function.
        results.sort(key=operator.attrgetter("score"), reverse=True)
        return results[:10]
Example #54
0
 def __init__(self):
   """Create a fresh ``Bayes`` instance and store it as ``self.guesser``."""
   self.guesser = Bayes()
Example #55
0
from reverend.thomas import Bayes
# Train a two-pool guesser from line-oriented log files, then persist it.
guesser = Bayes()

# Each stripped line is one training sample. The context managers close the
# log files, which were previously left open.
with open("spam.log", 'r') as f:
    for line in f:
        guesser.train('spam', line.strip())

with open("notspam.log", 'r') as f:
    for line in f:
        guesser.train('notspam', line.strip())

guesser.save('spam.bay')
Example #56
0
def run(corpus,
        verbose=False,
        hkap_file=os.path.join(software, 'libs/PACManData.bay'),
        train=False,
        authors=False,
        exact_names=False,
        first_only=False,
        nyears=10,
        plotit=False,
        hst=False,
        clobber=False,
        rs_exceptions=''):
    """Classify a corpus, or the authors listed in it, with a trained guesser.

    corpus        -- path of the input (a proposal, a text file, or a
                     tab-separated person report when *authors* is true).
    hkap_file     -- saved Bayes data loaded into the guesser.
    authors       -- when true, classify each author's abstracts instead of
                     the corpus text itself.
    hst           -- (text mode only) parse *corpus* with the proposal
                     parsers instead of reading it as plain text.
    nyears, rs_exceptions -- forwarded to ``util.adscrawl.run_authors``.
    clobber       -- (author mode) rebuild the cached ``.pkl`` results even
                     if they exist.
    verbose, train, exact_names, first_only, plotit -- accepted but not used
                     by this function.

    Returns ``(result, uber_categories)``. *result* is the normalized guess
    for the corpus, or in author mode a dict mapping each author to
    ``{'hkap': ..., 'cites': ...}``. *uber_categories* maps each key of
    ``category_synonyms.txt`` to its list of synonyms.
    """
    with open(os.path.join(software, 'category_synonyms.txt'), 'r') as f:
        lines = f.readlines()
    acronyms = {}
    for line in lines:
        # Comments and blank lines carry no "key = a,b,c" definition.
        if line.startswith('#') or not line.strip():
            continue
        key, value = line.split('=')
        acronyms[key.strip()] = value.strip().split(',')
    uber_categories = acronyms

    stopwords = load_stopwords()

    dguesser = Bayes()
    dguesser.load(hkap_file)

    if not authors:
        if hst:
            ## Below, proposals are retrieved, then parsed.
            abstract_text = parse_abstracts_proposals(corpus)
            text = parse_science_justification_proposals(corpus)
            justification = abstract_text + text
            bayesString = " " + justification
        else:
            with open(corpus) as f:
                lines = f.readlines()
            # Keep every non-comment, non-blank line, each followed by a space.
            bayesString = ''.join(
                line.strip() + ' ' for line in lines
                if not line.startswith('#') and line.strip())
        bayesString = work_string(bayesString, stopwords)
        result = dguesser.guess(bayesString)
        result = normalize_result(result)

    else:
        ## assumes input is a person report
        ## if .pkl report not available, creates new one
        import util

        records = []
        results_dict = {}
        # NOTE(review): str.replace swaps every occurrence of the extension
        # text, not only the suffix; kept as-is so existing cache file names
        # still match -- confirm before switching to os.path.splitext.
        results_pkl = corpus.replace(corpus.split('.')[-1], 'pkl')
        if not os.path.isfile(results_pkl) or clobber:
            with open(corpus) as f:
                lines = f.readlines()
            for line in lines:
                if line.startswith('#'): continue
                if not line.strip(): continue
                info = line.rstrip().split("\t")
                if info[0] == '': continue
                # records.append(info[0].replace(' ','').replace('"','').replace("'",'').lower())
                records.append(info[0].replace('"', '').replace("'",
                                                                '').lower())
            author_dict, cite_dict = util.adscrawl.run_authors(
                records, nyears=nyears, rs_exceptions=rs_exceptions)
            ## author_dict, cite_dict = util.adscrawl.run_exact_authors(records, nyears=nyears)
            with open(results_pkl, 'wb') as pkl:
                pickle.dump(author_dict, pkl)
            with open('cites.pkl', 'wb') as pkl:
                pickle.dump(cite_dict, pkl)
        else:
            # NOTE(review): pickle.load executes whatever the file encodes;
            # only safe while these cache files are produced locally.
            with open(results_pkl, 'rb') as pkl:
                author_dict = pickle.load(pkl)
            with open('cites.pkl', 'rb') as pkl:
                cite_dict = pickle.load(pkl)
        for author in author_dict:
            # Accumulate ALL of the author's abstracts. The previous code
            # re-assigned the string on every pass, so only the last
            # abstract was ever classified.
            bayesString = ''.join(
                ' ' + abstract for abstract in author_dict[author])

            bayesString = work_string(bayesString, stopwords)
            result = dguesser.guess(bayesString)
            ## result = normalize_result(result)
            results_dict[author] = {}
            results_dict[author]['hkap'] = rec.fromrecords(result)
            try:
                results_dict[author]['cites'] = sorted(cite_dict[author],
                                                       reverse=True)
            except (KeyError, TypeError):
                # No (usable) citation list for this author.
                results_dict[author]['cites'] = [0]
        result = results_dict
    return (result, uber_categories)
Example #57
0
            # by default, self.combiner is set to self.robinson
            state['combiner'] = None
    return state


def Bayes__setstate__(self, state):
    """Restore attributes from *state*, re-binding the default combiner.

    A ``'combiner'`` entry of ``None`` in *state* stands for the default
    combiner, which is the instance method ``self.robinson``.
    """
    vars(self).update(state)
    # ``False`` is only a "key missing" placeholder here; what matters is
    # that it is not ``None``.
    stored_combiner = state.get('combiner', False)
    if stored_combiner is None:
        self.combiner = self.robinson


# Attach the two state hooks defined above to the Bayes class itself, so
# every instance uses them when its state is saved and restored.
Bayes.__getstate__ = Bayes__getstate__
Bayes.__setstate__ = Bayes__setstate__

# Module-level guesser instance.
bayes = Bayes()

# Traverses all files and directories starting from a root directory
# Adds normalized files to trainingData dict


def getCorpus(path, classification):

    for root, subFolders, fileNames in os.walk(path):
        for fileName in fileNames:

            # Learn type of file - only want text files
            fileType = mimetypes.guess_type(fileName)

            if (fileType[1] is None and fileType[0] is None) or re.match(
                    combinedMimeRegex, fileType[0]):