Esempio n. 1
0
 def __init__(self, parent, guesser=None, itemClass=None):
     """Build the training UI inside *parent* and wire up the classifier.

     A status bar is packed at the bottom, the frame itself at the top,
     and a fixed page of item rows is pre-allocated.  When *guesser* or
     *itemClass* are omitted, defaults (a fresh Bayes instance and
     TextItem) are used.
     """
     self.status = StatusBar(parent)
     self.status.pack(side=BOTTOM, fill=X)
     Frame.__init__(self, parent)
     self.pack(side=TOP, fill=BOTH)
     self.itemsPerPage = 20
     # Pre-allocate one row widget holder per visible item slot.
     self.rows = [ItemRow() for _ in range(self.itemsPerPage)]
     self.items = []
     self.files = []
     self.cursor = 0
     self.dirty = False
     if guesser is not None:
         self.guesser = guesser
     else:
         # Import lazily so the default only costs anything when used.
         from reverend.thomas import Bayes
         self.guesser = Bayes()
     self.itemClass = TextItem if itemClass is None else itemClass
     # Seed every row summary with placeholder text.
     for row in self.rows:
         row.summary.set('foo')
     self.initViews()
Esempio n. 2
0
 def test_untrainedGuess(self):
     """
     The C{guess} method of a L{Bayes} instance with no training data returns
     an empty list.
     """
     bayes = Bayes()
     # assertEqual: assertEquals is a deprecated alias (removed in 3.12).
     self.assertEqual(bayes.guess("hello, world"), [])
Esempio n. 3
0
def main():
    """
    Build aggregator report pages with Bayes rating links.

    Loads any saved guesser state, scores the new feed entries, writes
    the aggregator HTML report, then persists databases and guesser.
    """
    # Create a new Bayes guesser
    guesser = Bayes()

    # Attempt to load Bayes data, ignoring IOError on first run.
    try:
        guesser.load(BAYES_DATA_FN)
    except IOError:
        pass

    # Open up the databases, load the subscriptions, get new entries.
    feed_db, entry_db = openDBs(FEED_DB_FN, ENTRY_DB_FN)
    # Context manager closes the subscriptions file promptly; the
    # original open(...).readlines() leaked the handle.
    with open(FEEDS_FN, "r") as feeds_file:
        feeds = [x.strip() for x in feeds_file.readlines()]
    entries = getNewFeedEntries(feeds, feed_db, entry_db)

    # Score the new entries using the Bayesian guesser
    entries = scoreEntries(guesser, entries)

    # Write out the current run's aggregator report.
    out_fn = time.strftime(HTML_FN)
    writeAggregatorPage(entries, out_fn, DATE_HDR_TMPL, FEED_HDR_TMPL,
                        ENTRY_TMPL, PAGE_TMPL)

    # Close the databases and save the current guesser's state to disk.
    closeDBs(feed_db, entry_db)
    guesser.save(BAYES_DATA_FN)
Esempio n. 4
0
 def __init__(self, non_spam_train_dir, spam_train_dir):
     """Remember the training corpus locations and set up a fresh classifier.

     non_spam_train_dir/spam_train_dir: directories holding the ham and
     spam training files respectively (presumably paths — confirm with
     callers).
     """
     self.non_spam_train_dir = non_spam_train_dir
     self.spam_train_dir = spam_train_dir
     # Untrained naive-Bayes model backing this instance.
     self.naive_bayes_classifier = Bayes()
     # Bookkeeping counters, updated during train/test runs elsewhere.
     self.total_num_train_files = 0
     self.total_num_test_files = 0
     self.num_misclass = 0
Esempio n. 5
0
    def getCategoryGuesses(self, corpus1, corpus2, corpus3):
        """Train a fresh Bayes guesser on two reference corpora and
        classify the third, returning the resulting guess list."""
        from reverend.thomas import Bayes

        # Fresh classifier per call: no state leaks between invocations.
        classifier = Bayes()

        # One category per reference corpus.
        classifier.train('first reference text', corpus1)
        classifier.train('second reference text', corpus2)

        # Score the candidate corpus against the trained categories.
        return classifier.guess(corpus3)
Esempio n. 6
0
def get_db(private_path, username):
    """Load (or create on first use) the per-user spam Bayes database.

    Returns a (guesser, path) tuple.  On the first run the empty
    database is saved to disk so subsequent loads succeed.
    """
    # os.path.join accepts multiple components; no need to nest calls.
    path = os.path.join(private_path, username, 'spam.bayes')
    guesser = Bayes()

    # load the spam DB
    try:
        guesser.load(path)
    except IOError:
        # First run: no database yet — create the directory and seed it.
        # print() call form (the original used the Python-2 statement).
        print("Creating a new spam filter database")

        parent_directory = os.path.dirname(path)
        if not os.path.isdir(parent_directory):
            os.makedirs(parent_directory)

        guesser.save(path)

    return guesser, path
Esempio n. 7
0
    def trained(self, cr, uid, ids, context=None):
        """Train the group's Bayes guesser on each record's description.

        For every id: validates description and category are set, rebuilds
        the guesser from the group's accumulated train_data, trains the
        record's category, stores the updated blob back on the group along
        with refreshed statistics, and marks the record 'trained'.
        """
        for id in ids:
            record = self.read(cr, uid, id, ['category_id', 'description'])
            if not record['description']:
                raise osv.except_osv(_('Error!'), _("Description Not Define!"))
            if not record['category_id']:
                raise osv.except_osv(_('Error!'),
                                     _("Statistics Category Not Define!"))
            group_obj = self.pool.get('crm.bayes.group')
            cat_obj = self.pool.get('crm.bayes.categories')
            cat_rec = cat_obj.read(cr, uid, record['category_id'][0], [])
            guesser = Bayes()
            data = ""
            # Concatenate the group's stored training blobs into one string.
            for rec in group_obj.browse(cr, uid, [cat_rec['group_id'][0]]):
                if rec['train_data']:
                    data += rec['train_data']
            if data:
                # Round-trip through a scratch file because Bayes loads from
                # disk.  NOTE(review): file() is Python-2-only, and file_path
                # is defined elsewhere in the module — presumably it ends
                # with a path separator; confirm.
                myfile = file(file_path + "crm_bayes.bay", 'w')
                myfile.write(data)
                myfile.close()
                guesser.load(file_path + "crm_bayes.bay")
            guesser.train(cat_rec['name'], record['description'])
            guesser.save(file_path + "crm_bayes.bay")
            # Read the saved model back so it can be stored on the group.
            # NOTE(review): this handle is never closed explicitly.
            myfile = file(file_path + "crm_bayes.bay", 'r')
            data = ""
            for fi in myfile.readlines():
                data += fi
            cat_obj.write(
                cr, uid, record['category_id'][0],
                {'train_messages': int(cat_rec['train_messages']) + 1})
            # Aggregate train/guess counts for the whole group.
            # NOTE(review): %d interpolation into SQL — safe only while
            # group_id is guaranteed to be an int.
            cr.execute(
                "select sum(train_messages) as tot_train,sum(guess_messages) as tot_guess from crm_bayes_categories where group_id=%d"
                % cat_rec['group_id'][0])
            rec = cr.dictfetchall()
            if not rec[0]['tot_guess']:
                rec[0]['tot_guess'] = 0
            # Share of guessed messages among all messages, as a percentage.
            percantage = float(
                rec[0]['tot_guess'] * 100) / float(rec[0]['tot_guess'] +
                                                   rec[0]['tot_train'])
            group_obj.write(cr, uid, cat_rec['group_id'][0], {
                'train_data': data,
                'automate_test': percantage
            })

            self.write(cr, uid, id, {'state_bayes': 'trained'})
        return True
Esempio n. 8
0
    def getLanguageGuesses(self, stopWords, corpus, languages):
        """Rank candidate languages for *corpus* with a Bayes classifier
        trained on each language's stop-word string."""
        from reverend.thomas import Bayes

        # Everything is coerced into this charset before training/guessing.
        charset = 'us-ascii'

        classifier = Bayes()

        # Train one category per concrete language; 'automatic' is a
        # placeholder choice, not a real language, so skip it.
        for candidate in languages:
            if candidate == 'automatic':
                continue
            stopWordString = stopWords.getStopWordString(candidate)
            classifier.train(candidate,
                             stopWordString.encode(charset, 'replace'))

        # Score the corpus against all trained languages.
        return classifier.guess(corpus.encode(charset, 'replace'))
Esempio n. 9
0
    def action_guess(self, cr, uid, ids, context=None):
        """Guess categories for the selected message and open the wizard.

        NOTE(review): 'result' is not defined in this method — presumably a
        module-level list shared with the train wizard; confirm before
        refactoring.
        """
        guesser = Bayes()
        group_obj = self.pool.get('crm.bayes.group')
        # Empty the shared result list in place (keeps the same object).
        if result:
            for res in range(0, len(result)):
                result.pop(0)
        data = ""
        # Concatenate training blobs from the groups in the active selection.
        for rec in group_obj.browse(cr, uid, context['active_ids']):
            if rec['train_data']:
                data += rec['train_data']
        result_lang = []
        if data:
            # Bayes only loads from disk, so spill the data to a temp file.
            # NOTE(review): file() is Python-2-only.
            myfile = file("/tmp/crm_bayes.bay", 'w')
            myfile.write(data)
            myfile.close()
            guesser.load('/tmp/crm_bayes.bay')
            message = self.read(cr, uid, ids, ['name'])
            result_lang = guesser.guess(message[0]['name'])

        # Build one (category, score) entry per known category, scoring 0
        # for categories the guesser did not mention.
        cat_obj = self.pool.get('crm.bayes.categories')
        cat_id = cat_obj.search(cr, uid, [])
        for re in cat_obj.read(cr, uid, cat_id, ['name']):
            flag = False
            for r in result_lang:
                if r[0] == re['name']:
                    result.append(r)
                    flag = True
                    break
            if not flag:
                result.append((re['name'], 0))
        context_new = {}
        context_new.update({'from_wiz': True})
        context_new.update({'group_id': context.get('active_id', False)})
        # Open the train/test wizard form with the prepared context.
        return {
            'context': context_new,
            'view_type': 'form',
            "view_mode": 'form',
            'res_model': 'crm.bayes.test.train',
            'type': 'ir.actions.act_window',
            'target': 'new',
        }
Esempio n. 10
0
 def guess_message(self, cr, uid, ids, context=None):
     """Score a CRM case description against the trained Bayes groups.

     Returns a list of (category, percentage) tuples; empty when the case
     has no description or no training data exists.

     Fixes over the original: the mutable default (context={}) is replaced
     with None (shared-state hazard; context is unused in the body), and
     the Python-2-only file() builtin is replaced with a with-managed
     open() so the scratch file is always closed.
     """
     cases = self.browse(cr, uid, ids)
     result_lang = []
     if cases.description:
         guesser = Bayes()
         group_obj = self.pool.get('crm.bayes.group')
         data = ""
         # Concatenate training blobs from all active groups.
         for rec in group_obj.browse(
                 cr, uid, group_obj.search(cr, uid,
                                           [('active', '=', True)])):
             if rec['train_data']:
                 data += rec['train_data']
         if data:
             # Bayes only loads from disk, so spill data to a scratch file.
             with open("/tmp/crm_bayes.bay", 'w') as myfile:
                 myfile.write(data)
             guesser.load('/tmp/crm_bayes.bay')
             result_lang = guesser.guess(cases.description)
     # Convert probabilities (0-1) to percentages.
     guess_re = []
     for le in result_lang:
         guess_re.append((le[0], le[1] * 100))
     return guess_re
Esempio n. 11
0
def main():
    """
    Perform a test run of the FeedFilter using defaults.
    """
    # Create a new Bayes guesser, attempt to load data
    guesser = Bayes()
    guesser.load(BAYES_DATA_FN)

    # Open up the databases, load the subscriptions, get new entries.
    feed_db, entry_db = openDBs(FEED_DB_FN, ENTRY_DB_FN)
    # with-block closes the subscriptions file promptly; the original
    # open(...).readlines() leaked the handle.
    with open(FEEDS_FN, "r") as feeds_file:
        feeds = [x.strip() for x in feeds_file.readlines()]
    entries = getNewFeedEntries(feeds, feed_db, entry_db)

    # Build the feed filter.
    f = BayesFilter(guesser, entries)
    f.FEED_META['feed.title'] = FEED_TITLE
    f.FEED_META['feed.tagline'] = FEED_TAGLINE

    # Output the feed as both RSS and Atom, closing each output file
    # deterministically instead of relying on garbage collection.
    with open(FEED_NAME_FN % 'rss', 'w') as out:
        out.write(f.scrape_rss())
    with open(FEED_NAME_FN % 'atom', 'w') as out:
        out.write(f.scrape_atom())

    # Close the databases and save the current guesser's state to disk.
    closeDBs(feed_db, entry_db)
    # The comment above promised a save, but the original never called it;
    # without this, any rating state accumulated during the run is lost.
    guesser.save(BAYES_DATA_FN)
Esempio n. 12
0
'Goal:  Build a language recognizer using a naive bayesian classifier'

# Make a 50 language recognizer trained on 10 books per language at:
# http://www.gutenberg.org/browse/languages/en
# http://www.gutenberg.org/files/1342/1342-0.txt

from reverend.thomas import Bayes

# Train the classifier: one category per language code, using a
# proverbs file as the training corpus for each.
language_sniffer = Bayes()
for lang in ['en', 'es', 'fr', 'de', 'it']:
    filename = 'notes/proverbs_%s.txt' % lang
    # Decode at the I/O boundary: an explicit encoding on open() replaces
    # the Python-2-only bytes.decode('utf-8') round trip, and the
    # with-block guarantees the file is closed.
    with open(filename, encoding='utf-8') as f:
        language_sniffer.train(lang, f.read())

# Apply the classifier
phrases = u'''\
All the leaves are brown and the sky is gray.  I've been for a walk on a winter's day.
De colores, todos los colores. De colores se visten los campos en la primavera.
Jingle bells, jingle all the way. Oh what fun it is to ride in a one horse open sleigh.
Casca belles, hoy es navidad.  Es un dia, de allegria y felicidad.
'''.splitlines()

for phrase in phrases:
    # guess() returns (category, score) pairs, best first.
    best_guess = language_sniffer.guess(phrase)[0][0]
    # print() call form (the original used the Python-2 print statement).
    print(best_guess, '<--', phrase[:30])
"""
pip install reverend
pip install sets
Source Code :https://laslabs.github.io/python-reverend/_modules/reverend/thomas.html
Overview of Bayes Rule: https://towardsdatascience.com/bayes-rule-with-a-simple-and-practical-example-2bce3d0f4ad0
"""
from reverend.thomas import Bayes
g = Bayes()    # guesser
g.train('french','La souris est rentre dans son trou.')
g.train('english','my tailor is rich.')
g.train('french','Je ne sais pas si je viendrai demain.')
g.train('english','I do not plan to update my website soon and I would really like some help from the rest of you idiots.')

print(g.guess('Jumping out of cliffs it not a good idea.'))

# print(g.guess('Demain il fera trs probablement chaud.'))
Esempio n. 14
0
====== RESTART: /Users/raymond/Dropbox/Public/army2/decorator_school.py ======
>>> 
>>> y = big_func(10)
Doing hard work
INFO:root:Called big_func() with (10,) giving 11 in 1.074376 seconds
>>> y = big_func(20)
Doing hard work
INFO:root:Called big_func() with (20,) giving 21 in 1.100503 seconds
>>> show_cache(big_func)
{10: 11, 20: 21}
SyntaxError: invalid syntax
>>> 

>>> 
>>> from reverend.thomas import Bayes
>>> gender = Bayes()
>>> gender.train('male', 'bill hank chris mark martin pat adam hank chris zack sean')
>>> gender.train('female', 'mindy shelly pat mary daisy amber chris pat becky sue')
>>> gender.guess('hank')
[('male', 0.9999)]
>>> gender.guess('mindy')
[('female', 0.9999)]
>>> gender.guess('pat')
[('female', 0.6451612903225806), ('male', 0.35483870967741926)]
>>> gender.guess('chris')
[('male', 0.6875000000000001), ('female', 0.3125)]
>>> gender.train('male', 'red red orange yellow red orange blue black brown blue red yellow')
>>> gender.train('female', 'pink red green green blue blue chartreuse green blue yellow orange blue green')
>>> gender.guess('red')
[('male', 0.8), ('female', 0.19999999999999996)]
>>> gender.guess('pink')
Esempio n. 15
0
 def __init__(self):
     """Create an instance backed by a fresh (untrained) Bayes guesser."""
     self.guesser = Bayes()
Esempio n. 16
0
from reverend.thomas import Bayes

# Train a small four-language classifier from stop-word samples,
# preserving the original language-by-language order.
guesser = Bayes()
for _language, _sample in [
    ('french', 'le la les du un une je il elle de en'),
    ('german', 'der die das ein eine'),
    ('spanish', 'el uno una las de la en'),
    ('english', 'the it she he they them are were to'),
]:
    guesser.train(_language, _sample)

# Demonstration guesses; the return values are intentionally discarded.
guesser.guess('they went to el cantina')
guesser.guess('they were flying planes')

# A little extra English training, then persist the model to disk.
guesser.train('english', 'the rain in spain falls mainly on the plain')
guesser.save('my_guesser.bay')
Esempio n. 17
0
    def action_train(self, cr, uid, ids, context=None):
        """Train the Bayes guesser on wizard messages and refresh statistics.

        NOTE(review): 'result' and 'file_path' are not defined locally —
        presumably module-level globals shared with the guess wizard;
        confirm before refactoring.
        """
        cat_obj = self.pool.get('crm.bayes.categories')
        group_obj = self.pool.get('crm.bayes.group')
        message_obj = self.pool.get('crm.bayes.test.guess')

        for id in ids:
            cat_id = self.read(cr, uid, id, ['category_id', 'name'])
            cat_id = cat_id[0]['category_id']
            if result:
                # Best-scoring guess from the shared result list.
                max_list = max(result, key=lambda k: k[1])
                if cat_id:
                    # Explicit category chosen: count a trained message.
                    cat_guess_msg = cat_obj.read(cr, uid, cat_id,
                                                 ['train_messages'])
                    cat_obj.write(cr, uid, cat_id, {
                        'train_messages':
                        cat_guess_msg['train_messages'] + 1
                    })
                if max_list[1] > 0 and not cat_id:
                    # No category chosen: adopt the best positive guess.
                    cat_id = cat_obj.search(cr, uid,
                                            [('name', '=', max_list[0])])[0]
                    cat_guess_msg = cat_obj.read(cr, uid, cat_id,
                                                 ['guess_messages'])
                    cat_obj.write(cr, uid, cat_id, {
                        'guess_messages':
                        cat_guess_msg['guess_messages'] + 1
                    })
                    self.write(cr, uid, ids, {'category_id': cat_id})
            if cat_id:
                cat_rec = cat_obj.read(cr, uid, cat_id, [])
                guesser = Bayes()
                data = ""
                # Concatenate the group's stored training blobs.
                for rec in group_obj.browse(cr, uid, [cat_rec['group_id'][0]]):
                    if rec['train_data']:
                        data += rec['train_data']
                if data:
                    # Bayes only loads from disk; spill to a scratch file.
                    # NOTE(review): file() is Python-2-only.
                    myfile = file(file_path + "crm_bayes.bay", 'w')
                    myfile.write(data)
                    myfile.close()
                    guesser.load(file_path + "crm_bayes.bay")

                guesser.train(cat_rec['name'],
                              message_obj.read(cr, uid, id)[0]['name'])
                guesser.save(file_path + "crm_bayes.bay")
                # Read the saved model back for storage on the group record.
                # NOTE(review): this handle is never closed explicitly.
                myfile = file(file_path + "crm_bayes.bay", 'r')
                data = ""
                for fi in myfile.readlines():
                    data += fi
                # Aggregate counts; %d interpolation is safe only for ints.
                cr.execute(
                    "select sum(train_messages) as tot_train,sum(guess_messages) as tot_guess from crm_bayes_categories where group_id=%d"
                    % cat_rec['group_id'][0])
                rec = cr.dictfetchall()
                if not rec[0]['tot_guess']:
                    rec[0]['tot_guess'] = 0
                # Guessed share of all messages, as a percentage.
                percantage = float(
                    rec[0]['tot_guess'] * 100) / float(rec[0]['tot_guess'] +
                                                       rec[0]['tot_train'])
                group_obj.write(cr, uid, cat_rec['group_id'][0], {
                    'train_data': data,
                    'automate_test': percantage
                })
            else:
                raise osv.except_osv(_('Error !'),
                                     _('Please Select Category! '))
        return {
            'view_type': 'form',
            "view_mode": 'form',
            'res_model': 'crm.bayes.train.message',
            'type': 'ir.actions.act_window',
            'target': 'new',
        }
Esempio n. 18
0
def run(corpus,
        verbose=False,
        hkap_file=os.path.join(software, 'libs/PACManData.bay'),
        train=False,
        authors=False,
        exact_names=False,
        first_only=False,
        nyears=10,
        plotit=False,
        hst=False,
        clobber=False,
        rs_exceptions=''):
    """Classify a proposal corpus (or a set of authors) with a pre-trained
    Bayes guesser.

    Returns (result, uber_categories): in corpus mode result is a single
    guess list; in authors mode it is a dict keyed by author with 'hkap'
    (guesses) and 'cites' (sorted citation counts) entries.
    uber_categories maps category names to their synonym lists.
    """
    # Load category-synonym acronyms from the sidecar text file.
    f = open(os.path.join(software, 'category_synonyms.txt'), 'r')
    lines = f.readlines()
    f.close()
    acronyms = {}
    for line in lines:
        if line.startswith('#'): continue
        key, value = line.split('=')
        acronyms[key.strip()] = value.strip().split(',')
    uber_categories = acronyms

    stopwords = load_stopwords()

    # Pre-trained Bayes guesser loaded from disk.
    dguesser = Bayes()
    dguesser.load(hkap_file)

    if not authors:
        if hst:
            ## Below, proposals are retrieved, then parsed.
            # Renamed from 'abs' to stop shadowing the builtin.
            abstracts = parse_abstracts_proposals(corpus)
            text = parse_science_justification_proposals(corpus)
            justification = abstracts + text
            bayesString = " " + justification
        else:
            # Plain text corpus: concatenate non-comment, non-blank lines.
            f = open(corpus)
            lines = f.readlines()
            f.close()
            text = ''
            for line in lines:
                if line.startswith('#'): continue
                if not line.strip(): continue
                text += line.strip() + ' '
            bayesString = text
        bayesString = work_string(bayesString, stopwords)
        result = dguesser.guess(bayesString)
        result = normalize_result(result)

    else:
        ## assumes input is a person report
        ## if .pkl report not available, creates new one
        import util

        records = []
        results_dict = {}
        results_pkl = corpus.replace(corpus.split('.')[-1], 'pkl')
        if not os.path.isfile(results_pkl) or clobber:
            f = open(corpus)
            lines = f.readlines()
            f.close()
            for line in lines:
                if line.startswith('#'): continue
                if not line.strip(): continue
                info = line.rstrip().split("\t")
                if info[0] == '': continue
                records.append(info[0].replace('"', '').replace("'",
                                                                '').lower())
            author_dict, cite_dict = util.adscrawl.run_authors(
                records, nyears=nyears, rs_exceptions=rs_exceptions)
            pickle.dump(author_dict, open(results_pkl, 'wb'))
            pickle.dump(cite_dict, open('cites.pkl', 'wb'))
        else:
            author_dict = pickle.load(open(results_pkl, 'rb'))
            cite_dict = pickle.load(open('cites.pkl', 'rb'))
        for author in author_dict.keys():
            bayesString = ''
            # BUG FIX: the original used '=' here, so only the author's
            # final abstract was ever classified; accumulate all of them.
            for abstract in author_dict[author]:
                bayesString += ' ' + abstract

            bayesString = work_string(bayesString, stopwords)
            result = dguesser.guess(bayesString)
            results_dict[author] = {}
            results_dict[author]['hkap'] = rec.fromrecords(result)
            try:
                results_dict[author]['cites'] = sorted(cite_dict[author],
                                                       reverse=True)
            except:
                results_dict[author]['cites'] = [0]
        result = results_dict
    return (result, uber_categories)
Esempio n. 19
0
            # by default, self.combiner is set to self.robinson
            state['combiner'] = None
    return state


def Bayes__setstate__(self, state):
    """Restore pickled Bayes state, re-binding the default combiner.

    The default combiner is the bound method ``self.robinson``, which
    cannot be pickled; __getstate__ stores None in its place, so a stored
    None is rebuilt into the bound method here.
    """
    self.__dict__.update(state)
    # A present-but-None combiner means "use the default" (robinson).
    if state.get('combiner', False) is None:
        self.combiner = self.robinson


# Monkey-patch the pickle hooks onto Bayes so the default (unpicklable)
# bound-method combiner survives a save/load round trip.
Bayes.__getstate__ = Bayes__getstate__
Bayes.__setstate__ = Bayes__setstate__

# Module-level classifier instance, now picklable via the patched hooks.
bayes = Bayes()

# Traverses all files and directories starting from a root directory
# Adds normalized files to trainingData dict


def getCorpus(path, classification):

    for root, subFolders, fileNames in os.walk(path):
        for fileName in fileNames:

            # Learn type of file - only want text files
            fileType = mimetypes.guess_type(fileName)

            if (fileType[1] is None and fileType[0] is None) or re.match(
                    combinedMimeRegex, fileType[0]):
Esempio n. 20
0
)



# Load the raw polarity corpora, one review per line.  with-blocks close
# the files promptly (the original leaked both handles).
with open(BASE_DIR + "/data/rt-polarity.neg") as fh:
    neg_file = fh.read()
with open(BASE_DIR + "/data/rt-polarity.pos") as fh:
    pos_file = fh.read()
neg_tweets_list = str(neg_file).split('\n')
pos_tweets_list = str(pos_file).split('\n')

# First 3/4 of each corpus trains; the remainder tests.  len() replaces
# the dunder __len__() calls.
neg_cutoff = int(len(neg_tweets_list) * 3 / 4)
pos_cutoff = int(len(pos_tweets_list) * 3 / 4)

neg_train = neg_tweets_list[:neg_cutoff]
# BUG FIX: the original sliced pos_train with neg_cutoff, silently using
# the wrong boundary whenever the corpora differ in length.
pos_train = pos_tweets_list[:pos_cutoff]

neg_test = neg_tweets_list[neg_cutoff:]
pos_test = pos_tweets_list[pos_cutoff:]
tweet_data = {'neg_train': neg_train, 'pos_train': pos_train,
              'neg_test': neg_test, 'pos_test': pos_test}

# Four pre-trained classifiers, each with a different tokenizer strategy.
bestwords = get_best_words(pos_train, neg_train)
single_classifier = Bayes()
single_classifier.load(fname=BASE_DIR+"/data/rt_polarity_classifiers/single_classifier.dat")
non_stop_classifier = Bayes(tokenizer=non_stop_tokenizer())
non_stop_classifier.load(fname=BASE_DIR+"/data/rt_polarity_classifiers/single_stop_classifier.dat")
best_classifier = Bayes(tokenizer=best_tokenizer(best_words=bestwords))
best_classifier.load(fname=BASE_DIR+"/data/rt_polarity_classifiers/single_best_classifier.dat")
bigram_best_classifier = Bayes(tokenizer=best_bigram_tokenizer(best_words=bestwords))
bigram_best_classifier.load(fname=BASE_DIR+"/data/rt_polarity_classifiers/single_bi_classifier.dat")
Esempio n. 21
0
# nthcolumn
import mechanize
import cookielib
import time
import os, sys, requests, pattern, json, tweepy
import numpy as np

from random import randint
from pattern.en import sentiment
from bs4 import BeautifulSoup

from six.moves.html_parser import HTMLParser
h = HTMLParser()

from reverend.thomas import Bayes
ai = Bayes()

from hackernews import HackerNews
hn = HackerNews()

# Twitter credentials live in the local config file.
with open('./Documents/tattle/config.json') as data_file:
    settings = json.load(data_file)

consumer_key = settings['twitter']['consumer_key']
consumer_secret = settings['twitter']['consumer_secret']
access_key = settings['twitter']['access_token_key']
access_secret = settings['twitter']['access_token_secret']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# BUG FIX: the original passed undefined 'access_token' (NameError); the
# value loaded above is bound to access_key.
auth.set_access_token(access_key, access_secret)
twitter = tweepy.API(auth)