Ejemplo n.º 1
def main():
    """Fetch a web page and classify it by the names of its HTML tags.

    Reads the page URL from ``sys.argv[1]`` and a model argument from
    ``sys.argv[2]``, downloads the page, collects the name of every tag
    BeautifulSoup finds in it, hands that list to ``NaiveBayes.classify``
    and prints the returned classification.

    Raises:
        IndexError: if fewer than two command-line arguments are supplied.
    """
    # NOTE(review): `model` is read but never used below, so the classifier
    # is queried exactly as constructed -- presumably it should be loaded or
    # trained from this argument; confirm against the NaiveBayes API.
    url, model = sys.argv[1], sys.argv[2]
    classifier = NaiveBayes()

    # Close the HTTP response explicitly rather than leaving the socket to
    # be reclaimed whenever the garbage collector gets to it.
    response = urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(page)
    # The feature vector is just the tag names, in the order findAll yields
    # them.
    tags = [tag.name for tag in soup.findAll(True)]
    classification = classifier.classify(tags)

    print("Classified as: %s" % classification)
Ejemplo n.º 2
from naivebayes import NaiveBayes

from TwitterAPI import TwitterAPI

# Interactive labelling session: stream tweets and ask the operator to tag
# each one as "basic" or "non-basic".
# The four credentials are expected to be defined before this point; they are
# not set anywhere in the visible portion of the script.
api = TwitterAPI(consumer_key, consumer_secret, access_token_key, access_token_secret)

# Collected examples, stored as (label, tweet text) pairs.
training_set = []

# NOTE(review): 'locations' is presumably a longitude/latitude bounding box
# for the streaming filter -- confirm against the Twitter API documentation.
r = api.request('statuses/filter', {'locations':'-74,40,-73,41'})

# NOTE(review): the classifier is created here but not used in the visible
# portion of the script.
nb = NaiveBayes()

cin = ''
for item in r:
    # A parenthesised single argument prints identically under the Python 2
    # print statement this script was written for.
    print(item['text'])
    cin = raw_input('Basic? Y/n/quit: ')
    if cin == 'quit':
        # Stop labelling; previously 'quit' recorded the tweet as 'basic'
        # and the loop kept running.
        break
    if cin == 'n':
        training_set.append(('non-basic', item['text']))
    else:
        # 'Y' is the capitalised default in the prompt, so any answer other
        # than 'n' or 'quit' counts as "basic". Previously such answers
        # were silently dropped.
        training_set.append(('basic', item['text']))

Ejemplo n.º 3
def main(argv):
    """Train or apply a naive Bayes model over the items of a feature DB.

    Command line: ``[-d] [-o path] [-i path] [-r ratio] [-s supports]
    [-v vars] feats.db [items ...]``

    * ``-d``  increments a debug counter (not read in this function).
    * ``-o``  path the model is exported to; must not already exist.
    * ``-i``  path a model is imported from; switches to prediction.
    * ``-r``  fraction of the largest feature count used as the prominence
      threshold in prediction (default 0.5).
    * ``-s``  maximum number of supporting features reported per candidate
      word (default 3).
    * ``-v``  value passed to ``getvars``; the result feeds
      ``getdefaultnames``.

    Without ``-i`` every selected item is fed to ``learn``; with ``-i``
    every selected item is fed to ``predict``. When neither ``-i`` nor
    ``-o`` is given, the items are additionally run through ``predict``
    after learning.

    Args:
        argv: full argument vector, program name first.

    Returns:
        100 after printing the usage message (bad option or no DB path),
        1 if the ``-o`` path already exists, 0 otherwise.

    Raises:
        AssertionError: if both ``-i`` and ``-o`` are given.
    """
    import fileinput
    import getopt

    def usage():
        print(f'usage: {argv[0]} '
              '[-d] [-o path] [-i path] [-r ratio] [-s supports] [-v vars] '
              'feats.db [items ...]')
        return 100

    # The `try:` line was missing: the getopt call was stranded inside
    # usage() after its return, and the `except` below had no matching try.
    try:
        (opts, args) = getopt.getopt(argv[1:], 'do:i:r:s:v:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    outpath = None
    inpath = None
    ratio = 0.5
    maxsupports = 3
    types = {}
    C = 0.4  # distance weight should be (1.5**d) ~= exp(0.4*d)
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-o': outpath = v
        elif k == '-i': inpath = v
        elif k == '-r': ratio = float(v)
        elif k == '-s': maxsupports = int(v)
        elif k == '-v': types = getvars(v)
    assert inpath is None or outpath is None
    if outpath is not None and os.path.exists(outpath):
        print('Already exists: %r' % outpath)
        return 1
    if not args: return usage()
    dbpath = args.pop(0)
    db = FeatDB(dbpath)
    # Total item count, taken before any filtering by name below.
    nallitems = len(db.get_items())
    defaultnames = getdefaultnames(types)
    items = db.get_items()
    if args:
        # Restrict processing to the items named on the command line.
        items = [(tid, item) for (tid, item) in items if item in args]

    def learn(tid, item, fids):
        """Add one item's features to the model under each word of its name.

        ``fids`` maps a feature id to a ``(count, _)`` pair; the entry under
        key 0 supplies the count passed to ``nb.adddict``. Always returns
        True.
        """
        name = stripid(item)
        words = splitwords(name)
        (count, _) = fids[0]
        feats = {feat: fc for (feat, (fc, _)) in fids.items()}
        for w in words:
            nb.adddict(w, count, feats)
        return True

    def predict(tid, item, fids):
        """Check whether the model's top candidate word is in the item's name.

        Returns True when the best candidate is one of the item's own words.
        Otherwise prints ``+``-prefixed diagnostic records and returns
        False. Also returns False, silently, when there are no candidates.
        """
        name = stripid(item)
        words = splitwords(name)
        (count, _) = fids[0]
        feats = {fid: fc for (fid, (fc, _)) in fids.items() if fid != 0}
        # Use only prominent features that appear more than a certain threshold.
        threshold = int(max(feats.values()) * ratio)
        f2 = [feat for (feat, fc) in feats.items() if threshold <= fc]
        # Take this item's contribution out of the model while querying it,
        # then put it back afterwards.
        for w in words:
            nb.removedict(w, count, feats)
        cands = nb.getkeyfeats(f2)[:len(words)]
        for w in words:
            nb.adddict(w, count, feats)
        if not cands: return False
        cwords = [w for (_, w, _) in cands]
        topword = cwords[0]
        if topword in words: return True
        print('+ITEM', json.dumps(item))
        print('+WORDS', json.dumps(words))
        print('+CANDS', json.dumps(cwords))
        if item in defaultnames:
            print('+DEFAULT', json.dumps(defaultnames[item]))
        fids0 = db.get_feats(tid, source=True)
        # Group this item's sources by the first field of each feature,
        # keeping only the values 0, 1 and -1.
        srcs0 = {0: [], 1: [], -1: []}
        for (fid, (_, srcs)) in fids0.items():
            if fid == 0:
                d = 0
            else:
                # The `else:` was missing, so `d = 0` above was dead code.
                d = db.get_feat(fid)[0]
            if d in srcs0:
                # Restored body: srcs0 holds lists that are de-duplicated
                # per key in the dump below.
                srcs0[d].extend(srcs)
        # NOTE(review): the opening of this print call was lost; only its
        # json.dumps argument survived. The '+SOURCE' label is assumed from
        # the naming of the other records -- confirm against the consumer
        # of this output.
        print('+SOURCE',
              json.dumps([(d, list(set(srcs))) for (d, srcs) in srcs0.items()]))
        supports = []
        for (_, w, a) in cands:
            # Find top N features for each word.
            fs = []
            for (fid, c) in a[1:]:
                feat = db.get_feat(fid)
                assert feat is not None
                # A rarer feature overall means stronger indication.
                df = math.log(nallitems / db.get_numfeatitems(fid))
                # A more prominent feature for this category means stronger indication.
                ff = c / nb.fcount[fid][None]
                # Discount a "distant" feature from the subject.
                ds = math.exp(-C * abs(feat[0]))
                fs.append((ds * df * ff, fid, feat))
            fs = sorted(fs, reverse=True)[:maxsupports]
            score = sum(s for (s, _, _) in fs)
            # Find the variables that contain the same feature.
            ss = []
            for (_, fid, _) in fs:
                found = None
                (_, srcs0a) = fids0[0]
                (_, srcs0b) = fids0[fid]
                tids = db.get_featitems(fid)
                for tid1 in tids.keys():
                    if tid1 == tid: continue
                    item1 = db.get_item(tid1)
                    name1 = stripid(item1)
                    if w not in splitwords(name1): continue
                    fids1 = db.get_feats(tid1, source=True)
                    (_, srcs1a) = fids1[0]
                    (_, srcs1b) = fids1[fid]
                    found = (srcs0a + srcs0b, item1, srcs1a + srcs1b)
                    # NOTE(review): stopping at the first match is assumed;
                    # without this the last match would be reported.
                    break
                # One entry per feature keeps ss aligned with fs for the
                # zip() below; `found` was previously computed and dropped.
                ss.append(found)
            supports.append((w, score, list(zip(fs, ss))))
        print('+SCORE', json.dumps(sum(score for (_, score, _) in supports)))
        print('+SUPPORTS', json.dumps(supports))
        return False

    nb = NaiveBayes()
    proc = learn
    if inpath is not None:
        print(f'Importing model: {inpath!r}', file=sys.stderr)
        with open(inpath, 'rb') as fp:
            # NOTE(review): the statement that reads the model from `fp` was
            # lost; `load` is the assumed method name -- confirm against
            # the NaiveBayes class.
            nb.load(fp)
            proc = predict

    # n counts processed items; m counts those for which proc returned True.
    n = m = 0
    for (tid, item) in items:
        fids = db.get_feats(tid)
        n += 1
        if proc(tid, item, fids):
            m += 1
    print(f'\nProcessed: {m}/{n}', file=sys.stderr)

    if outpath is not None:
        print(f'Exporting model: {outpath!r}', file=sys.stderr)
        with open(outpath, 'wb') as fp:
            # NOTE(review): the `with` body was lost; `save` is the assumed
            # method name -- confirm against the NaiveBayes class.
            nb.save(fp)

    if inpath is None and outpath is None:
        # Neither importing nor exporting: run prediction over the same
        # items that were just learned.
        for (tid, item) in items:
            fids = db.get_feats(tid)
            predict(tid, item, fids)

    return 0