Ejemplo n.º 1
0
 def make_page_classifier(self):
     """Build the classifier that rates whether a page is a source of papers.

     The returned BinaryNaiveBayes instance is meant to be applied to a
     page (Response object); the features registered here read the page's
     ``text``, ``textlower``, ``url`` and ``authorname`` attributes.
     """
     def links_to_documents(page):
         # More than one href ending in .pdf, .doc or .docx.
         doc_links = re.findall(r'href=[^>]+\.(?:pdf|docx?)\b', page.text, re.IGNORECASE)
         return len(doc_links) > 1

     def mentions_syllabus(page):
         return 'syllabus' in page.textlower

     def looks_like_conference(page):
         # Combined number of occurrences of the three keywords.
         lowered = page.textlower
         hits = sum(lowered.count(word) for word in ('schedule', 'break', 'dinner'))
         return hits > 2

     def surname_in_url(page):
         # The last whitespace-separated token of the author name is taken
         # as the surname.
         # NOTE(review): split()[-1] raises IndexError when authorname is
         # empty or whitespace-only -- confirm callers always set a name.
         surname = page.authorname.split()[-1].lower()
         return surname in page.url.lower()

     # (description, predicate, p_ifyes, p_ifno), registered in this order.
     features = (
         ("contains at least 2 links to '.pdf' or '.doc'", links_to_documents, 0.99, 0.2),
         ("contains 'syllabus'", mentions_syllabus, 0.1, 0.2),
         ("contains conference keywords", looks_like_conference, 0.01, 0.2),
         ("author name in url", surname_in_url, 0.6, 0.1),
     )

     page_classifier = BinaryNaiveBayes(prior_yes=0.6)
     for description, predicate, if_yes, if_no in features:
         page_classifier.likelihood(description, predicate, p_ifyes=if_yes, p_ifno=if_no)
     return page_classifier
Ejemplo n.º 2
0
import re
from statistics import median
from scipy.stats import nbinom
#import sys, os.path
#curpath = os.path.abspath(os.path.dirname(__file__))
#libpath = os.path.join(curpath, os.path.pardir)
#sys.path.insert(0, libpath)
from opp.subjectivebayes import BinaryNaiveBayes
from opp.debug import debug, debuglevel

"""
classifier to evaluate whether a pdf/word document is a paper (or
book etc.), as opposed to a handout, a cv, lecture slides etc.
"""

# Module-level classifier instance, created with prior_yes=0.6; the feature
# functions defined below are registered on it via classifier.likelihood().
classifier = BinaryNaiveBayes(prior_yes=0.6)

# URL fragments that hint at course material, talks or handouts.
_BAD_URL_RE = re.compile(r'\bcours|\blecture|\btalk|handout|teaching')


def bad_url(doc):
    """Search the lower-cased ``doc.url`` for a teaching/talk-related fragment.

    Returns the regex match object if one is found, otherwise None.
    """
    return _BAD_URL_RE.search(doc.url.lower())
# Register bad_url as a feature. p_ifyes / p_ifno are presumably the
# probabilities of the feature given "is a paper" / "is not a paper"
# (0.05 vs. 0.2 here) -- confirm against BinaryNaiveBayes.
classifier.likelihood('bad url', bad_url, p_ifyes=0.05, p_ifno=0.2)

# Anchor texts typical of navigation links, slides and handouts.
_BAD_ANCHORTEXT_RE = re.compile(r'^site\s*map$|^home|page\b|\bslides\b|handout')


def bad_anchortext(doc):
    """Search the lower-cased ``doc.link.anchortext`` for a non-paper hint.

    Returns the regex match object if one is found, otherwise None.
    """
    return _BAD_ANCHORTEXT_RE.search(doc.link.anchortext.lower())
# Register bad_anchortext as a feature. p_ifyes / p_ifno are presumably the
# probabilities of the feature given "is a paper" / "is not a paper"
# (0.005 vs. 0.3 here) -- confirm against BinaryNaiveBayes.
classifier.likelihood('bad anchortext', bad_anchortext, p_ifyes=0.005, p_ifno=0.3)

# Words in the text around a link that suggest a paper draft.
_GOOD_LINKCONTEXT_RE = re.compile(r'penultimate|draft|forthcoming')


def good_linkcontext(doc):
    """Search the lower-cased ``doc.link.context`` for a draft-related word.

    Returns the regex match object if one is found, otherwise None.
    """
    return _GOOD_LINKCONTEXT_RE.search(doc.link.context.lower())
# Register good_linkcontext as a feature. p_ifyes / p_ifno are presumably
# the probabilities of the feature given "is a paper" / "is not a paper"
# (0.2 vs. 0.05 here) -- confirm against BinaryNaiveBayes.
classifier.likelihood('good link context', good_linkcontext, p_ifyes=0.2, p_ifno=0.05)
Ejemplo n.º 3
0
        return string in doc.link.context.lower()
    return check

def in_beginning(regex):
    """Create a feature function that looks for `regex` near the document start.

    The pattern is compiled case-insensitively. The returned function takes
    a document and searches the first 5000 characters of ``doc.content``,
    returning the match object (or None). If ``doc.content`` is empty or
    otherwise falsy, it returns Ellipsis instead.
    """
    pattern = re.compile(regex, re.I)

    def check(doc):
        if doc.content:
            return pattern.search(doc.content[:5000])
        # Nothing to inspect: Ellipsis is returned rather than a
        # match/None result.
        return Ellipsis

    return check


# =========================================================================

# Classifier for books, created with prior_yes=0.2.
bookfilter = BinaryNaiveBayes(prior_yes=0.2)

# 'numwords' feature: `length` is defined elsewhere in this file (not shown
# here). Instead of fixed probabilities, frozen negative-binomial
# distributions are passed: nbinom(7, 0.0001) has mean ~70,000 and
# nbinom(1, 0.0001) has mean ~10,000. Presumably they are evaluated at the
# value returned by `length` -- confirm against BinaryNaiveBayes.
bookfilter.likelihood('numwords', length, 
                      p_ifyes=nbinom(7, 0.0001), p_ifno=nbinom(1, 0.0001))

# TODO: add more features? E.g. an "Acknowledgements" section, occurrences
# of "this book", a table of contents, an index, ...

# =========================================================================

# Classifier for book chapters, created with prior_yes=0.2.
chapterfilter = BinaryNaiveBayes(prior_yes=0.2)

# 'numwords' feature: `length` is defined elsewhere in this file (not shown
# here). Frozen negative-binomial distributions are passed instead of fixed
# probabilities: nbinom(2, 0.0002) has mean ~10,000 and nbinom(3, 0.0002)
# has mean ~15,000. Presumably they are evaluated at the value returned by
# `length` -- confirm against BinaryNaiveBayes.
chapterfilter.likelihood('numwords', length, 
                         p_ifyes=nbinom(2, 0.0002), p_ifno=nbinom(3, 0.0002))

chapterfilter.likelihood('"chapter" occurs in link context', in_context('chapter'),
Ejemplo n.º 4
0
def test_basic():
    """One always-true feature with an even prior should give a result near 0.75."""
    def always_true(x):
        return True

    model = BinaryNaiveBayes(prior_yes=0.5)
    model.likelihood('', always_true, p_ifyes=0.3, p_ifno=0.1)
    # Bayes' rule: (0.3 * 0.5) / (0.3 * 0.5 + 0.1 * 0.5) = 0.75
    posterior = model.test(0)
    assert 0.749 < posterior < 0.751
Ejemplo n.º 5
0
def test_medical():
    """A strong always-true feature with a tiny prior should still give a small result."""
    def always_true(x):
        return True

    model = BinaryNaiveBayes(prior_yes=0.0001)
    model.likelihood('', always_true, p_ifyes=0.99, p_ifno=0.01)
    # Bayes' rule: (0.99 * 0.0001) / (0.99 * 0.0001 + 0.01 * 0.9999) ~= 0.0098
    posterior = model.test(0)
    assert 0.0097 < posterior < 0.0099