Example #1
def test_debug(caplog):
    debuglevel(4)
    debug(4, 'hi there')
    assert 'hi there' in caplog.text
    debug(5, 'secret')
    assert 'secret' not in caplog.text
    debuglevel(5)
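The listing shows debuglevel and debug only from the call side. For orientation, here is a minimal sketch of what opp.debug plausibly provides, assuming a module-level threshold and the 'opp' logger that the other examples configure; the internals below are an assumption, not the project's actual code.

import logging

logger = logging.getLogger('opp')
_level = 1  # assumed default threshold

def debuglevel(level=None):
    """Set the global debug level; with no argument, return the current level."""
    global _level
    if level is None:
        return _level
    _level = level

def debug(level, msg, *args):
    """Emit msg (with %-style args) only if level does not exceed the threshold."""
    if level <= _level:
        logger.debug(msg, *args)

Under this reading, Example #1 behaves as asserted: debug(5, 'secret') sits above the threshold of 4 and is dropped.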
Example #2
def test_linkcontext(basefile, title):
    debuglevel(5)
    html = readfile(os.path.join(testdir, basefile+'.html'))
    content = readfile(os.path.join(testdir, basefile+'.txt')).strip()
    doc = models.Doc(title=title)
    res = blogpostparser.extract_content(html, doc)
    assert re.sub(r'\s+', ' ', content) == re.sub(r'\s+', ' ', res)
Example #3
def test_linkcontext(page, link, context, caplog):
    caplog.set_level(logging.CRITICAL, logger='selenium')
    caplog.set_level(logging.DEBUG, logger='opp')
    debuglevel(5)
    curpath = os.path.abspath(os.path.dirname(__file__))
    testdir = os.path.join(curpath, 'sourcepages')
    browser = Browser(reuse_browser=True, use_virtual_display=VDISPLAY)
    src = 'file://'+testdir+'/'+page
    browser.goto(src)
    el = browser.find_elements_by_xpath('//a[@href="{}"]'.format(link))[0]
    li = Link(element=el)
    res = li.html_context()
    assert res == context
Example #4
def evaluate(doc):
    debug(4, 'trying to guess document type')
    probs = {
        'book': bookfilter.test(doc, debug=debuglevel()>3, smooth=False),
        'chapter': chapterfilter.test(doc, debug=debuglevel()>3, smooth=True),
        'thesis': thesisfilter.test(doc, debug=debuglevel()>3, smooth=False),
        'review': reviewfilter.test(doc, debug=debuglevel()>3, smooth=True)
    }
    debug(2, 'doctyper: %s', ', '.join(['{} {}'.format(k,v) for k,v in probs.items()]))
    if max(probs.values()) > 0.5:
        return max(probs, key=probs.get)
    else:
        return 'article'
Example #5
def test_process_link(testdb, caplog):
    source = Source(url='http://umsu.de/papers/')
    source.load_from_db()
    browser = scraper.Browser(use_virtual_display=VDISPLAY)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    link = 'options.pdf'
    el = browser.find_element_by_xpath("//a[@href='{}']".format(link))
    url = source.make_absolute(link)
    li = Link(url=url, source=source, element=el)
    li.load_from_db()
    debuglevel(2)
    scraper.process_link(li, force_reprocess=True, keep_tempfiles=True)
    debuglevel(5)
    assert 'Options and Actions' in caplog.text
    assert 'But even if we know' in caplog.text
Example #6
def extractor(xmlfile):
    cmd = [PERL, join(path, 'Extractor.pm'), "-v{}".format(debuglevel()), xmlfile]
    debug(2, ' '.join(cmd))
    try:
        output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=60)
        output = output.decode('utf-8', 'ignore')
    except subprocess.CalledProcessError as e:
        debug(1, e.output)
        return False
    except subprocess.TimeoutExpired as e:
        debug(1, 'Extractor timeout!')
        return False
    json_separator = '=========== RESULT ===========\n'
    if json_separator not in output:
        debug(1, 'Extractor failed:\n%s', output)
        return False
    log,jsonstr = output.split(json_separator, 1)
    debug(1, log)
    res = json.loads(jsonstr)
    return res
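A short usage sketch for the function above; the file path is a placeholder, and treating the result as a JSON object with string keys is an assumption about what Extractor.pm emits.

res = extractor('/tmp/converted.xml')  # hypothetical path
if res is False:
    debug(1, 'extraction failed, skipping document')
else:
    debug(3, 'extractor returned fields: %s', ', '.join(res.keys()))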
Example #7
import sys
import logging
import argparse
from opp import scraper, debug

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

ap = argparse.ArgumentParser()
ap.add_argument('filename', help='file to process')
ap.add_argument('-d', '--debuglevel', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-u', '--url', type=str, help='link url')
ap.add_argument('-l', '--linkcontext', type=str, help='link context')
ap.add_argument('-a', '--anchortext', type=str, help='anchortext')
ap.add_argument('-s', '--sourcehtml', type=str, help='source page html')
args = ap.parse_args()

debug.debuglevel(args.debuglevel or 2)

# set up doc for processing:
filetype = 'pdf' if 'pdf' in args.filename else 'doc'
doc = scraper.Doc(filetype=filetype)
doc.link = scraper.Link(url=args.url or 'foo')
doc.link.context = args.linkcontext or 'foo'
doc.link.anchortext = args.anchortext or 'foo'
doc.source = scraper.Source(url='foo', html=(args.sourcehtml or 'foo'))
doc.tempfile = args.filename

# process
scraper.process_file(doc, keep_tempfiles=args.keep)
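As a quick check of the flag wiring above, the parser can also be fed an explicit argument list; the filename and URL here are placeholders rather than values taken from the source.

args = ap.parse_args(['paper.pdf', '-d', '4', '-k', '-u', 'http://example.com/paper.pdf'])
assert args.debuglevel == 4 and args.keep and args.url == 'http://example.com/paper.pdf'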
Example #8
def start(self):
    debuglevel(3)
    super().start()
    self.run()
Example #9
#!/usr/bin/env python3
import sys
import logging
import findmodules
from opp import db, debug
from opp.doctyper import classifier

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

debug.debuglevel(4)

cur = db.cursor()
query = ("SELECT label FROM cats")
cur.execute(query)
for row in cur.fetchall():
    classifier.update_classifier(row[0])
Example #10
#!/usr/bin/env python3
import pytest
import os.path
from opp.doctyper import doctyper
from opp import scraper
from opp.debug import debuglevel

debuglevel(3)

curpath = os.path.abspath(os.path.dirname(__file__))
testdir = os.path.join(curpath, 'testdocs')

def test_simplepaper():
    doc = scraper.Doc(url='http://umsu.de/papers/variations.pdf')
    doc.link = scraper.Link(url='http://umsu.de/papers/variations.pdf')
    doc.link.anchortext = 'Download'
    doc.link.context = 'Foo bar'
    doc.content = readfile(os.path.join(testdir, 'attitudes.txt'))
    doc.numwords = 10200
    doc.numpages = 22
    assert doctyper.evaluate(doc) == 'article'

def test_pretendbook():
    doc = scraper.Doc(url='http://umsu.de/papers/variations.pdf')
    doc.link = scraper.Link(url='http://umsu.de/papers/variations.pdf')
    doc.link.anchortext = 'Download'
    doc.link.context = 'Foo bar'
    doc.content = readfile(os.path.join(testdir, 'attitudes.txt')) * 10
    doc.numwords = 10200 * 10
    doc.numpages = 22 * 10
    assert doctyper.evaluate(doc) == 'book'
Example #11
def evaluate(doc):
    debug(4, 'trying to guess if document is a paper')
    debugflag = debuglevel() > 3
    return classifier.test(doc, debug=debugflag, smooth=True)
Example #12
import os.path
import sys
import json
from datetime import datetime
import pytest
from opp.models import Source, Link, Doc
from opp.debug import debuglevel
from opp import db

"""
To run these tests, create a test database called test_opp and
give the standard mysql user access to it.
"""

VDISPLAY = True

debuglevel(5)

curpath = os.path.abspath(os.path.dirname(__file__))
testdir = os.path.join(curpath, 'testdocs')

@pytest.fixture(scope='module')
def testdb():
    """set up test database"""
    db.close()
    db.connection(db='test_opp')
    cur = db.cursor()
    for t in ('sources', 'links', 'docs'):
        cur.execute('DELETE FROM {}'.format(t))
    db.commit()
    Source(
        url='http://umsu.de/papers/',
Example #13
import sys
import logging
import argparse
from opp import db, scraper, debug

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

ap = argparse.ArgumentParser()
ap.add_argument('url', help='(part of) url of source page to scrape')
ap.add_argument('-d', '--debug_level', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-l', '--link', type=str, help='only process this link')
args = ap.parse_args()

debug.debuglevel(args.debug_level)

cur = db.dict_cursor()
query = "SELECT * FROM sources WHERE url LIKE %s LIMIT 1"
cur.execute(query, ('%'+args.url+'%',))
sources = cur.fetchall()
if not sources:
    raise Exception(args.url+' not in sources table')
source = scraper.Source(**sources[0])

if args.link:
    browser = scraper.Browser(use_virtual_display=True)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    try:
        el = browser.find_element_by_xpath("//a[contains(@href, '{}')]".format(args.link))