def test_debug(caplog):
    # messages above the current debug level should be suppressed;
    # caplog.text is a property in current pytest, not a method
    debuglevel(4)
    debug(4, 'hi there')
    assert 'hi there' in caplog.text
    debug(5, 'secret')
    assert 'secret' not in caplog.text
    # restore the level used by the rest of the suite
    debuglevel(5)

def test_linkcontext(basefile, title):
    debuglevel(5)
    html = readfile(os.path.join(testdir, basefile+'.html'))
    content = readfile(os.path.join(testdir, basefile+'.txt')).strip()
    doc = models.Doc(title=title)
    res = blogpostparser.extract_content(html, doc)
    # compare with whitespace normalized, since extraction may reflow text
    assert re.sub(r'\s+', ' ', content) == re.sub(r'\s+', ' ', res)

def test_linkcontext(page, link, context, caplog):
    # silence selenium chatter, keep opp's own log output;
    # caplog.set_level is the current pytest API
    caplog.set_level(logging.CRITICAL, logger='selenium')
    caplog.set_level(logging.DEBUG, logger='opp')
    debuglevel(5)
    curpath = os.path.abspath(os.path.dirname(__file__))
    testdir = os.path.join(curpath, 'sourcepages')
    browser = Browser(reuse_browser=True, use_virtual_display=VDISPLAY)
    src = 'file://'+testdir+'/'+page
    browser.goto(src)
    el = browser.find_elements_by_xpath('//a[@href="{}"]'.format(link))[0]
    li = Link(element=el)
    res = li.html_context()
    assert res == context

def evaluate(doc):
    debug(4, 'trying to guess document type')
    probs = {
        'book': bookfilter.test(doc, debug=debuglevel() > 3, smooth=False),
        'chapter': chapterfilter.test(doc, debug=debuglevel() > 3, smooth=True),
        'thesis': thesisfilter.test(doc, debug=debuglevel() > 3, smooth=False),
        'review': reviewfilter.test(doc, debug=debuglevel() > 3, smooth=True)
    }
    debug(2, 'doctyper: %s', ', '.join('{} {}'.format(k, v) for k, v in probs.items()))
    # fall back to 'article' unless some classifier is reasonably confident
    if max(probs.values()) > 0.5:
        return max(probs, key=probs.get)
    else:
        return 'article'

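# For orientation, a hedged sketch of calling evaluate() directly; the Doc
# fields mirror the doctyper tests further below and are illustrative only:
doc = scraper.Doc(url='http://umsu.de/papers/variations.pdf')
doc.content = readfile(os.path.join(testdir, 'attitudes.txt'))
doc.numwords = 10200
doc.numpages = 22
print(evaluate(doc))  # 'book', 'chapter', 'thesis', 'review', or 'article'
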
def test_process_link(testdb, caplog):
    source = Source(url='http://umsu.de/papers/')
    source.load_from_db()
    browser = scraper.Browser(use_virtual_display=VDISPLAY)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    link = 'options.pdf'
    el = browser.find_element_by_xpath("//a[@href='{}']".format(link))
    url = source.make_absolute(link)
    li = Link(url=url, source=source, element=el)
    li.load_from_db()
    debuglevel(2)
    scraper.process_link(li, force_reprocess=True, keep_tempfiles=True)
    debuglevel(5)
    # title and body text of the processed document should show up in the log
    assert 'Options and Actions' in caplog.text
    assert 'But even if we know' in caplog.text

def extractor(xmlfile):
    cmd = [PERL, join(path, 'Extractor.pm'), "-v{}".format(debuglevel()), xmlfile]
    debug(2, ' '.join(cmd))
    try:
        output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=60)
        output = output.decode('utf-8', 'ignore')
    except subprocess.CalledProcessError as e:
        debug(1, e.output)
        return False
    except subprocess.TimeoutExpired:
        debug(1, 'Extractor timeout!')
        return False
    # the Perl extractor prints its log, then a separator line, then a JSON result
    json_separator = '=========== RESULT ===========\n'
    if json_separator not in output:
        debug(1, 'Extractor failed:\n%s', output)
        return False
    log, jsonstr = output.split(json_separator, 1)
    debug(1, log)
    res = json.loads(jsonstr)
    return res

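# To make the output protocol concrete, a minimal sketch of the parsing step in
# isolation; the log lines and JSON payload here are invented placeholders, not
# actual Extractor.pm output:
import json

output = ("parsing xml...\nconverting fonts...\n"
          "=========== RESULT ===========\n"
          '{"title": "Some Paper", "numpages": 22}')

json_separator = '=========== RESULT ===========\n'
log, jsonstr = output.split(json_separator, 1)
res = json.loads(jsonstr)  # {'title': 'Some Paper', 'numpages': 22}
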
import sys
import logging
import argparse
import findmodules
from opp import scraper, debug

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

ap = argparse.ArgumentParser()
ap.add_argument('filename', help='file to process')
ap.add_argument('-d', '--debuglevel', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-u', '--url', type=str, help='link url')
ap.add_argument('-l', '--linkcontext', type=str, help='link context')
ap.add_argument('-a', '--anchortext', type=str, help='anchortext')
ap.add_argument('-s', '--sourcehtml', type=str, help='source page html')
args = ap.parse_args()
debug.debuglevel(args.debuglevel or 2)

# set up doc for processing, with dummy values for anything not supplied:
filetype = 'pdf' if 'pdf' in args.filename else 'doc'
doc = scraper.Doc(filetype=filetype)
doc.link = scraper.Link(url=args.url or 'foo')
doc.link.context = args.linkcontext or 'foo'
doc.link.anchortext = args.anchortext or 'foo'
doc.source = scraper.Source(url='foo', html=(args.sourcehtml or 'foo'))
doc.tempfile = args.filename

# process
scraper.process_file(doc, keep_tempfiles=args.keep)

def start(self):
    debuglevel(3)
    super().start()
    self.run()

#!/usr/bin/env python3
import sys
import logging
import findmodules
from opp import db, debug
from opp.doctyper import classifier

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

debug.debuglevel(4)

cur = db.cursor()
query = "SELECT label FROM cats"
cur.execute(query)
for row in cur.fetchall():
    classifier.update_classifier(row[0])

#!/usr/bin/env python3
import pytest
import os.path
from opp.doctyper import doctyper
from opp import scraper
from opp.debug import debuglevel

debuglevel(3)

curpath = os.path.abspath(os.path.dirname(__file__))
testdir = os.path.join(curpath, 'testdocs')

def readfile(path):
    # helper assumed by these tests: return the file content as a string
    with open(path, encoding='utf-8') as f:
        return f.read()

def test_simplepaper():
    doc = scraper.Doc(url='http://umsu.de/papers/variations.pdf')
    doc.link = scraper.Link(url='http://umsu.de/papers/variations.pdf')
    doc.link.anchortext = 'Download'
    doc.link.context = 'Foo bar'
    doc.content = readfile(os.path.join(testdir, 'attitudes.txt'))
    doc.numwords = 10200
    doc.numpages = 22
    assert doctyper.evaluate(doc) == 'article'

def test_pretendbook():
    # ten copies of the same article should look like a book to the classifier
    doc = scraper.Doc(url='http://umsu.de/papers/variations.pdf')
    doc.link = scraper.Link(url='http://umsu.de/papers/variations.pdf')
    doc.link.anchortext = 'Download'
    doc.link.context = 'Foo bar'
    doc.content = readfile(os.path.join(testdir, 'attitudes.txt')) * 10
    doc.numwords = 10200 * 10
    doc.numpages = 22 * 10
    assert doctyper.evaluate(doc) == 'book'

def evaluate(doc):
    debug(4, 'trying to guess if document is a paper')
    debugflag = debuglevel() > 3
    return classifier.test(doc, debug=debugflag, smooth=True)

import pytest
import os.path
import sys
import json
from datetime import datetime
from opp.models import Source, Link, Doc
from opp.debug import debuglevel
from opp import db

"""
To run these tests, create a test database called test_opp and give
the standard mysql user access to it.
"""

VDISPLAY = True
debuglevel(5)

curpath = os.path.abspath(os.path.dirname(__file__))
testdir = os.path.join(curpath, 'testdocs')

@pytest.fixture(scope='module')
def testdb():
    """set up test database"""
    db.close()
    db.connection(db='test_opp')
    cur = db.cursor()
    for t in ('sources', 'links', 'docs'):
        cur.execute('DELETE FROM {}'.format(t))
    db.commit()
    Source(url='http://umsu.de/papers/',
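# The docstring above assumes a pre-existing test database, e.g. created with
# CREATE DATABASE test_opp; plus a matching GRANT for the standard mysql user.
# A minimal sanity check before running the suite, reusing the same
# db.connection call as the fixture:
from opp import db

db.close()
db.connection(db='test_opp')  # should fail loudly if test_opp is missing
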
import sys
import logging
import argparse
import findmodules
from opp import db, scraper, debug

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

ap = argparse.ArgumentParser()
ap.add_argument('url', help='(part of) url of source page to scrape')
ap.add_argument('-d', '--debug_level', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-l', '--link', type=str, help='only process this link')
args = ap.parse_args()
debug.debuglevel(args.debug_level)

cur = db.dict_cursor()
query = "SELECT * FROM sources WHERE url LIKE %s LIMIT 1"
cur.execute(query, ('%'+args.url+'%',))
sources = cur.fetchall()
if not sources:
    raise Exception(args.url+' not in sources table')
source = scraper.Source(**sources[0])

if args.link:
    browser = scraper.Browser(use_virtual_display=True)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    try:
        el = browser.find_element_by_xpath("//a[contains(@href, '{}')]".format(args.link))