Example #1
0
def test_process_link(testdb, caplog):
    source = Source(url='http://umsu.de/papers/')
    source.load_from_db()
    browser = scraper.Browser(use_virtual_display=VDISPLAY)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    link = 'options.pdf'
    el = browser.find_element_by_xpath("//a[@href='{}']".format(link))
    url = source.make_absolute(link)
    li = Link(url=url, source=source, element=el)
    li.load_from_db()
    debuglevel(2)
    scraper.process_link(li, force_reprocess=True, keep_tempfiles=True)
    debuglevel(5)
    assert 'Options and Actions' in caplog.text()
    assert 'But even if we know' in caplog.text()
Example #2
0
ap.add_argument('-d', '--debug_level', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-l', '--link', type=str, help='only process this link')
args = ap.parse_args()

debug.debuglevel(args.debug_level)

cur = db.dict_cursor()
query = "SELECT * FROM sources WHERE url LIKE %s LIMIT 1"
cur.execute(query, ('%'+args.url+'%',))
sources = cur.fetchall()
if not sources:
   raise Exception(args.url+' not in sources table')
source = scraper.Source(**sources[0])

if args.link:
    browser = scraper.Browser(use_virtual_display=True)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    try:
        el = browser.find_element_by_xpath("//a[contains(@href, '{}')]".format(args.link))
    except Exception as e:
        sys.exit('no link containing '+args.link+' on '+source.url)
    url = source.make_absolute(el.get_attribute('href'))
    li = scraper.Link(url=url, source=source, element=el)
    li.load_from_db()
    scraper.process_link(li, force_reprocess=True, keep_tempfiles=args.keep)
else:
    scraper.scrape(source, keep_tempfiles=args.keep)