Example #1
def test_xvfb(caplog):
    caplog.setLevel(logging.CRITICAL, logger='selenium')
    caplog.setLevel(logging.DEBUG, logger='opp')
    b = Browser(use_virtual_display=True)
    src = 'file://'+testdir+'/umsu.html'
    b.goto(src)
    del b
    # After the Browser object is deleted, no Xvfb process should be left running.
    ps = subprocess.Popen(('ps', 'aux'), stdout=subprocess.PIPE)
    output = ps.communicate()[0]
    for line in output.decode('ascii').split('\n'):
        assert 'Xvfb' not in line
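
The test above assumes that Browser starts an Xvfb display when use_virtual_display=True and tears it down again when the object is destroyed, so that no Xvfb process survives `del b`. A minimal sketch of that lifecycle, assuming the pyvirtualdisplay package is used under the hood (the real Browser class may manage the display differently):

from pyvirtualdisplay import Display
from selenium import webdriver

class VirtualBrowser:
    # Illustrative only: pair the webdriver with a virtual display and
    # make sure the Xvfb process dies with the object.
    def __init__(self, use_virtual_display=False):
        self.display = None
        if use_virtual_display:
            self.display = Display(visible=0, size=(1366, 768))
            self.display.start()            # spawns an Xvfb process
        self.driver = webdriver.Firefox()

    def __del__(self):
        try:
            self.driver.quit()
        finally:
            if self.display:
                self.display.stop()         # terminates the Xvfb process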
Example #2
def test_status(caplog):
    caplog.setLevel(logging.CRITICAL, logger='selenium')
    caplog.setLevel(logging.DEBUG, logger='opp')
    b = Browser(use_virtual_display=VDISPLAY)
    src = 'file://'+testdir+'/umsu.html'
    b.goto(src)
    assert b.status == 200
    src = 'file://'+testdir+'/xxx.html'
    b.goto(src)
    assert b.status == 404
    del b
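
Selenium itself does not report HTTP status codes (Example #5 below works around the same limitation), so a `status` value for file:// URLs has to be derived some other way. One hedged guess at how such a value could be computed for local files; the real Browser.status may be implemented quite differently:

import os
from urllib.parse import urlparse

def file_url_status(url):
    # Hypothetical helper: report 200 if the local file behind a
    # file:// URL exists, 404 if it does not.
    path = urlparse(url).path
    return 200 if os.path.exists(path) else 404

With the test pages above, the existing umsu.html would yield 200 and the missing xxx.html would yield 404, matching the assertions.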
Example #3
def test_linkcontext(page, link, context, caplog):
    caplog.setLevel(logging.CRITICAL, logger='selenium')
    caplog.setLevel(logging.DEBUG, logger='opp')
    debuglevel(5)
    curpath = os.path.abspath(os.path.dirname(__file__))
    testdir = os.path.join(curpath, 'sourcepages')
    browser = Browser(reuse_browser=True, use_virtual_display=VDISPLAY)
    src = 'file://'+testdir+'/'+page
    browser.goto(src)
    el = browser.find_elements_by_xpath('//a[@href="{}"]'.format(link))[0]
    li = Link(element=el)
    res = li.html_context()
    assert res == context
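
Link.html_context() evidently returns a snippet of the markup surrounding the anchor, which the test compares against an expected string. As a rough sketch (an assumption about the approach, not the project's actual code), such a snippet could be built from the parent element's outerHTML:

def html_context(element, max_chars=300):
    # Hypothetical sketch: walk up to the anchor's parent node and
    # return (a prefix of) its outer HTML as the link's context.
    parent = element.find_element_by_xpath('..')   # Selenium 3 API, as in the tests
    html = parent.get_attribute('outerHTML')
    return html[:max_chars]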
Example #4
def test_reuse(caplog):
    caplog.setLevel(logging.CRITICAL, logger='selenium')
    caplog.setLevel(logging.DEBUG, logger='opp')
    b1 = Browser(reuse_browser=True, use_virtual_display=VDISPLAY)
    src = 'file://'+testdir+'/umsu.html'
    b1.goto(src)
    b2 = Browser(reuse_browser=True, use_virtual_display=VDISPLAY)
    b2.goto(src)
    assert b1 == b2
    del b1
    del b2
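
The assertion b1 == b2 suggests that reuse_browser=True hands back the same underlying browser instance instead of launching a second one. A minimal sketch of that reuse pattern, assuming Browser wraps a Selenium webdriver (the names here are illustrative, not the project's actual implementation):

from selenium import webdriver

_cached_browser = None

def get_browser(reuse_browser=False):
    # Hypothetical factory: keep one webdriver alive and hand it out
    # again whenever reuse is requested.
    global _cached_browser
    if reuse_browser and _cached_browser is not None:
        return _cached_browser             # same object, so b1 == b2 holds
    browser = webdriver.Firefox()
    if reuse_browser:
        _cached_browser = browser
    return browser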
Example #5
def scrape(source, keep_tempfiles=False):
    """
    Look for new papers linked to on the source page (and check for
    revisions to known papers).     

    Issues to keep in mind:
    
    (1) Links on personal pages often lead to old papers that have
    been published long ago. (That's true even for newly added links,
    when people upload older papers.) We don't want to list these
    papers in the news feed, nor do we need to check them for
    revisions. So if we find a link to an old and published paper, we
    treat it like a link to a non-paper. (If a manuscript changes into
    a published paper, we keep the paper in the database because it
    still ought to show up as "new papers found on x/y/z" and because
    it might have been used to train personal filters, but we remove
    the doc_id from the link, thereby marking the link as known but
    irrelevant.)

    (2) Sometimes links to papers are temporarily broken, or
    there's a temporary problem with the metadata extraction. So
    if we encounter an error while processing a (promising) new
    link, we try again once or twice in the course of the next
    week (using link.found_date).

    (3) To check for revisions of existing manuscripts (and, more
    unusually, new papers appearing at an old url), we have to
    occasionally re-process known links. But we don't want to re-parse
    all documents all the time. Instead, we select a few old papers
    (i.e., links with an associated doc_id that are still on the page,
    ordered by last_checked).

    (4) We could remove all links from the db that are no longer on
    the page, but it's probably not worth the effort. Moreover, pages
    are sometimes temporarily replaced by "under maintenance" pages
    (for example), and then we may not want to re-process all links
    once the page comes back. So we simply ignore disappeared links:
    they remain in the db, but they are never revisited until they
    reappear on the page.
    
    (5) If a page is processed for the first time (status==0 in the
    db), we don't want to display all linked papers in the news
    feed. Nonetheless, we process all links so that we can check for
    revisions (think of the Stanford Encyclopedia). To avoid
    displaying the papers as new, we mark them with a found_date of
    1970.
    """

    debug(1, '*'*50)
    debug(1, "checking links on %s", source.url)

    # go to page:
    browser = Browser(use_virtual_display=True)
    try:
        browser.goto(source.url)
    except Exception as e:
        debug(1, 'connection to source %s failed: %s', source.url, str(e))
        source.update_db(status=error.code['connection failed'])
        return 0

    if browser.current_url != source.url:
        # redirects of journal pages are OK (e.g. from /current to
        # /nov-2015), but redirects of personal papers pages are often
        # caused by pages having disappeared; the redirect can then
        # take us e.g. to CMU's general document archive; we don't
        # want that. So here we wait for manual approval of the new
        # url, except if the new url is a trivial variant of the old
        # one, e.g. 'https' instead of 'http'.
        if source.sourcetype == 'personal':
            if trivial_url_variant(browser.current_url, source.url):
                source.update_db(url=browser.current_url)
            else:
                debug(1, '%s redirects to %s', source.url, browser.current_url)
                source.update_db(status=301)
                return 0
        else:
            debug(2, '%s redirected to %s', source.url, browser.current_url)

    # extract links:
    source.set_html(browser.page_source)
    source.extract_links(browser)
    
    # Selenium doesn't tell us when a site yields a 404, 401, 500
    # etc. error. But we can usually tell from the fact that there are
    # few known links on the error page:
    debug(1, 'old status {}, old links: {}'.format(source.status, len(source.old_links)))
    if source.status > 0 and len(source.old_links) <= 1:
        debug(1, 'suspiciously few old links, checking status code')
        status, r = util.request_url(source.url)
        if status != 200:
            debug(1, 'error %s at source %s', status, source.url)
            source.update_db(status=status)
            return 0

    source.update_db(status=1)
    
    # process new links:
    if source.new_links:
        for li in source.new_links:
            debug(1, '*** processing new link to %s on %s ***', li.url, source.url)
            process_link(li)
            # for testing: one link only
            # return 1
    else:
        debug(1, "no new links")

    # re-process recently found old links that generated errors:
    for li in source.old_links:
        if li.status > 9:
            tdelta = datetime.now() - li.found_date
            if tdelta.days < 5:
                debug(1, 're-checking recent link %s on %s with status %s', 
                      li.url, source.url, li.status)
                process_link(li, force_reprocess=True)
    
    # re-check old links to papers for revisions:
    #MAX_REVCHECK = 3
    #goodlinks = (li for li in source.old_links if li.doc_id)
    #for li in sorted(goodlinks, key=lambda x:x.last_checked)[:MAX_REVCHECK]:
    #    debug(1, 're-checking old link to paper %s on %s for revisions', li.url, source.url)
    #    process_link(li)

    if not keep_tempfiles:
        remove_tempdir()
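
The redirect handling near the top of scrape() only accepts a new URL for a personal page when trivial_url_variant() judges the change to be cosmetic (e.g. 'https' instead of 'http'). That helper is not shown here; a hedged sketch of what such a check might look like (an assumption, not the project's implementation):

from urllib.parse import urlparse

def trivial_url_variant(url1, url2):
    # Treat two URLs as equivalent if they differ only in scheme
    # (http vs https), a leading 'www.', or a trailing slash.
    def normalize(url):
        parts = urlparse(url)
        host = parts.netloc.lower()
        if host.startswith('www.'):
            host = host[4:]
        return host + parts.path.rstrip('/')
    return normalize(url1) == normalize(url2)

Under that sketch, 'http://example.com/papers/' and 'https://www.example.com/papers' count as trivial variants, while a redirect to a different host or path falls through to the manual-approval branch (status 301).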