Example #1
 def fetch(self, url=None, only_if_modified=True):
     '''
     fetch linked document (or url), returns response object on
     success, otherwise stores error in db and returns None.
     '''
     time.sleep(1) # be gentle on servers
     url = url or self.url
     if only_if_modified and self.last_checked:
         ims = self.last_checked.strftime('%a, %d %b %Y %H:%M:%S GMT')
         status, r = util.request_url(url, if_modified_since=ims, etag=self.etag)
         if (status == 304 or
             (status == 200 and r.headers.get('content-length') == self.filesize)):
             self.update_db()
             debug(1, "not modified")
             return None
     else:
         status, r = util.request_url(url)
     if status != 200:
         self.update_db(status=status)
         debug(1, "error status %s", status)
         return None
     if not r.text:
         self.update_db(status=error.code['document is empty'])
         debug(1, 'document is empty')
         return None
     self.etag = r.headers.get('etag')
     self.filesize = r.headers.get('content-length')
     return r
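
A minimal standalone sketch of the conditional request that fetch() builds on, using only the util.request_url keyword arguments and the (status, response) return shape seen above; the URL, timestamp and etag are made-up placeholders, and it assumes the project's util module is importable as in the tests below:

from datetime import datetime

# Illustration only: a conditional GET via util.request_url. The values below
# stand in for data that fetch() would load from the database.
last_checked = datetime(2015, 11, 1)
ims = last_checked.strftime('%a, %d %b %Y %H:%M:%S GMT')
status, r = util.request_url('http://example.com/paper.pdf',
                             if_modified_since=ims, etag='"abc123"')
if status == 304:
    print('not modified since last check')
elif status == 200:
    print('fetched; new etag:', r.headers.get('etag'))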
Example #2
def test_not_SEP_article():
    url = 'http://plato.stanford.edu/index.html'
    status, r = util.request_url(url)
    r.encoding = 'utf-8'
    doc = Doc(url=url, r=r)
    doc.page = Webpage(url, html=r.text)

    res = pparser.parse(doc)
    assert res == False
Example #3
 def find_new_pages(self, name):
     """searches for papers pages matching author name, returns urls of new pages"""
     logger.info("\nsearching papers page(s) for %s", name)
     pages = set()
     search_terms = [
         # careful with google.com: don't block sites.google.com...
         '-site:academia.edu',
         '-site:wikipedia.org',
         '-site:philpapers.org',
         '-filetype:pdf',
         '~philosophy',
         '(publications OR articles OR papers OR "in progress" OR forthcoming)',
     ]
     # search full name first, then last name only:
     search_phrase = '"{}" '.format(name) + ' '.join(search_terms)
     searchresults = set(googlesearch.search(search_phrase))
     search_phrase = '"{}" '.format(name.split()[-1]) + ' '.join(search_terms)
     searchresults |= set(googlesearch.search(search_phrase))
     for url in searchresults:
         logger.debug("\n")
         url = util.normalize_url(url) 
         if self.bad_url(url):
             logger.info("bad url: %s", url)
             continue
         # check if url already known:
         cur = db.cursor()
         cur.execute("SELECT 1 FROM sources WHERE url = %s", (url,))
         rows = cur.fetchall()
         if rows:
             logger.info("%s already known", url)
             continue
         try:
             status, r = util.request_url(url)
             if status != 200:
                 raise Exception('status {}'.format(status))
         except Exception:
             logger.info("cannot retrieve %s", url)
         else:
             score = self.evaluate(r, name)
             if score < 0.7:
                 logger.info("%s doesn't look like a papers page", url)
                 continue
             dupe = self.is_duplicate(url)
             if dupe:
                 logger.info("%s is a duplicate of already known %s", url, dupe)
                 continue
             logger.info("new papers page for %s: %s", name, url)                
             pages.add(url)
     if not pages:
         logger.info("no pages found")
     self.update_author(name)
     return pages
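
For illustration, the search phrase that find_new_pages assembles for the full-name query, shown here for a hypothetical author "Jane Doe" (string construction only, no googlesearch call):

# Illustration only: the full-name search phrase built as in find_new_pages above.
search_terms = [
    '-site:academia.edu',
    '-site:wikipedia.org',
    '-site:philpapers.org',
    '-filetype:pdf',
    '~philosophy',
    '(publications OR articles OR papers OR "in progress" OR forthcoming)',
]
search_phrase = '"{}" '.format('Jane Doe') + ' '.join(search_terms)
# '"Jane Doe" -site:academia.edu -site:wikipedia.org -site:philpapers.org
#  -filetype:pdf ~philosophy (publications OR articles OR papers OR "in progress" OR forthcoming)'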
Example #4
def test_SEP_ActionPerception():
    url = 'http://plato.stanford.edu/entries/action-perception/'
    status, r = util.request_url(url)
    r.encoding = 'utf-8'
    doc = Doc(url=url, r=r)
    doc.page = Webpage(url, html=r.text)

    res = pparser.parse(doc)
    assert res == True
    assert doc.authors == 'Robert Briscoe, Rick Grush'
    assert doc.title == 'Action-based Theories of Perception'
    assert doc.abstract[:10] == 'Action is '
    assert doc.abstract[-10:] == 'd of view.'
    assert 'The tactual ideas' in doc.content
    assert doc.numwords > 1000
Example #5
def test_SEP_Abilities():
    url = 'http://plato.stanford.edu/entries/abilities/'
    status, r = util.request_url(url)
    r.encoding = 'utf-8'
    doc = Doc(url=url, r=r)
    doc.page = Webpage(url, html=r.text)

    res = pparser.parse(doc)
    assert res == True
    assert doc.authors == 'John Maier'
    assert doc.title == 'Abilities'
    assert doc.abstract[:10] == 'In the acc'
    assert doc.abstract[-10:] == 'imes true.'
    assert 'General and specific abilities' in doc.content
    assert doc.numwords > 1000
Example #6
 def get_authornames(self, journal_url):
     """return the set of author names listed on the journal page"""
     status, r = util.request_url(journal_url)
     # author names on the page are wrapped in <span class='name'> tags
     names = set(re.findall(r"<span class='name'>(.+?)</span>", r.text))
     return names
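
A self-contained sketch of the same extraction, run against a made-up HTML fragment instead of a fetched journal page:

import re

# Illustration only: the author-name pattern from get_authornames applied
# to a hard-coded fragment.
html = ("<span class='name'>Jane Doe</span>, "
        "<span class='name'>John Roe</span>")
names = set(re.findall(r"<span class='name'>(.+?)</span>", html))
# names == {'Jane Doe', 'John Roe'}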
Example #7
def scrape(source, keep_tempfiles=False):
    """
    Look for new papers linked to on the source page (and check for
    revisions to known papers).     

    Issues to keep in mind:
    
    (1) Links on personal pages often lead to old papers that have
    been published long ago. (That's true even for newly added links,
    when people upload older papers.) We don't want to list these
    papers in the news feed, nor do we need to check them for
    revisions. So if we find a link to an old and published paper, we
    treat it like a link to a non-paper. (If a manuscript changes into
    a published paper, we keep the paper in the database because it
    still ought to show up as "new papers found on x/y/z" and because
    it might have been used to train personal filters, but we remove
    the doc_id from the link, thereby marking the link as known but
    irrelevant.)

    (2) Sometimes links to papers are temporarily broken, or
    there's a temporary problem with the metadata extraction. So
    if we encounter an error while processing a (promising) new
    link, we try again once or twice in the course of the next
    week (using link.found_date).

    (3) To check for revisions of existing manuscripts (and, more
    unusually, new papers appearing at an old url), we have to
    occasionally re-process known links. But we don't want to re-parse
    all documents all the time. Instead, we select a few old papers
    (i.e., links with an associated doc_id that are still on the page,
    ordered by last_checked).

    (4) We could remove all links from the db that are no longer on
    the page, but it's probably not worth the effort. Moreover, pages
    are sometimes temporarily replaced by "under maintenance" pages
    (for example), and then we may not want to re-process all links
    once the page comes back. So we simply ignore disappeared links:
    they remain in the db, but they are never revisited until they
    reappear on the page.
    
    (5) If a page is processed for the first time (status==0 in the
    db), we don't want to display all linked papers in the news
    feed. Nonetheless, we process all links so that we can check for
    revisions (think of the Stanford Encyclopedia). To avoid
    displaying the papers as new, we mark them with a found_date of
    1970.
    """

    debug(1, '*'*50)
    debug(1, "checking links on %s", source.url)

    # go to page:
    browser = Browser(use_virtual_display=True)
    try:
        browser.goto(source.url)
    except Exception as e:
        debug(1, 'connection to source %s failed: %s', source.url, str(e))
        source.update_db(status=error.code['connection failed'])
        return 0

    if browser.current_url != source.url:
        # redirects of journal pages are OK (e.g. from /current to
        # /nov-2015), but redirects of personal papers pages are often
        # caused by pages having disappeared; the redirect can then
        # take us e.g. to CMU's general document archive; we don't
        # want that. So here we wait for manual approval of the new
        # url, except if the new url is a trivial variant of the old
        # one, e.g. 'https' instead of 'http'.
        if source.sourcetype == 'personal':
            if trivial_url_variant(browser.current_url, source.url):
                source.update_db(url=browser.current_url)
            else:
                debug(1, '%s redirects to %s', source.url, browser.current_url)
                source.update_db(status=301)
                return 0
        else:
            debug(2, '%s redirected to %s', source.url, browser.current_url)

    # extract links:
    source.set_html(browser.page_source)
    source.extract_links(browser)
    
    # Selenium doesn't tell us when a site yields a 404, 401, 500
    # etc. error. But we can usually tell from the fact that there are
    # few known links on the error page:
    debug(1, 'old status {}, old links: {}'.format(source.status, len(source.old_links)))
    if source.status > 0 and len(source.old_links) <= 1:
        debug(1, 'suspiciously few old links, checking status code')
        status, r = util.request_url(source.url)
        if status != 200:
            debug(1, 'error %s at source %s', status, source.url)
            source.update_db(status=status)
            return 0

    source.update_db(status=1)
    
    # process new links:
    if source.new_links:
        for li in source.new_links:
            debug(1, '*** processing new link to %s on %s ***', li.url, source.url)
            process_link(li)
            # for testing: one link only
            # return 1
    else:
        debug(1, "no new links")

    # re-process recently found old links that generated errors:
    for li in source.old_links:
        if li.status > 9:
            tdelta = datetime.now() - li.found_date
            if tdelta.days < 5:
                debug(1, 're-checking recent link %s on %s with status %s', 
                      li.url, source.url, li.status)
                process_link(li, force_reprocess=True)
    
    # re-check old links to papers for revisions:
    #MAX_REVCHECK = 3
    #goodlinks = (li for li in source.old_links if li.doc_id)
    #for li in sorted(goodlinks, key=lambda x:x.last_checked)[:MAX_REVCHECK]:
    #    debug(1, 're-checking old link to paper %s on %s for revisions', li.url, source.url)
    #    process_link(li)

    if not keep_tempfiles:
        remove_tempdir()
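
scrape() accepts a personal-page redirect automatically only if the new address is a trivial variant of the old one. trivial_url_variant itself is not shown above; the following is a hypothetical sketch of such a check, not the project's actual implementation:

# Hypothetical sketch only: treat two urls as equivalent if they differ just
# in scheme, a leading 'www.', or a trailing slash.
def trivial_url_variant(url1, url2):
    def normalize(url):
        url = url.replace('https://', 'http://')
        url = url.replace('http://www.', 'http://')
        return url.rstrip('/')
    return normalize(url1) == normalize(url2)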
Example #8
def test_request_url_maxsize():
    (status, r) = util.request_url('http://umsu.de/papers/generalising.pdf', maxsize=100000)
    assert status == 903
Example #9
def test_request_url_404():
    (status, r) = util.request_url('http://umsu.de/notfound/')
    assert status == 404
Example #10
def test_request_url():
    (status, r) = util.request_url('http://umsu.de/')
    assert status == 200
    assert r.content
    assert 'Hi' in r.text
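
Taken together, the last three tests show the calling convention assumed throughout: util.request_url returns a (status, response) pair, where status is either an ordinary HTTP code (200, 404) or a project-specific code such as 903 when the response exceeds maxsize. A hedged usage sketch based only on those observed codes:

# Illustration only: guarding a download with maxsize, relying on the
# (status, response) convention and the 903 oversize code from the tests above.
status, r = util.request_url('http://umsu.de/papers/generalising.pdf',
                             maxsize=100000)
if status == 200:
    print(len(r.content), 'bytes fetched')
elif status == 903:
    print('document larger than maxsize, skipped')
else:
    print('request failed with status', status)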