def fetch(self, url=None, only_if_modified=True):
    '''
    fetch linked document (or url), returns response object on success,
    otherwise stores error in db and returns None.
    '''
    time.sleep(1)  # be gentle on servers
    url = url or self.url
    if only_if_modified and self.last_checked:
        ims = self.last_checked.strftime('%a, %d %b %Y %H:%M:%S GMT')
        status, r = util.request_url(url, if_modified_since=ims, etag=self.etag)
        if (status == 304 or
                status == 200 and r.headers.get('content-length') == self.filesize):
            self.update_db()
            debug(1, "not modified")
            return None
    else:
        status, r = util.request_url(url)
    if status != 200:
        self.update_db(status=status)
        debug(1, "error status %s", status)
        return None
    if not r.text:
        self.update_db(status=error.code['document is empty'])
        debug(1, 'document is empty')
        return None
    self.etag = r.headers.get('etag')
    self.filesize = r.headers.get('content-length')
    return r
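
# A minimal sketch of the conditional-request behaviour that fetch() relies on.
# This is not the project's util.request_url; it only illustrates, under the
# assumption that the helper forwards If-Modified-Since and ETag headers, how
# such a wrapper could be built on the requests library. The 900 fallback code
# is hypothetical.
import requests

def conditional_request(url, if_modified_since=None, etag=None, timeout=10):
    """Return (status, response); status is 304 when the server reports no change."""
    headers = {}
    if if_modified_since:
        # expects an HTTP-date string, e.g. 'Mon, 02 Jan 2023 00:00:00 GMT'
        headers['If-Modified-Since'] = if_modified_since
    if etag:
        headers['If-None-Match'] = etag
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        return r.status_code, r
    except requests.RequestException:
        return 900, None  # hypothetical catch-all error code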
def test_not_SEP_article():
    url = 'http://plato.stanford.edu/index.html'
    status, r = util.request_url(url)
    r.encoding = 'utf-8'
    doc = Doc(url=url, r=r)
    doc.page = Webpage(url, html=r.text)
    res = pparser.parse(doc)
    assert res == False
def find_new_pages(self, name):
    """searches for papers pages matching author name, returns urls of new pages"""
    logger.info("\nsearching papers page(s) for %s", name)
    pages = set()
    search_terms = [
        # careful with google.com: don't block sites.google.com...
        '-site:academia.edu',
        '-site:wikipedia.org',
        '-site:philpapers.org',
        '-filetype:pdf',
        '~philosophy',
        '(publications OR articles OR papers OR "in progress" OR forthcoming)',
    ]
    # search full name first, then last name only:
    search_phrase = '"{}" '.format(name) + ' '.join(search_terms)
    searchresults = set(googlesearch.search(search_phrase))
    search_phrase = '"{}" '.format(name.split()[-1]) + ' '.join(search_terms)
    searchresults |= set(googlesearch.search(search_phrase))
    for url in searchresults:
        logger.debug("\n")
        url = util.normalize_url(url)
        if self.bad_url(url):
            logger.info("bad url: %s", url)
            continue
        # check if url already known:
        cur = db.cursor()
        cur.execute("SELECT 1 FROM sources WHERE url = %s", (url,))
        rows = cur.fetchall()
        if rows:
            logger.info("%s already known", url)
            continue
        try:
            status, r = util.request_url(url)
            if status != 200:
                raise Exception('status {}'.format(status))
        except:
            logger.info("cannot retrieve %s", url)
        else:
            score = self.evaluate(r, name)
            if score < 0.7:
                logger.info("%s doesn't look like a papers page", url)
                continue
            dupe = self.is_duplicate(url)
            if dupe:
                logger.info("%s is a duplicate of already known %s", url, dupe)
                continue
            logger.info("new papers page for %s: %s", name, url)
            pages.add(url)
    if not pages:
        logger.info("no pages found")
    self.update_author(name)
    return pages
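
# find_new_pages() normalizes every search hit before checking whether it is
# already in the db. A rough sketch of the kind of normalization
# util.normalize_url is assumed to perform; the project's actual rules may
# differ (this helper name is illustrative only).
from urllib.parse import urlsplit, urlunsplit

def normalize_url_sketch(url):
    """Lower-case scheme and host, drop fragments, strip trailing slashes."""
    parts = urlsplit(url.strip())
    scheme = parts.scheme.lower() or 'http'
    netloc = parts.netloc.lower()
    path = parts.path.rstrip('/') or '/'
    return urlunsplit((scheme, netloc, path, parts.query, ''))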
def test_SEP_ActionPerception():
    url = 'http://plato.stanford.edu/entries/action-perception/'
    status, r = util.request_url(url)
    r.encoding = 'utf-8'
    doc = Doc(url=url, r=r)
    doc.page = Webpage(url, html=r.text)
    res = pparser.parse(doc)
    assert res == True
    assert doc.authors == 'Robert Briscoe, Rick Grush'
    assert doc.title == 'Action-based Theories of Perception'
    assert doc.abstract[:10] == 'Action is '
    assert doc.abstract[-10:] == 'd of view.'
    assert 'The tactual ideas' in doc.content
    assert doc.numwords > 1000
def test_SEP_Abilities():
    url = 'http://plato.stanford.edu/entries/abilities/'
    status, r = util.request_url(url)
    r.encoding = 'utf-8'
    doc = Doc(url=url, r=r)
    doc.page = Webpage(url, html=r.text)
    res = pparser.parse(doc)
    assert res == True
    assert doc.authors == 'John Maier'
    assert doc.title == 'Abilities'
    assert doc.abstract[:10] == 'In the acc'
    assert doc.abstract[-10:] == 'imes true.'
    assert 'General and specific abilities' in doc.content
    assert doc.numwords > 1000
def get_authornames(self, journal_url):
    status, r = util.request_url(journal_url)
    ms = re.findall(r"<span class='name'>(.+?)</span>", r.text)
    names = set(ms)
    return names
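
# Usage sketch for get_authornames: the regex assumes the journal's table of
# contents marks author names with <span class='name'> tags (single-quoted
# attribute, as on the pages this scraper targets); other markup would need a
# different pattern or a real HTML parser. The sample_html below is made up.
import re

sample_html = "<span class='name'>Jane Doe</span> and <span class='name'>John Roe</span>"
names = set(re.findall(r"<span class='name'>(.+?)</span>", sample_html))
assert names == {'Jane Doe', 'John Roe'}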
def scrape(source, keep_tempfiles=False):
    """
    Look for new papers linked to on the source page (and check for
    revisions to known papers).

    Issues to keep in mind:

    (1) Links on personal pages often lead to old papers that have been
    published long ago. (That's true even for newly added links, when
    people upload older papers.) We don't want to list these papers in
    the news feed, nor do we need to check them for revisions. So if we
    find a link to an old and published paper, we treat it like a link
    to a non-paper. (If a manuscript changes into a published paper, we
    keep the paper in the database because it still ought to show up as
    "new papers found on x/y/z" and because it might have been used to
    train personal filters, but we remove the doc_id from the link,
    thereby marking the link as known but irrelevant.)

    (2) Sometimes links to papers are temporarily broken, or there's a
    temporary problem with the metadata extraction. So if we encounter
    an error while processing a (promising) new link, we try again once
    or twice in the course of the next week (using link.found_date).

    (3) To check for revisions of existing manuscripts (and, more
    unusually, new papers appearing at an old url), we have to
    occasionally re-process known links. But we don't want to re-parse
    all documents all the time. Instead, we select a few old papers
    (i.e., links with an associated doc_id that are still on the page,
    ordered by last_checked).

    (4) We could remove all links from the db that are no longer on the
    page, but it's probably not worth the effort. Moreover, pages are
    sometimes temporarily replaced by "under maintenance" pages (for
    example), and then we may not want to re-process all links once the
    page comes back. So we simply ignore disappeared links: they remain
    in the db, but they are never revisited until they reappear on the
    page.

    (5) If a page is processed for the first time (status==0 in the db),
    we don't want to display all linked papers in the news feed.
    Nonetheless, we process all links so that we can check for revisions
    (think of the Stanford Encyclopedia). To avoid displaying the papers
    as new, we mark them with a found_date of 1970.
    """
    debug(1, '*'*50)
    debug(1, "checking links on %s", source.url)

    # go to page:
    browser = Browser(use_virtual_display=True)
    try:
        browser.goto(source.url)
    except Exception as e:
        debug(1, 'connection to source %s failed: %s', source.url, str(e))
        source.update_db(status=error.code['connection failed'])
        return 0

    if browser.current_url != source.url:
        # Redirects of journal pages are OK (e.g. from /current to
        # /nov-2015), but redirects of personal papers pages are often
        # caused by pages having disappeared; the redirect can then take
        # us e.g. to CMU's general document archive; we don't want that.
        # So here we wait for manual approval of the new url, except if
        # the new url is a trivial variant of the old one, e.g. 'https'
        # instead of 'http'.
        if source.sourcetype == 'personal':
            if trivial_url_variant(browser.current_url, source.url):
                source.update_db(url=browser.current_url)
            else:
                debug(1, '%s redirects to %s', source.url, browser.current_url)
                source.update_db(status=301)
                return 0
        else:
            debug(2, '%s redirected to %s', source.url, browser.current_url)

    # extract links:
    source.set_html(browser.page_source)
    source.extract_links(browser)

    # Selenium doesn't tell us when a site yields a 404, 401, 500 etc.
    # error. But we can usually tell from the fact that there are few
    # known links on the error page:
    debug(1, 'old status {}, old links: {}'.format(source.status, len(source.old_links)))
    if source.status > 0 and len(source.old_links) <= 1:
        debug(1, 'suspiciously few old links, checking status code')
        status, r = util.request_url(source.url)
        if status != 200:
            debug(1, 'error %s at source %s', status, source.url)
            source.update_db(status=status)
            return 0

    source.update_db(status=1)

    # process new links:
    if source.new_links:
        for li in source.new_links:
            debug(1, '*** processing new link to %s on %s ***', li.url, source.url)
            process_link(li)
            # for testing: one link only
            # return 1
    else:
        debug(1, "no new links")

    # re-process recently found old links that generated errors:
    for li in source.old_links:
        if li.status > 9:
            tdelta = datetime.now() - li.found_date
            if tdelta.days < 5:
                debug(1, 're-checking recent link %s on %s with status %s',
                      li.url, source.url, li.status)
                process_link(li, force_reprocess=True)

    # re-check old links to papers for revisions:
    #MAX_REVCHECK = 3
    #goodlinks = (li for li in source.old_links if li.doc_id)
    #for li in sorted(goodlinks, key=lambda x:x.last_checked)[:MAX_REVCHECK]:
    #    debug(1, 're-checking old link to paper %s on %s for revisions', li.url, source.url)
    #    process_link(li)

    if not keep_tempfiles:
        remove_tempdir()
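
# scrape() accepts a redirect silently only when trivial_url_variant() says the
# new url is a harmless rewrite of the old one. A minimal sketch of such a
# check, assuming 'trivial' means differences in scheme, a 'www.' prefix, or a
# trailing slash only; the project's own rules may be broader.
def trivial_url_variant_sketch(url1, url2):
    def strip(u):
        u = u.lower().rstrip('/')
        u = u.replace('https://', '').replace('http://', '')
        return u[4:] if u.startswith('www.') else u
    return strip(url1) == strip(url2)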
def test_request_url_maxsize():
    status, r = util.request_url('http://umsu.de/papers/generalising.pdf', maxsize=100000)
    assert status == 903
def test_request_url_404():
    status, r = util.request_url('http://umsu.de/notfound/')
    assert status == 404
def test_request_url():
    status, r = util.request_url('http://umsu.de/')
    assert status == 200
    assert r.content
    assert 'Hi' in r.text
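
# The tests above exercise util.request_url's status handling: 200 for a normal
# page, 404 passed through, and the project-specific code 903 when the document
# exceeds maxsize. A simplified sketch of how the maxsize guard might work,
# assuming the helper streams the response and trusts the Content-Length header
# (the real helper presumably also handles missing or wrong headers):
import requests

def request_url_with_maxsize(url, maxsize=None, timeout=10):
    r = requests.get(url, stream=True, timeout=timeout)
    if maxsize and int(r.headers.get('content-length') or 0) > maxsize:
        r.close()
        return 903, None  # 903: document too large
    return r.status_code, r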