def keep_redirecting(r, publisher):
    # don't read r.content unless we have to, because it will cause us to
    # download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if "content-length" in r.headers:
        # manually follow javascript if that's all that's in the payload
        file_size = int(r.headers["content-length"])
        if file_size < 500:
            matches = re.findall(ur"<script>location.href='(.*)'</script>", r.content_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith(u"/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(ur"OvidAN = '(.*?)';", r.content_small(), re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(an_number)
            return redirect_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*?url=["\'](.*?)["\']', re.IGNORECASE)
    redirect_match = redirect_re.findall(r.content_small())
    if redirect_match:
        redirect_path = HTMLParser().unescape(redirect_match[0].strip())
        redirect_url = urlparse.urljoin(r.request.url, redirect_path)
        logger.info(u"redirect_match! redirecting to {}".format(redirect_url))
        return redirect_url

    return None
def gets_a_word_doc(self, link, base_url):
    if is_purchase_link(link):
        return False

    absolute_url = get_link_target(link.href, base_url)
    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} is a word doc".format(absolute_url))

    start = time()
    try:
        r = http_get(absolute_url, stream=True, publisher=self.publisher,
                     session_id=self.session_id, ask_slowly=self.ask_slowly)

        if r.status_code != 200:
            return False

        if is_a_word_doc(r):
            return True

    except Exception as e:
        logger.exception(u'error in gets_a_word_doc: {}'.format(e))

    return False
def keep_redirecting(r, publisher):
    # don't read r.content unless we have to, because it will cause us to
    # download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if "content-length" in r.headers:
        # manually follow javascript if that's all that's in the payload
        file_size = int(r.headers["content-length"])
        if file_size < 500:
            matches = re.findall(r"<script>location.href='(.*)'</script>", r.text_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith("/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(r"OvidAN = '(.*?)';", r.text_small(), re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(an_number)
            return redirect_url

    # 10.1097/01.xps.0000491010.82675.1c
    hostname = urlparse(r.url).hostname
    if hostname and hostname.endswith('ovid.com'):
        matches = re.findall(r'var journalURL = "(.*?)";', r.text_small(), re.IGNORECASE)
        if matches:
            journal_url = matches[0]
            logger.info('ovid journal match. redirecting to {}'.format(journal_url))
            return journal_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*http-equiv="?refresh"?[^>]*>', re.IGNORECASE | re.DOTALL)
    redirect_match = redirect_re.findall(r.text_small())
    if redirect_match:
        redirect = redirect_match[0]
        logger.info('found a meta refresh element: {}'.format(redirect))
        url_re = re.compile('url=["\']?([^">\']*)', re.IGNORECASE | re.DOTALL)
        url_match = url_re.findall(redirect)
        if url_match:
            redirect_path = html.unescape(url_match[0].strip())
            redirect_url = urljoin(r.request.url, redirect_path)
            if not redirect_url.endswith('Error/JavaScript.html') and not redirect_url.endswith('/?reason=expired'):
                logger.info("redirect_match! redirecting to {}".format(redirect_url))
                return redirect_url

    return None
def gets_a_pdf(self, link, base_url):
    if is_purchase_link(link):
        return False

    absolute_url = get_link_target(link.href, base_url)
    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} is a pdf".format(absolute_url))

    start = time()
    try:
        self.r = http_get(absolute_url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # is unauthorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in gets_a_pdf".format(self.r.status_code, absolute_url)
            return False

        if self.is_a_pdf_page():
            return True

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error in gets_a_pdf"
        logger.info(self.error)
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except Exception as e:
        self.error += u"ERROR: Exception error in gets_a_pdf"
        logger.exception(self.error)

    if DEBUG_SCRAPING:
        logger.info(u"we've decided this ain't a PDF. took {} seconds [{}]".format(
            elapsed(start), absolute_url))
    return False
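gets_a_pdf above delegates the actual detection to is_a_pdf_page(), which is defined elsewhere in this codebase. As a rough, hypothetical illustration only (not the project's implementation), a content-type check plus magic-byte sniff over a streamed requests response could look like this:

def _looks_like_pdf(resp):
    # hypothetical sketch: trust an explicit PDF content type first
    if "application/pdf" in resp.headers.get("Content-Type", "").lower():
        return True
    # otherwise sniff the "%PDF" magic number from the first bytes of the body;
    # assumes the response was fetched with stream=True so only a little is read
    first_chunk = next(resp.iter_content(chunk_size=5), b"")
    return first_chunk.startswith(b"%PDF")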
def keep_redirecting(r, publisher):
    # don't read r.content unless we have to, because it will cause us to
    # download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if "content-length" in r.headers:
        # manually follow javascript if that's all that's in the payload
        file_size = int(r.headers["content-length"])
        if file_size < 500:
            matches = re.findall(ur"<script>location.href='(.*)'</script>", r.content_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith(u"/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(ur"OvidAN = '(.*?)';", r.content_small(), re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(an_number)
            return redirect_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*http-equiv="refresh"[^>]*>', re.IGNORECASE | re.DOTALL)
    redirect_match = redirect_re.findall(r.content_small())
    if redirect_match:
        redirect = redirect_match[0]
        logger.info('found a meta refresh element: {}'.format(redirect))
        url_re = re.compile('url=["\'](.*?)["\']', re.IGNORECASE | re.DOTALL)
        url_match = url_re.findall(redirect)
        if url_match:
            redirect_path = HTMLParser().unescape(url_match[0].strip())
            redirect_url = urlparse.urljoin(r.request.url, redirect_path)
            logger.info(u"redirect_match! redirecting to {}".format(redirect_url))
            return redirect_url

    return None
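Callers are expected to loop on keep_redirecting() until it stops returning a new URL. A minimal usage sketch, assuming the helper name follow_soft_redirects, the hop cap, and that http_get's remaining arguments can be left at their defaults (all illustrative, not from the original module):

def follow_soft_redirects(url, publisher, max_hops=5):
    # chase javascript/meta "soft" redirects that requests won't follow on its own
    r = http_get(url, stream=True, publisher=publisher)
    for _ in range(max_hops):
        next_url = keep_redirecting(r, publisher)
        if not next_url:
            break  # no further soft redirect detected
        r = http_get(next_url, stream=True, publisher=publisher)
    return r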
def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because it is on our do not scrape list.".format(url))
            return

    try:
        self.r = http_get(url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        # now before reading the content, bail if it's too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # special exception for citeseer because we want the pdf link where
        # the copy is on the third party repo, not the cached link, if we can get it
        if url and u"citeseerx.ist.psu.edu/" in url:
            matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
            if matches:
                pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")

        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")

        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because we would rather get a PDF
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a .doc download link {} [{}]".format(
                    get_link_target(doc_link.href, self.r.url), url))
            self.scraped_open_metadata_url = url
            return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self
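DuckLink is constructed above but not defined in this listing. Judging only from how it is used (built from an href and an anchor label, then read via .href and .anchor), a minimal stand-in might look like this sketch:

class DuckLink(object):
    # bare container so hand-built links quack like the parsed ones
    def __init__(self, href, anchor):
        self.href = href
        self.anchor = anchor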
def scrape_for_fulltext_link(self):
    landing_url = self.url

    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} says it is open".format(landing_url))

    start = time()
    try:
        self.r = http_get(landing_url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)
        resolved_landing_url = self.r.url

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # is unauthorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                    self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
            return

        # example 10.1007/978-3-642-01445-1
        if u"crossref.org/_deleted-doi/" in self.r.url:
            logger.info(u"this is a deleted doi")
            return

        # if our landing_url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(landing_url))
            self.scraped_pdf_url = landing_url
            self.open_version_source_string = "open (via free pdf)"
            # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"landing page is not a PDF for {}. continuing more checks".format(landing_url))

        # get the HTML tree
        page = self.r.content_small()

        # remove script tags
        try:
            soup = BeautifulSoup(page, 'html.parser')
            [script.extract() for script in soup('script')]
            page = str(soup)
        except HTMLParseError as e:
            logger.error(u'error parsing html, skipped script removal: {}'.format(e))

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = self.find_pdf_link(page)
        if pdf_download_link is not None:
            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via free pdf)"

        # now look and see if it is not just free, but open!
        says_open_url_snippet_patterns = [
            ('projecteuclid.org/', u'<strong>Full-text: Open access</strong>'),
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open access</div>'),
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open archive</div>'),
        ]
        for (url_snippet, pattern) in says_open_url_snippet_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if url_snippet in resolved_landing_url.lower() and matches:
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"
                self.scraped_license = "implied-oa"

        says_open_access_patterns = [
            ("Informa UK Limited", u"/accessOA.png"),
            ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
            ("Informa UK Limited", u"/accessOA.png"),
            ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
            ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
        ]
        for (publisher, pattern) in says_open_access_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
            if self.is_same_publisher(publisher) and matches:
                self.scraped_license = "implied-oa"
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"

        license_patterns = [
            ur"(creativecommons.org/licenses/[a-z\-]+)",
            u"distributed under the terms (.*) which permits",
            u"This is an open access article under the terms (.*) which permits",
            u"This is an open access article published under (.*) which permits",
            u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
        ]
        for pattern in license_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if matches:
                self.scraped_license = find_normalized_license(matches[0])
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via page says license)"

        if self.is_open:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return True
        else:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return False

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
        logger.info(self.error)
        return False
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return False
def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"
    ]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because it is on our do not scrape list.".format(url))
            return

    try:
        self.r = http_get(url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        # now before reading the content, bail if it's too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        if page and u"osf-cookie" in unicode(page, "utf-8"):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")

        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because we would rather get a PDF
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a .doc download link {} [{}]".format(
                    get_link_target(doc_link.href, self.r.url), url))
            self.scraped_open_metadata_url = url
            return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self
def scrape_for_fulltext_link(self):
    landing_url = self.url

    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} says it is open".format(landing_url))

    start = time()
    try:
        self.r = http_get(landing_url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # is unauthorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                    self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
            return

        # example 10.1007/978-3-642-01445-1
        if u"crossref.org/_deleted-doi/" in self.r.url:
            logger.info(u"this is a deleted doi")
            return

        # if our landing_url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(landing_url))
            self.scraped_pdf_url = landing_url
            self.open_version_source_string = "open (via free pdf)"
            # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"landing page is not a PDF for {}. continuing more checks".format(landing_url))

        # get the HTML tree
        page = self.r.content_small()

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = self.find_pdf_link(page)
        if pdf_download_link is not None:
            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via free pdf)"

        # now look and see if it is not just free, but open!
        license_patterns = [
            u"(creativecommons.org\/licenses\/[a-z\-]+)",
            u"distributed under the terms (.*) which permits",
            u"This is an open access article under the terms (.*) which permits",
            u"This is an open access article published under (.*) which permits",
            u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
        ]
        for pattern in license_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if matches:
                self.scraped_license = find_normalized_license(matches[0])
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via page says license)"

        says_open_url_snippet_patterns = [
            ("projecteuclid.org/", u'<strong>Full-text: Open access</strong>'),
        ]
        for (url_snippet, pattern) in says_open_url_snippet_patterns:
            matches = re.findall(pattern, self.r.content_small(), re.IGNORECASE)
            if url_snippet in self.r.request.url.lower() and matches:
                self.scraped_open_metadata_url = self.r.request.url
                self.open_version_source_string = "open (via page says Open Access)"
                self.scraped_license = "implied-oa"

        says_open_access_patterns = [
            ("Informa UK Limited", u"/accessOA.png"),
            ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
            ("Informa UK Limited", u"/accessOA.png"),
            ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
            ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
        ]
        for (publisher, pattern) in says_open_access_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
            if self.is_same_publisher(publisher) and matches:
                self.scraped_license = "implied-oa"
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"

        if self.is_open:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return True
        else:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return False

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
        logger.info(self.error)
        return False
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return False
def scrape_for_fulltext_link(self, find_pdf_link=True):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"
    ]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because it is on our do not scrape list.".format(url))
            return

    try:
        self.r = http_get(url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)
        resolved_url = self.r.url

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        if is_a_word_doc(self.r):
            if DEBUG_SCRAPING:
                logger.info(u"this is a word doc. success! [{}]".format(url))
            self.scraped_open_metadata_url = url
            return

        # now before reading the content, bail if it's too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # remove script tags
        try:
            soup = BeautifulSoup(page, 'html.parser')
            [script.extract() for script in soup('script')]
            page = str(soup)
        except HTMLParseError as e:
            logger.error(u'error parsing html, skipped script removal: {}'.format(e))

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # special exception for citeseer because we want the pdf link where
        # the copy is on the third party repo, not the cached link, if we can get it
        if url and u"citeseerx.ist.psu.edu/" in url:
            matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
            if matches:
                pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")

        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")

        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because we would rather get a PDF
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            absolute_doc_url = get_link_target(doc_link.href, resolved_url)
            if DEBUG_SCRAPING:
                logger.info(u"found a possible .doc download link [{}]".format(absolute_doc_url))

            if self.gets_a_word_doc(doc_link, self.r.url):
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this is a word doc. [{}]".format(absolute_doc_url))
                self.scraped_open_metadata_url = url
                return
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this ain't a word doc. [{}]".format(absolute_doc_url))

        bhl_link = find_bhl_view_link(resolved_url, page)
        if bhl_link is not None:
            logger.info('found a BHL document link: {}'.format(get_link_target(bhl_link.href, resolved_url)))
            self.scraped_open_metadata_url = url
            return

        if _trust_repo_license(resolved_url) and self.scraped_license:
            logger.info(u'trusting license {}'.format(self.scraped_license))
            self.scraped_open_metadata_url = self.url

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self
def scrape_for_fulltext_link(self, find_pdf_link=True):
    landing_url = self.url

    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} says it is open".format(landing_url))

    start = time()
    try:
        self.r = http_get(landing_url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)
        resolved_landing_url = self.r.url

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # is unauthorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                    self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
            return

        # example 10.1007/978-3-642-01445-1
        if u"crossref.org/_deleted-doi/" in resolved_landing_url:
            logger.info(u"this is a deleted doi")
            return

        # if our landing_url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(landing_url))
            self.scraped_pdf_url = landing_url
            self.open_version_source_string = "open (via free pdf)"
            # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"landing page is not a PDF for {}. continuing more checks".format(landing_url))

        # get the HTML tree
        page = self.r.content_small()

        # remove script tags
        try:
            soup = BeautifulSoup(page, 'html.parser')
            [script.extract() for script in soup('script')]
            page = str(soup)
        except HTMLParseError as e:
            logger.error(u'error parsing html, skipped script removal: {}'.format(e))

        # Look for a pdf link. If we find one, look for a license.
        pdf_download_link = self.find_pdf_link(page) if find_pdf_link else None

        if pdf_download_link is not None:
            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via free pdf)"

                # set the license if we can find one
                scraped_license = find_normalized_license(page)
                if scraped_license:
                    self.scraped_license = scraped_license

        # Look for patterns that indicate availability but not necessarily openness
        # and make this a bronze location.
        bronze_url_snippet_patterns = [
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open archive</div>'),
        ]
        for (url_snippet, pattern) in bronze_url_snippet_patterns:
            if url_snippet in resolved_landing_url.lower() and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via free article)"

        bronze_publisher_patterns = [
            ("New England Journal of Medicine (NEJM/MMS)", u'<meta content="yes" name="evt-free"'),
            ("Massachusetts Medical Society", u'<meta content="yes" name="evt-free"'),
        ]
        for (publisher, pattern) in bronze_publisher_patterns:
            if self.is_same_publisher(publisher) and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via free article)"

        # Look for some license-like patterns that make this a hybrid location.
        hybrid_url_snippet_patterns = [
            ('projecteuclid.org/', u'<strong>Full-text: Open access</strong>'),
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open access</div>'),
            ('journals.ametsoc.org/', ur'src="/templates/jsp/_style2/_ams/images/access_free\.gif"'),
            ('apsjournals.apsnet.org', ur'src="/products/aps/releasedAssets/images/open-access-icon\.png"'),
            ('psychiatriapolska.pl', u'is an Open Access journal:'),
            ('journals.lww.com', u'<span class="[^>]*ejp-indicator--free'),
        ]
        for (url_snippet, pattern) in hybrid_url_snippet_patterns:
            if url_snippet in resolved_landing_url.lower() and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"
                self.scraped_license = "implied-oa"

        hybrid_publisher_patterns = [
            ("Informa UK Limited", u"/accessOA.png"),
            ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
            ("Informa UK Limited", u"/accessOA.png"),
            ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
            ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
        ]
        for (publisher, pattern) in hybrid_publisher_patterns:
            if self.is_same_publisher(publisher) and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"
                self.scraped_license = "implied-oa"

        # Look for more license-like patterns that make this a hybrid location.
        # Extract the specific license if present.
        license_patterns = [
            ur"(creativecommons.org/licenses/[a-z\-]+)",
            u"distributed under the terms (.*) which permits",
            u"This is an open access article under the terms (.*) which permits",
            u"This is an open access article published under (.*) which permits",
            u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
        ]
        for pattern in license_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if matches:
                self.scraped_open_metadata_url = landing_url
                normalized_license = find_normalized_license(matches[0])
                self.scraped_license = normalized_license or 'implied-oa'
                if normalized_license:
                    self.open_version_source_string = 'open (via page says license)'
                else:
                    self.open_version_source_string = 'open (via page says Open Access)'

        if self.is_open:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return True
        else:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return False

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
        logger.info(self.error)
        return False
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return False
def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"
    ]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because it is on our do not scrape list.".format(url))
            return

    try:
        with closing(http_get(url, stream=True, related_pub=self.related_pub,
                              ask_slowly=self.ask_slowly)) as self.r:

            if self.r.status_code != 200:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"is not a PDF for {}. continuing more checks".format(url))

            # now before reading the content, bail if it's too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            # special exception for citeseer because we want the pdf link where
            # the copy is on the third party repo, not the cached link, if we can get it
            if u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
                if matches:
                    self.scraped_pdf_url = unicode(matches[0], "utf-8")
                    self.scraped_open_metadata_url = url
                    return

            pdf_download_link = self.find_pdf_link(page)
            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a PDF
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self