Example 1
def keep_redirecting(r, publisher):
    # don't read r.content unless we have to, because it will cause us to download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if ("content-length" in r.headers):
        # manually follow javascript if that's all that's in the payload
        file_size = int(r.headers["content-length"])
        if file_size < 500:
            matches = re.findall(ur"<script>location.href='(.*)'</script>", r.content_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith(u"/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(ur"OvidAN = '(.*?)';", r.content_small(), re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(an_number)
            return redirect_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*?url=["\'](.*?)["\']', re.IGNORECASE)
    redirect_match = redirect_re.findall(r.content_small())
    if redirect_match:
        redirect_path = HTMLParser().unescape(redirect_match[0].strip())
        redirect_url = urlparse.urljoin(r.request.url, redirect_path)
        logger.info(u"redirect_match! redirecting to {}".format(redirect_url))
        return redirect_url

    return None
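
These snippets come from a larger scraping codebase, so they assume helpers such as content_small(), get_link_target(), is_same_publisher() and a module-level logger. As a rough illustration of how a function like keep_redirecting might be driven, here is a minimal sketch of a fetch loop, assuming a plain requests session and a response object compatible with the calls above; fetch_with_custom_redirects and MAX_HOPS are made up for this sketch.

import requests

MAX_HOPS = 5  # arbitrary guard against redirect loops

def fetch_with_custom_redirects(url, publisher=None):
    # Follow normal HTTP redirects via requests, then let keep_redirecting()
    # chase the JavaScript and meta-refresh redirects that requests can't see.
    session = requests.Session()
    response = None
    for _ in range(MAX_HOPS):
        response = session.get(url, allow_redirects=True, timeout=30)
        next_url = keep_redirecting(response, publisher)  # assumes a compatible response object
        if not next_url:
            break
        url = next_url
    return response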
Example 2
def is_a_pdf_page(response, page_publisher):
    if is_pdf_from_header(response):
        if DEBUG_SCRAPING:
            logger.info(u"http header says this is a PDF {}".format(
                response.request.url)
            )
        return True

    # everything below here needs to look at the content
    # so bail here if the page is too big
    if is_response_too_large(response):
        if DEBUG_SCRAPING:
            logger.info(u"response is too big for more checks in is_a_pdf_page")
        return False

    content = response.content_big()

    # PDF files start with this magic marker
    if re.match(u"%PDF", content):
        return True

    if page_publisher:
        says_free_publisher_patterns = [
            ("Wiley-Blackwell", u'<span class="freeAccess" title="You have free access to this content">'),
            ("Wiley-Blackwell", u'<iframe id="pdfDocument"'),
            ("JSTOR", ur'<li class="download-pdf-button">.*Download PDF.*</li>'),
            ("Institute of Electrical and Electronics Engineers (IEEE)",
             ur'<frame src="http://ieeexplore.ieee.org/.*?pdf.*?</frameset>'),
            ("IOP Publishing", ur'Full Refereed Journal Article')
        ]
        for (publisher, pattern) in says_free_publisher_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
            if is_same_publisher(page_publisher, publisher) and matches:
                return True
    return False
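
The header check that is_a_pdf_page relies on is not shown in these examples. A minimal sketch of what is_pdf_from_header might look like, assuming it only inspects the response headers (the real implementation may differ):

def is_pdf_from_header(response):
    # Treat the response as a PDF when the server labels it as one, either
    # via the Content-Type header or a .pdf filename in Content-Disposition.
    content_type = response.headers.get("content-type", "")
    if "application/pdf" in content_type.lower():
        return True
    content_disposition = response.headers.get("content-disposition", "")
    return ".pdf" in content_disposition.lower()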
Example 3
def keep_redirecting(r, publisher):
    # don't read r.content unless we have to, because it will cause us to download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if "content-length" in r.headers:
        # manually follow javascript if that's all that's in the payload
        file_size = int(r.headers["content-length"])
        if file_size < 500:
            matches = re.findall(r"<script>location.href='(.*)'</script>",
                                 r.text_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith("/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(
            publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(r"OvidAN = '(.*?)';", r.text_small(),
                             re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(
                an_number)
            return redirect_url

    # 10.1097/01.xps.0000491010.82675.1c
    hostname = urlparse(r.url).hostname
    if hostname and hostname.endswith('ovid.com'):
        matches = re.findall(r'var journalURL = "(.*?)";', r.text_small(),
                             re.IGNORECASE)
        if matches:
            journal_url = matches[0]
            logger.info(
                'ovid journal match. redirecting to {}'.format(journal_url))
            return journal_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*http-equiv="?refresh"?[^>]*>',
                             re.IGNORECASE | re.DOTALL)
    redirect_match = redirect_re.findall(r.text_small())
    if redirect_match:
        redirect = redirect_match[0]
        logger.info('found a meta refresh element: {}'.format(redirect))
        url_re = re.compile('url=["\']?([^">\']*)', re.IGNORECASE | re.DOTALL)
        url_match = url_re.findall(redirect)

        if url_match:
            redirect_path = html.unescape(url_match[0].strip())
            redirect_url = urljoin(r.request.url, redirect_path)
            if not redirect_url.endswith(
                    'Error/JavaScript.html') and not redirect_url.endswith(
                        '/?reason=expired'):
                logger.info(
                    "redirect_match! redirecting to {}".format(redirect_url))
                return redirect_url

    return None
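
The two-stage meta-refresh parsing above (first find the meta element, then pull the url= value out of it) can be exercised in isolation. A small, self-contained illustration using a made-up HTML snippet rather than anything from the original codebase:

import html
import re
from urllib.parse import urljoin

sample = '<meta http-equiv="refresh" content="0; url=\'/article/full.pdf\'">'
redirect_re = re.compile('<meta[^>]*http-equiv="?refresh"?[^>]*>', re.IGNORECASE | re.DOTALL)
url_re = re.compile('url=["\']?([^">\']*)', re.IGNORECASE | re.DOTALL)

element = redirect_re.findall(sample)[0]
redirect_path = html.unescape(url_re.findall(element)[0].strip())
print(urljoin("https://example.org/article", redirect_path))
# prints: https://example.org/article/full.pdf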
Example 4
def keep_redirecting(r, publisher):
    # don't read r.content unless we have to, because it will cause us to download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if ("content-length" in r.headers):
        # manually follow javascript if that's all that's in the payload
        file_size = int(r.headers["content-length"])
        if file_size < 500:
            matches = re.findall(ur"<script>location.href='(.*)'</script>",
                                 r.content_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith(u"/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(
            publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(ur"OvidAN = '(.*?)';", r.content_small(),
                             re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(
                an_number)
            return redirect_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*http-equiv="refresh"[^>]*>',
                             re.IGNORECASE | re.DOTALL)
    redirect_match = redirect_re.findall(r.content_small())
    if redirect_match:
        redirect = redirect_match[0]
        logger.info('found a meta refresh element: {}'.format(redirect))
        url_re = re.compile('url=["\'](.*?)["\']', re.IGNORECASE | re.DOTALL)
        url_match = url_re.findall(redirect)
        if url_match:
            redirect_path = HTMLParser().unescape(url_match[0].strip())
            redirect_url = urlparse.urljoin(r.request.url, redirect_path)
            logger.info(
                u"redirect_match! redirecting to {}".format(redirect_url))
            return redirect_url

    return None
Example 5
def is_same_publisher(self, publisher):
    return is_same_publisher(self.related_pub_publisher, publisher)
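
The module-level is_same_publisher() that this method (and the examples above) delegate to is not shown. A minimal sketch, assuming it reduces to a case- and punctuation-insensitive comparison of the two publisher names; the real matching logic may be more forgiving.

import re

def _normalize_publisher(name):
    # Hypothetical helper: lowercase and keep only letters and digits so that
    # minor formatting differences don't break the comparison.
    return re.sub(r"[^a-z0-9]", "", (name or "").lower())

def is_same_publisher(publisher_a, publisher_b):
    # Both names must be present; a missing publisher never matches.
    if not publisher_a or not publisher_b:
        return False
    return _normalize_publisher(publisher_a) == _normalize_publisher(publisher_b)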