Example #1
    def set_local_lookup_oa(self):
        start_time = time()

        evidence = None
        fulltext_url = self.url

        license = "unknown"
        if oa_local.is_open_via_doaj_issn(self.issns):
            license = oa_local.is_open_via_doaj_issn(self.issns)
            evidence = "oa journal (via issn in doaj)"
        elif oa_local.is_open_via_doaj_journal(self.journal):
            license = oa_local.is_open_via_doaj_journal(self.journal)
            evidence = "oa journal (via journal title in doaj)"
        elif oa_local.is_open_via_datacite_prefix(self.doi):
            evidence = "oa repository (via datacite prefix)"
        elif oa_local.is_open_via_doi_fragment(self.doi):
            evidence = "oa repository (via doi prefix)"
        elif oa_local.is_open_via_url_fragment(self.url):
            evidence = "oa repository (via url prefix)"
        elif oa_local.is_open_via_license_urls(self.crossref_license_urls):
            freetext_license = oa_local.is_open_via_license_urls(self.crossref_license_urls)
            license = oa_local.find_normalized_license(freetext_license)
            evidence = "hybrid journal (via crossref license url)"  # oa_color depends on this including the word "hybrid"

        if evidence:
            self.fulltext_url = fulltext_url
            self.evidence = evidence
            self.license = license
        if self.fulltext_url and self.license and self.license != "unknown":
            self.response_done = True
Example #2
    def ask_local_lookup(self):
        start_time = time()

        evidence = None
        fulltext_url = self.url

        license = "unknown"
        if oa_local.is_open_via_doaj_issn(self.issns):
            license = oa_local.is_open_via_doaj_issn(self.issns)
            evidence = "oa journal (via issn in doaj)"
        elif oa_local.is_open_via_doaj_journal(self.journal):
            license = oa_local.is_open_via_doaj_journal(self.journal)
            evidence = "oa journal (via journal title in doaj)"
        elif oa_local.is_open_via_datacite_prefix(self.doi):
            evidence = "oa repository (via datacite prefix)"
        elif oa_local.is_open_via_doi_fragment(self.doi):
            evidence = "oa repository (via doi prefix)"
        elif oa_local.is_open_via_url_fragment(self.url):
            evidence = "oa repository (via url prefix)"
        elif oa_local.is_open_via_license_urls(self.crossref_license_urls):
            freetext_license = oa_local.is_open_via_license_urls(self.crossref_license_urls)
            license = oa_local.find_normalized_license(freetext_license)
            evidence = "hybrid journal (via crossref license url)"  # oa_color depends on this including the word "hybrid"

        if evidence:
            my_version = OpenVersion()
            my_version.metadata_url = fulltext_url
            my_version.license = license
            my_version.source = evidence
            my_version.doi = self.doi
            self.open_versions.append(my_version)
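Both lookups above fall back on oa_local.find_normalized_license to turn a freetext license statement into a short key. A minimal sketch of that kind of normalization, assuming a simple substring lookup table (the table below is illustrative, not the library's actual list):

def find_normalized_license(text):
    # sketch only: map freetext rights statements to short license keys
    if not text:
        return None
    normalized = text.lower()
    # order matters: check the more specific CC variants first
    license_lookups = [
        (u"creativecommons.org/licenses/by-nc-nd", u"cc-by-nc-nd"),
        (u"creativecommons.org/licenses/by-nc-sa", u"cc-by-nc-sa"),
        (u"creativecommons.org/licenses/by-nc", u"cc-by-nc"),
        (u"creativecommons.org/licenses/by-sa", u"cc-by-sa"),
        (u"creativecommons.org/licenses/by", u"cc-by"),
        (u"creativecommons.org/publicdomain/zero", u"cc0"),
    ]
    for (fragment, license_key) in license_lookups:
        if fragment in normalized:
            return license_key
    return None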
Example #3
    def update_with_local_info(self):
        scrape_version_old = self.scrape_version
        scrape_license_old = self.scrape_license

        # if this repo has told us they will never have submitted, set default to be accepted
        if self.endpoint and self.endpoint.policy_promises_no_submitted and self.scrape_version != "publishedVersion":
            self.scrape_version = "acceptedVersion"

        # now look at the pmh record
        if self.pmh_record:
            # trust accepted in a variety of formats
            accepted_patterns = [
                re.compile(ur"accepted.?version",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"version.?accepted",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"accepted.?manuscript",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"<dc:type>peer.?reviewed</dc:type>",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL)
            ]
            for pattern in accepted_patterns:
                if pattern.findall(self.pmh_record.api_raw):
                    self.scrape_version = "acceptedVersion"
            # print u"version for is {}".format(self.scrape_version)

            # trust a strict version of published version
            published_patterns = [
                re.compile(ur"<dc:type>.*publishedVersion</dc:type>",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"<free_to_read>.*published.*</free_to_read>",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL)
            ]
            for published_pattern in published_patterns:
                if published_pattern.findall(self.pmh_record.api_raw):
                    self.scrape_version = "publishedVersion"

            # get license if it is in pmh record
            rights_pattern = re.compile(
                ur"<dc:rights>(.*)</dc:rights>",
                re.IGNORECASE | re.MULTILINE | re.DOTALL)
            rights_matches = rights_pattern.findall(self.pmh_record.api_raw)
            for rights_text in rights_matches:
                open_license = find_normalized_license(rights_text)
                # only overwrite it if there is one, so doesn't overwrite anything scraped
                if open_license:
                    self.scrape_license = open_license

            self.scrape_version = _scrape_version_override().get(
                self.pmh_record.pmh_id, self.scrape_version)

        if scrape_version_old != self.scrape_version or scrape_license_old != self.scrape_license:
            self.updated = datetime.datetime.utcnow().isoformat()
            print u"based on OAI-PMH metadata, updated {} {} for {} {}".format(
                self.scrape_version, self.scrape_license, self.url, self.id)
            return True

        # print u"based on metadata, assuming {} {} for {} {}".format(self.scrape_version, self.scrape_license, self.url, self.id)

        return False
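The _scrape_version_override() helper used above is not part of the snippet. A sketch of what it plausibly returns, assuming it is a hand-maintained mapping from PMH record ids to a corrected version string (the ids below are hypothetical placeholders):

def _scrape_version_override():
    # hypothetical entries; .get(pmh_id, scrape_version) above leaves
    # any record not listed here unchanged
    return {
        u"oai:example.repo.org:1234": u"publishedVersion",
        u"oai:example.repo.org:5678": u"acceptedVersion",
    }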
Example #4
    def set_info_for_pmc_page(self):
        if not self.pmcid:
            return

        result_list = query_pmc(self.pmcid)
        if not result_list:
            return
        result = result_list[0]
        has_pdf = result.get("hasPDF", None)
        is_author_manuscript = result.get("authMan", None)
        is_open_access = result.get("isOpenAccess", None)
        raw_license = result.get("license", None)

        self.scrape_metadata_url = u"http://europepmc.org/articles/{}".format(
            self.pmcid)
        if has_pdf == u"Y":
            self.scrape_pdf_url = u"http://europepmc.org/articles/{}?pdf=render".format(
                self.pmcid)
        if is_author_manuscript == u"Y":
            self.scrape_version = u"acceptedVersion"
        else:
            self.scrape_version = u"publishedVersion"
        if raw_license:
            self.scrape_license = find_normalized_license(raw_license)
        elif is_open_access == "Y":
            self.scrape_license = u"implied-oa"
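query_pmc() is not shown in Example #4, but Examples #5 and #6 below inline what appears to be the same Europe PMC request, so a sketch of the helper might look like this (error handling omitted):

def query_pmc(pmcid):
    # same Europe PMC search call that Examples #5 and #6 make inline
    url_template = u"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={}&resulttype=core&format=json&tool=oadoi"
    url = url_template.format(pmcid)
    r = http_get(url)
    data = r.json()
    return data["resultList"]["result"]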
Example #5
    def set_info_for_pmc_page(self):
        if not self.pmcid:
            return

        url_template = u"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={}&resulttype=core&format=json&tool=oadoi"
        url = url_template.format(self.pmcid)

        # try:
        r = http_get(url)
        data = r.json()
        result_list = data["resultList"]["result"]
        if not result_list:
            return
        result = result_list[0]
        has_pdf = result.get("hasPDF", None)
        is_author_manuscript = result.get("authMan", None)
        is_open_access = result.get("isOpenAccess", None)
        raw_license = result.get("license", None)

        self.scrape_metadata_url = u"http://europepmc.org/articles/{}".format(self.pmcid)
        if has_pdf == u"Y":
            self.scrape_pdf_url = u"http://europepmc.org/articles/{}?pdf=render".format(self.pmcid)
        if is_author_manuscript == u"Y":
            self.scrape_version = u"acceptedVersion"
        else:
            self.scrape_version = u"publishedVersion"
        if raw_license:
            self.scrape_license = find_normalized_license(raw_license)
        elif is_open_access == "Y":
            self.scrape_license = u"implied-oa"
Example #6
    def set_info_for_pmc_page(self):
        if not self.pmcid:
            return

        url_template = u"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={}&resulttype=core&format=json&tool=oadoi"
        url = url_template.format(self.pmcid)

        # try:
        r = http_get(url)
        data = r.json()
        result_list = data["resultList"]["result"]
        if not result_list:
            return
        result = result_list[0]
        has_pdf = result.get("hasPDF", None)
        is_author_manuscript = result.get("authMan", None)
        is_open_access = result.get("isOpenAccess", None)
        raw_license = result.get("license", None)

        self.scrape_metadata_url = u"http://europepmc.org/articles/{}".format(
            self.pmcid)
        if has_pdf == u"Y":
            self.scrape_pdf_url = u"http://europepmc.org/articles/{}?pdf=render".format(
                self.pmcid)
        if is_author_manuscript == u"Y":
            self.scrape_version = u"acceptedVersion"
        else:
            self.scrape_version = u"publishedVersion"
        if raw_license:
            self.scrape_license = find_normalized_license(raw_license)
        elif is_open_access == "Y":
            self.scrape_license = u"implied-oa"
Example #7
    def set_version_and_license(self, r=None):
        self.updated = datetime.datetime.utcnow().isoformat()

        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # set as default
        self.scrape_version = "submittedVersion"

        is_updated = self.update_with_local_info()

        # now try to see what we can get out of the pdf itself

        if not r:
            logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
            return

        try:
            # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
            if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
                self.scrape_version = "publishedVersion"

            text = convert_pdf_to_txt(r, max_pages=25)

            # logger.info(text)

            if text and self.scrape_version == "submittedVersion":
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                    re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                    re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL)
                    ]

                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                        self.scrape_version = "publishedVersion"

            if not self.scrape_license:
                open_license = find_normalized_license(text)
                if open_license:
                    logger.info(u'found license in PDF: {}'.format(open_license))
                    self.scrape_license = open_license

        except Exception as e:
            logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
            self.error += u"Exception doing convert_pdf_to_txt!"
            logger.info(self.error)

        logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
Example #8
    def set_version_and_license(self, r=None):

        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # set as default
        self.scrape_version = "submittedVersion"

        if not r:
            return

        try:
            # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
            if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(),
                          re.IGNORECASE):
                self.scrape_version = "publishedVersion"

            text = convert_pdf_to_txt(r)
            # logger.info(text)
            if text and self.scrape_version == "submittedVersion":
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                    re.compile(ur"copyright \d{4}", re.IGNORECASE),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(
                        ur"This article is distributed under the terms of the Creative Commons",
                        re.IGNORECASE),
                    re.compile(ur"this is an open access article",
                               re.IGNORECASE)
                ]

                for pattern in patterns:
                    matches = pattern.findall(text)
                    if matches:
                        self.scrape_version = "publishedVersion"

            logger.info(u"returning {} with scrape_version: {}".format(
                self.url, self.scrape_version))

            open_license = find_normalized_license(text)
            if open_license:
                self.scrape_license = open_license

        except Exception as e:
            logger.exception(u"exception in convert_pdf_to_txt for {}".format(
                self.url))
            self.error += u"Exception doing convert_pdf_to_txt!"
            logger.info(self.error)
            pass
Example #9
    def update_with_local_info(self):
        scrape_version_old = self.scrape_version
        scrape_license_old = self.scrape_license

        # if this repo has told us they will never have submitted, set default to be accepted
        if self.endpoint and self.endpoint.policy_promises_no_submitted and self.scrape_version != "publishedVersion":
            self.scrape_version = "acceptedVersion"

        # now look at the pmh record
        if self.pmh_record:
            # trust accepted in a variety of formats
            accepted_patterns = [
                re.compile(ur"accepted.?version", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"version.?accepted", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"accepted.?manuscript", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"<dc:type>peer.?reviewed</dc:type>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
                ]
            for pattern in accepted_patterns:
                if pattern.findall(self.pmh_record.api_raw):
                    self.scrape_version = "acceptedVersion"
            # print u"version for is {}".format(self.scrape_version)

            # trust a strict version of published version
            published_patterns = [
                re.compile(ur"<dc:type>.*publishedVersion</dc:type>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"<dc:type\.version>.*publishedVersion</dc:type\.version>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"<free_to_read>.*published.*</free_to_read>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
            ]
            for published_pattern in published_patterns:
                if published_pattern.findall(self.pmh_record.api_raw):
                    self.scrape_version = "publishedVersion"

            # get license if it is in pmh record
            rights_pattern = re.compile(ur"<dc:rights>(.*)</dc:rights>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
            rights_matches = rights_pattern.findall(self.pmh_record.api_raw)
            rights_license_pattern = re.compile(ur"<dc:rights\.license>(.*)</dc:rights\.license>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
            rights_matches.extend(rights_license_pattern.findall(self.pmh_record.api_raw))

            for rights_text in rights_matches:
                open_license = find_normalized_license(rights_text)
                # only overwrite it if there is one, so doesn't overwrite anything scraped
                if open_license:
                    self.scrape_license = open_license

            self.scrape_version = _scrape_version_override().get(self.pmh_record.pmh_id, self.scrape_version)

        if self.scrape_pdf_url and re.search(ur'^https?://rke\.abertay\.ac\.uk', self.scrape_pdf_url):
            if re.search(ur'Publishe[dr]_?\d\d\d\d\.pdf$', self.scrape_pdf_url):
                self.scrape_version = "publishedVersion"
Example #10
    def update_with_local_info(self):
        scrape_version_old = self.scrape_version
        scrape_license_old = self.scrape_license

        # if this repo has told us they will never have submitted, set default to be accepted
        if self.endpoint and self.endpoint.policy_promises_no_submitted:
            self.scrape_version = "acceptedVersion"

        # now look at the pmh record
        if self.pmh_record:
            # trust accepted in a variety of formats
            accepted_patterns = [
                re.compile(ur"accepted.?version", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"version.?accepted", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"accepted.?manuscript", re.IGNORECASE | re.MULTILINE | re.DOTALL)
                ]
            for pattern in accepted_patterns:
                if pattern.findall(self.pmh_record.api_raw):
                    self.scrape_version = "acceptedVersion"
            # print u"version for is {}".format(self.scrape_version)

            # trust a strict version of published version
            published_patterns = [
                re.compile(ur"<dc:type>.*publishedVersion</dc:type>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"<free_to_read>.*published.*</free_to_read>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
            ]
            for published_pattern in published_patterns:
                if published_pattern.findall(self.pmh_record.api_raw):
                    self.scrape_version = "publishedVersion"

            # get license if it is in pmh record
            rights_pattern = re.compile(ur"<dc:rights>(.*)</dc:rights>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
            rights_matches = rights_pattern.findall(self.pmh_record.api_raw)
            for rights_text in rights_matches:
                open_license = find_normalized_license(rights_text)
                # only overwrite it if there is one, so doesn't overwrite anything scraped
                if open_license:
                    self.scrape_license = open_license

        if scrape_version_old != self.scrape_version or scrape_license_old != self.scrape_license:
            self.updated = datetime.datetime.utcnow().isoformat()
            print u"based on OAI-PMH metadata, updated {} {} for {} {}".format(self.scrape_version, self.scrape_license, self.url, self.id)
            return True

        # print u"based on metadata, assuming {} {} for {} {}".format(self.scrape_version, self.scrape_license, self.url, self.id)

        return False
Example #11
    def set_version_and_license(self, r=None):

        # set as default
        self.scrape_version = "submittedVersion"

        if self.is_pmc:
            print "implement PMC version properly"
            print 1 / 0
            # todo

        if r:
            try:
                text = convert_pdf_to_txt(r)
                # logger.info(text)
                if text:
                    patterns = [
                        re.compile(ur"©.?\d{4}", re.UNICODE),
                        re.compile(ur"copyright \d{4}", re.IGNORECASE),
                        re.compile(ur"all rights reserved", re.IGNORECASE),
                        re.compile(
                            ur"This article is distributed under the terms of the Creative Commons",
                            re.IGNORECASE),
                        re.compile(ur"this is an open access article",
                                   re.IGNORECASE)
                    ]

                    for pattern in patterns:
                        matches = pattern.findall(text)
                        if matches:
                            self.scrape_version = "publishedVersion"

                    logger.info(u"returning with scrape_version={}".format(
                        self.scrape_version))

                    open_license = find_normalized_license(text)
                    if open_license:
                        self.scrape_license = open_license

            except Exception as e:
                self.error += u"Exception doing convert_pdf_to_txt on {}! investigate! {}".format(
                    self.scrape_pdf_url,
                    unicode(e.message).encode("utf-8"))
                logger.info(self.error)
                pass
Example #12
    def set_fulltext_urls(self):

        # first set license if there is one originally.  overwrite it later if scraped a better one.
        if "license" in self.doc and self.doc["license"]:
            self.license = oa_local.find_normalized_license(
                self.doc["license"])

        for my_webpage in self.open_webpages:
            if my_webpage.has_fulltext_url:
                response = {}
                self.fulltext_url_dicts += [{
                    "free_pdf_url": my_webpage.scraped_pdf_url,
                    "pdf_landing_page": my_webpage.url
                }]
                if not self.license or self.license == "unknown":
                    self.license = my_webpage.scraped_license
            else:
                print "{} has no fulltext url alas".format(my_webpage)

        if self.license == "unknown":
            self.license = None
Example #13
    def ask_local_lookup(self):
        start_time = time()

        evidence = None
        fulltext_url = self.url

        license = None

        if oa_local.is_open_via_doaj_issn(self.issns, self.year):
            license = oa_local.is_open_via_doaj_issn(self.issns, self.year)
            evidence = "oa journal (via issn in doaj)"
        elif not self.issns and oa_local.is_open_via_doaj_journal(
                self.all_journals, self.year):
            license = oa_local.is_open_via_doaj_journal(
                self.all_journals, self.year)
            evidence = "oa journal (via journal title in doaj)"
        elif oa_local.is_open_via_publisher(self.publisher):
            evidence = "oa journal (via publisher name)"
        elif oa_local.is_open_via_doi_fragment(self.doi):
            evidence = "oa repository (via doi prefix)"
        elif oa_local.is_open_via_url_fragment(self.url):
            evidence = "oa repository (via url prefix)"
        elif oa_local.is_open_via_license_urls(self.crossref_license_urls):
            freetext_license = oa_local.is_open_via_license_urls(
                self.crossref_license_urls)
            license = oa_local.find_normalized_license(freetext_license)
            # logger.info(u"freetext_license: {} {}".format(freetext_license, license))
            evidence = "open (via crossref license)"  # oa_color depends on this including the word "hybrid"

        if evidence:
            my_location = OpenLocation()
            my_location.metadata_url = fulltext_url
            my_location.license = license
            my_location.evidence = evidence
            my_location.updated = datetime.datetime.utcnow()
            my_location.doi = self.doi
            my_location.version = "publishedVersion"
            self.open_locations.append(my_location)
Example #14
    def set_info_for_pmc_page(self):
        if not self.pmcid:
            return

        result_list = query_pmc(self.pmcid)
        if not result_list:
            return
        result = result_list[0]
        has_pdf = result.get("hasPDF", None)
        is_author_manuscript = result.get("authMan", None)
        is_open_access = result.get("isOpenAccess", None)
        raw_license = result.get("license", None)

        self.scrape_metadata_url = u"http://europepmc.org/articles/{}".format(self.pmcid)
        if has_pdf == u"Y":
            self.scrape_pdf_url = u"http://europepmc.org/articles/{}?pdf=render".format(self.pmcid)
        if is_author_manuscript == u"Y":
            self.scrape_version = u"acceptedVersion"
        else:
            self.scrape_version = u"publishedVersion"
        if raw_license:
            self.scrape_license = find_normalized_license(raw_license)
        elif is_open_access == "Y":
            self.scrape_license = u"implied-oa"
Example #15
def get_fulltext_webpages_from_our_base_doc(doc):
    response = []

    license = doc.get("fulltext_license", None)

    # workaround for a bug there was in the normalized license
    license_string_in_doc = doc.get("license", "")
    if license_string_in_doc:
        if "orks not in the public domain" in license_string_in_doc:
            license = None
        if not license:
            license = find_normalized_license(license_string_in_doc)

    if "fulltext_url_dicts" in doc:
        for scrape_results in doc["fulltext_url_dicts"]:
            my_webpage = WebpageInOpenRepo(url=scrape_results.get("pdf_landing_page", None))
            my_webpage.scraped_pdf_url = scrape_results.get("free_pdf_url", None)
            my_webpage.scraped_open_metadata_url = scrape_results.get("pdf_landing_page", None)
            my_webpage.scraped_license = license
            response.append(my_webpage)

    # eventually these will have fulltext_url_dicts populated as well
    if doc["oa"] == 1:
        for url in get_urls_from_our_base_doc(doc):
            my_webpage = WebpageInOpenRepo(url=url)
            my_webpage.scraped_open_metadata_url = url

            # this will get handled when the oa1 urls get added
            pmcid_matches = re.findall(".*(PMC\d+).*", url)
            if pmcid_matches:
                pmcid = pmcid_matches[0]
                my_webpage.scraped_pdf_url = u"https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf".format(pmcid)

            my_webpage.scraped_license = license
            response.append(my_webpage)
    return response
Example #16
    def scrape_for_fulltext_link(self, find_pdf_link=True):
        landing_url = self.url

        if DEBUG_SCRAPING:
            logger.info(
                u"checking to see if {} says it is open".format(landing_url))

        start = time()
        try:
            self.r = http_get(landing_url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)
            resolved_landing_url = self.r.url

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                        self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
                return

            # example 10.1007/978-3-642-01445-1
            if u"crossref.org/_deleted-doi/" in resolved_landing_url:
                logger.info(u"this is a deleted doi")
                return

            # if our landing_url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(
                        u"this is a PDF. success! [{}]".format(landing_url))
                self.scraped_pdf_url = landing_url
                self.open_version_source_string = "open (via free pdf)"
                # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"landing page is not a PDF for {}.  continuing more checks"
                        .format(landing_url))

            # get the HTML tree
            page = self.r.content_small()

            # remove script tags
            try:
                soup = BeautifulSoup(page, 'html.parser')
                [script.extract() for script in soup('script')]
                page = str(soup)
            except HTMLParseError as e:
                logger.error(
                    u'error parsing html, skipped script removal: {}'.format(
                        e))

            # Look for a pdf link. If we find one, look for a license.

            pdf_download_link = self.find_pdf_link(
                page) if find_pdf_link else None

            if pdf_download_link is not None:
                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via free pdf)"

                    # set the license if we can find one
                    scraped_license = find_normalized_license(page)
                    if scraped_license:
                        self.scraped_license = scraped_license

            # Look for patterns that indicate availability but not necessarily openness and make this a bronze location.

            bronze_url_snippet_patterns = [
                ('sciencedirect.com/',
                 u'<div class="OpenAccessLabel">open archive</div>'),
            ]

            for (url_snippet, pattern) in bronze_url_snippet_patterns:
                if url_snippet in resolved_landing_url.lower() and re.findall(
                        pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via free article)"

            bronze_publisher_patterns = [
                ("New England Journal of Medicine (NEJM/MMS)",
                 u'<meta content="yes" name="evt-free"'),
                ("Massachusetts Medical Society",
                 u'<meta content="yes" name="evt-free"'),
            ]

            for (publisher, pattern) in bronze_publisher_patterns:
                if self.is_same_publisher(publisher) and re.findall(
                        pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via free article)"

            # Look for some license-like patterns that make this a hybrid location.

            hybrid_url_snippet_patterns = [
                ('projecteuclid.org/',
                 u'<strong>Full-text: Open access</strong>'),
                ('sciencedirect.com/',
                 u'<div class="OpenAccessLabel">open access</div>'),
                ('journals.ametsoc.org/',
                 ur'src="/templates/jsp/_style2/_ams/images/access_free\.gif"'
                 ),
                ('apsjournals.apsnet.org',
                 ur'src="/products/aps/releasedAssets/images/open-access-icon\.png"'
                 ),
                ('psychiatriapolska.pl', u'is an Open Access journal:'),
                ('journals.lww.com', u'<span class="[^>]*ejp-indicator--free'),
            ]

            for (url_snippet, pattern) in hybrid_url_snippet_patterns:
                if url_snippet in resolved_landing_url.lower() and re.findall(
                        pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            hybrid_publisher_patterns = [
                ("Informa UK Limited", u"/accessOA.png"),
                ("Oxford University Press (OUP)",
                 u"<i class='icon-availability_open'"),
                ("Institute of Electrical and Electronics Engineers (IEEE)",
                 ur'"isOpenAccess":true'),
                ("Institute of Electrical and Electronics Engineers (IEEE)",
                 ur'"openAccessFlag":"yes"'),
                ("Informa UK Limited", u"/accessOA.png"),
                ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
                ("Cambridge University Press (CUP)",
                 u'<span class="icon access open-access cursorDefault">'),
            ]

            for (publisher, pattern) in hybrid_publisher_patterns:
                if self.is_same_publisher(publisher) and re.findall(
                        pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            # Look for more license-like patterns that make this a hybrid location.
            # Extract the specific license if present.

            license_patterns = [
                ur"(creativecommons.org/licenses/[a-z\-]+)",
                u"distributed under the terms (.*) which permits",
                u"This is an open access article under the terms (.*) which permits",
                u"This is an open access article published under (.*) which permits",
                u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
            ]

            for pattern in license_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if matches:
                    self.scraped_open_metadata_url = landing_url
                    normalized_license = find_normalized_license(matches[0])
                    self.scraped_license = normalized_license or 'implied-oa'
                    if normalized_license:
                        self.open_version_source_string = 'open (via page says license)'
                    else:
                        self.open_version_source_string = 'open (via page says Open Access)'

            if self.is_open:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"we've decided this is open! took {} seconds [{}]".
                        format(elapsed(start), landing_url))
                return True
            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"we've decided this doesn't say open. took {} seconds [{}]"
                        .format(elapsed(start), landing_url))
                return False
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
            logger.info(self.error)
            return False
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return False
Example #17
    def set_version_and_license(self, r=None):

        self.updated = datetime.datetime.utcnow().isoformat()

        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # set as default
        self.scrape_version = "submittedVersion"

        is_updated = self.update_with_local_info()

        # now try to see what we can get out of the pdf itself

        if not r:
            logger.info(
                u"before scrape returning {} with scrape_version: {}, license {}"
                .format(self.url, self.scrape_version, self.scrape_license))
            return

        try:
            # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
            if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(),
                          re.IGNORECASE):
                self.scrape_version = "publishedVersion"

            text = convert_pdf_to_txt(r, max_pages=25)

            # logger.info(text)

            if text and self.scrape_version == "submittedVersion":
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                    re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                    re.compile(
                        ur"received.{0,100}revised.{0,100}accepted.{0,100}publication",
                        re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(
                        ur"This article is distributed under the terms of the Creative Commons",
                        re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(
                        ur"This article is licensed under a Creative Commons",
                        re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"this is an open access article",
                               re.IGNORECASE | re.MULTILINE | re.DOTALL)
                ]

                for pattern in patterns:
                    if pattern.findall(text):
                        self.scrape_version = "publishedVersion"

            if not self.scrape_license:
                open_license = find_normalized_license(text)
                if open_license:
                    self.scrape_license = open_license

        except Exception as e:
            logger.exception(u"exception in convert_pdf_to_txt for {}".format(
                self.url))
            self.error += u"Exception doing convert_pdf_to_txt!"
            logger.info(self.error)
            pass

        logger.info(
            u"scrape returning {} with scrape_version: {}, license {}".format(
                self.url, self.scrape_version, self.scrape_license))
Example #18
    def scrape_green(self):
        # handle these special cases, where we compute the pdf rather than looking for it
        if "oai:arXiv.org" in self.pmh_id:
            self.scrape_metadata_url = self.url
            self.scrape_pdf_url = self.url.replace("abs", "pdf")

        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # https://ink.library.smu.edu.sg/do/oai/
        if self.endpoint and self.endpoint.id == 'ys9xnlw27yogrfsecedx' and u'ink.library.smu.edu.sg' in self.url:
            if u'viewcontent.cgi?' in self.url:
                return
            if self.pmh_record and find_normalized_license(self.pmh_record.license):
                self.scrape_metadata_url = self.url
                self.set_version_and_license()
                return

        if not self.scrape_pdf_url or not self.scrape_version:
            with PmhRepoWebpage(url=self.url, scraped_pdf_url=self.scrape_pdf_url, repo_id=self.repo_id) as my_webpage:
                if not self.scrape_pdf_url:
                    my_webpage.scrape_for_fulltext_link()
                    self.error += my_webpage.error
                    if my_webpage.is_open:
                        logger.info(u"** found an open copy! {}".format(my_webpage.fulltext_url))
                        self.scrape_updated = datetime.datetime.utcnow().isoformat()
                        self.scrape_metadata_url = self.url
                        if my_webpage.scraped_pdf_url:
                            self.scrape_pdf_url = my_webpage.scraped_pdf_url
                        if my_webpage.scraped_open_metadata_url:
                            self.scrape_metadata_url = my_webpage.scraped_open_metadata_url
                        if my_webpage.scraped_license:
                            self.scrape_license = my_webpage.scraped_license
                        if my_webpage.scraped_version:
                            self.scrape_version = my_webpage.scraped_version
                if self.scrape_pdf_url and not self.scrape_version:
                    self.set_version_and_license(r=my_webpage.r)

        if self.scrape_pdf_url and not self.scrape_version:
            with PmhRepoWebpage(url=self.url, scraped_pdf_url=self.scrape_pdf_url, repo_id=self.repo_id) as my_webpage:
                my_webpage.set_r_for_pdf()
                self.set_version_and_license(r=my_webpage.r)

        if self.is_open and not self.scrape_version:
            self.scrape_version = self.default_version()

        # associate certain landing page URLs with PDFs
        # https://repository.uantwerpen.be
        if self.endpoint and self.endpoint.id == 'mmv3envg3kaaztya9tmo':
            if self.scrape_pdf_url and self.scrape_pdf_url == self.scrape_metadata_url and self.pmh_record:
                logger.info(u'looking for landing page for {}'.format(self.scrape_pdf_url))
                landing_urls = [u for u in self.pmh_record.urls if u'hdl.handle.net' in u]
                if len(landing_urls) == 1:
                    logger.info(u'trying landing page {}'.format(landing_urls[0]))

                    try:
                        if http_get(landing_urls[0]).status_code == 200:
                            self.scrape_metadata_url = landing_urls[0]
                    except:
                        pass

                    if self.scrape_metadata_url:
                        logger.info(u'set landing page {}'.format(self.scrape_metadata_url))

        # https://lirias.kuleuven.be
        if (self.endpoint
            and self.endpoint.id == 'ycf3gzxeiyuw3jqwjmx3'
            and self.scrape_pdf_url == self.scrape_metadata_url
            and self.scrape_pdf_url and 'lirias.kuleuven.be' in self.scrape_pdf_url
        ):
            if self.pmh_record and self.pmh_record.bare_pmh_id and 'oai:lirias2repo.kuleuven.be:' in self.pmh_record.bare_pmh_id:
                self.scrape_metadata_url = 'https://lirias.kuleuven.be/handle/{}'.format(
                    self.pmh_record.bare_pmh_id.replace('oai:lirias2repo.kuleuven.be:', '')
                )
Example #19
    def scrape_for_fulltext_link(self):
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"europepmc.org",
            u"/europepmc/",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(
                    u"not scraping {} because is on our do not scrape list.".
                    format(url))
                return

        try:
            self.r = http_get(url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"is not a PDF for {}.  continuing more checks".format(
                            url))

            # now before reading the content, bail it too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            if page and u"osf-cookie" in unicode(page, "utf-8"):
                pdf_download_link = DuckLink(u"{}/download".format(url),
                                             "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"found a PDF download link: {} {} [{}]".format(
                            pdf_download_link.href, pdf_download_link.anchor,
                            url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(
                        u"checking to see the PDF link actually gets a PDF [{}]"
                        .format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because would rather get a pdfs
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except Exception as e:
            self.error += u"ERROR: Exception error on in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self
Example #20
    def scrape_for_fulltext_link(self):
        landing_url = self.url

        if DEBUG_SCRAPING:
            logger.info(
                u"checking to see if {} says it is open".format(landing_url))

        start = time()
        try:
            self.r = http_get(landing_url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                        self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
                return

            # example 10.1007/978-3-642-01445-1
            if u"crossref.org/_deleted-doi/" in self.r.url:
                logger.info(u"this is a deleted doi")
                return

            # if our landing_url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(
                        u"this is a PDF. success! [{}]".format(landing_url))
                self.scraped_pdf_url = landing_url
                self.open_version_source_string = "open (via free pdf)"
                # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"landing page is not a PDF for {}.  continuing more checks"
                        .format(landing_url))

            # get the HTML tree
            page = self.r.content_small()

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via free pdf)"

            # now look and see if it is not just free, but open!
            license_patterns = [
                u"(creativecommons.org\/licenses\/[a-z\-]+)",
                u"distributed under the terms (.*) which permits",
                u"This is an open access article under the terms (.*) which permits",
                u"This is an open access article published under (.*) which permits",
                u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
            ]
            for pattern in license_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if matches:
                    self.scraped_license = find_normalized_license(matches[0])
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via page says license)"

            says_open_url_snippet_patterns = [
                ("projecteuclid.org/",
                 u'<strong>Full-text: Open access</strong>'),
            ]
            for (url_snippet, pattern) in says_open_url_snippet_patterns:
                matches = re.findall(pattern, self.r.content_small(),
                                     re.IGNORECASE)
                if url_snippet in self.r.request.url.lower() and matches:
                    self.scraped_open_metadata_url = self.r.request.url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            says_open_access_patterns = [
                ("Informa UK Limited", u"/accessOA.png"),
                ("Oxford University Press (OUP)",
                 u"<i class='icon-availability_open'"),
                ("Institute of Electrical and Electronics Engineers (IEEE)",
                 ur'"isOpenAccess":true'),
                ("Institute of Electrical and Electronics Engineers (IEEE)",
                 ur'"openAccessFlag":"yes"'),
                ("Informa UK Limited", u"/accessOA.png"),
                ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
                ("Cambridge University Press (CUP)",
                 u'<span class="icon access open-access cursorDefault">'),
            ]
            for (publisher, pattern) in says_open_access_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
                if self.is_same_publisher(publisher) and matches:
                    self.scraped_license = "implied-oa"
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"

            if self.is_open:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"we've decided this is open! took {} seconds [{}]".
                        format(elapsed(start), landing_url))
                return True
            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"we've decided this doesn't say open. took {} seconds [{}]"
                        .format(elapsed(start), landing_url))
                return False
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
            logger.info(self.error)
            return False
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return False
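
A small self-contained sketch of the license-pattern scan used in the example above. The regexes are copied from the snippet; normalize_license_stub is a made-up stand-in for the project's find_normalized_license helper, only there to make the sketch runnable:

import re

LICENSE_PATTERNS = [
    u"(creativecommons.org/licenses/[a-z\-]+)",
    u"distributed under the terms (.*) which permits",
]

def normalize_license_stub(freetext):
    # stand-in for find_normalized_license: map free text to a short license key
    if u"creativecommons.org/licenses/by" in freetext:
        return u"cc-by"
    return None

def scrape_license(page):
    # return the first normalized license hit, like the pattern loop in the example above
    for pattern in LICENSE_PATTERNS:
        matches = re.findall(pattern, page, re.IGNORECASE)
        if matches:
            return normalize_license_stub(matches[0])
    return None

print(scrape_license(u'<a href="https://creativecommons.org/licenses/by/4.0/">CC BY 4.0</a>'))
# -> cc-by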
Example #21
0
def scrape_for_fulltext_link(url):
    if DEBUG_SCRAPING:
        print u"getting URL: {}".format(url)

    license = "unknown"
    is_journal = is_doi_url(url) or (u"/doi/" in url)

    if u"ncbi.nlm.nih.gov" in url:
        print u"not scraping {} because it is on our do not scrape list.".format(
            url)
        if "ncbi.nlm.nih.gov/pmc/articles/PMC" in url:
            # pmc has fulltext
            return (url, license)
        else:
            # is an nlm page but not a pmc page, so is not full text
            return (None, license)

    if DEBUG_SCRAPING:
        print u"in scrape_for_fulltext_link"

    with closing(http_get(url, stream=True, timeout=10)) as r:

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if resp_is_pdf(r):
            if DEBUG_SCRAPING:
                print u"the head says this is a PDF. success! [{}]".format(url)
            return (url, license)
        else:
            if DEBUG_SCRAPING:
                print u"head says not a PDF.  continuing more checks"

        # get the HTML tree
        page = r.content
        license = find_normalized_license(page)

        # if they are linking to a .docx or similar, this is open.
        # this only works for repos... a ".doc" in a journal is not the article. example:
        # = closed journal http://doi.org/10.1007/s10822-012-9571-0
        if not is_journal:
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    print u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link, r.url), url)
                return (url, license)

        pdf_download_link = find_pdf_link(page, url)
        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                print u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url)

            pdf_url = get_link_target(pdf_download_link, r.url)
            if is_journal:
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    print u"this is a journal. checking to see the PDF link actually gets a PDF [{}]".format(
                        url)
                if gets_a_pdf(pdf_download_link, r.url):
                    return (pdf_url, license)
            else:
                return (pdf_url, license)

    if license != "unknown":
        # = open 10.1136/bmj.i2716 cc-by
        # = open 10.1136/bmj.i1209 cc-by-nc
        # print "FOUND A LICENSE!", license, url
        return (None, license)

    if DEBUG_SCRAPING:
        print u"found no PDF download link [{}]".format(url)
    return (None, license)
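
A hedged usage sketch for the function above: it returns a (fulltext_url, license) tuple, so a caller unpacks both and treats None as "no fulltext found". It assumes the function and its helpers are importable; the handle URL is just the placeholder already cited in the comments:

fulltext_url, license = scrape_for_fulltext_link(u"http://hdl.handle.net/2060/20140010374")
if fulltext_url is not None:
    print(u"found fulltext at {}".format(fulltext_url))
elif license != "unknown":
    print(u"no fulltext link, but the page carries license: {}".format(license))
else:
    print(u"looks closed, as far as this scrape can tell")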
Example #22
0
    def set_version_and_license(self, r=None):
        self.updated = datetime.datetime.utcnow().isoformat()

        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # set as default
        self.scrape_version = self.default_version()

        is_updated = self.update_with_local_info()

        # now try to see what we can get out of the pdf itself
        version_is_from_strict_metadata = self.pmh_record and self.pmh_record.api_raw and re.compile(
            ur"<dc:type>{}</dc:type>".format(self.scrape_version), re.IGNORECASE | re.MULTILINE | re.DOTALL
        ).findall(self.pmh_record.api_raw)

        if version_is_from_strict_metadata or not r:
            logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
            return

        try:
            # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
            if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
                self.scrape_version = "publishedVersion"

            text = convert_pdf_to_txt(r, max_pages=25)
            # logger.info(text)

            if text and self.scrape_version != "publishedVersion" and not version_is_from_strict_metadata:
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                    re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                    re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"This article is brought to you for free and open access by Works.", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    ]

                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                        self.scrape_version = "publishedVersion"

            if text and self.scrape_version != 'acceptedVersion':
                patterns = [
                    re.compile(ur'This is a post-peer-review, pre-copyedit version', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur'This is the peer reviewed version of the following article', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur'The present manuscript as of \d\d \w+ \d\d\d\d has been accepted', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur'Post-peer-review, pre-copyedit version of accepted manuscript', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                ]

                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(u'found {}, decided PDF is accepted version'.format(pattern.pattern))
                        self.scrape_version = "acceptedVersion"

                if r and r.url and '61RMIT_INST' in r.url:
                    if 'Version: Accepted' in text:
                        logger.info(u'found Version: Accepted, decided PDF is accepted version')
                        self.scrape_version = "acceptedVersion"

                heading_text = text[0:50].lower()
                accepted_headings = [
                    "final accepted version",
                    "accepted manuscript",
                ]

                for heading in accepted_headings:
                    if heading in heading_text:
                        logger.info(u'found {} in heading, decided PDF is accepted version'.format(heading))
                        self.scrape_version = "acceptedVersion"
                        break

            if not self.scrape_license:
                open_license = find_normalized_license(text)
                if open_license:
                    logger.info(u'found license in PDF: {}'.format(open_license))
                    self.scrape_license = open_license

        except Exception as e:
            logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
            self.error += u"Exception doing convert_pdf_to_txt!"
            logger.info(self.error)

        if self.pmh_record:
            self.scrape_version = _scrape_version_override().get(self.pmh_record.bare_pmh_id, self.scrape_version)

        logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
    def scrape_for_fulltext_link(self):
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(
                    u"not scraping {} because it is on our do not scrape list.".
                    format(url))
                return

        try:
            with closing(
                    http_get(url,
                             stream=True,
                             related_pub=self.related_pub,
                             ask_slowly=self.ask_slowly)) as self.r:

                if self.r.status_code != 200:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                    return

                # if our url redirects to a pdf, we're done.
                # = open repo http://hdl.handle.net/2060/20140010374
                if self.is_a_pdf_page():
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"this is a PDF. success! [{}]".format(url))
                    self.scraped_pdf_url = url
                    return

                else:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"is not a PDF for {}.  continuing more checks".
                            format(url))

                # now before reading the content, bail if it's too large
                if is_response_too_large(self.r):
                    logger.info(u"landing page is too large, skipping")
                    return

                # get the HTML tree
                page = self.r.content

                # set the license if we can find one
                scraped_license = find_normalized_license(page)
                if scraped_license:
                    self.scraped_license = scraped_license

                # special exception for citeseer because we want the pdf link where
                # the copy is on the third party repo, not the cached link, if we can get it
                if u"citeseerx.ist.psu.edu/" in url:
                    matches = re.findall(
                        u'<h3>Download Links</h3>.*?href="(.*?)"', page,
                        re.DOTALL)
                    if matches:
                        self.scraped_pdf_url = unicode(matches[0], "utf-8")
                        self.scraped_open_metadata_url = url
                        return

                pdf_download_link = self.find_pdf_link(page)
                if pdf_download_link is not None:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"found a PDF download link: {} {} [{}]".format(
                                pdf_download_link.href,
                                pdf_download_link.anchor, url))

                    pdf_url = get_link_target(pdf_download_link.href,
                                              self.r.url)
                    # if they are linking to a PDF, we need to follow the link to make sure it's legit
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"checking to see the PDF link actually gets a PDF [{}]"
                            .format(url))
                    if self.gets_a_pdf(pdf_download_link, self.r.url):
                        self.scraped_pdf_url = pdf_url
                        self.scraped_open_metadata_url = url
                        return

                # try this later because we would rather get a pdf
                # if they are linking to a .docx or similar, this is open.
                doc_link = find_doc_download_link(page)
                if doc_link is not None:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"found a .doc download link {} [{}]".format(
                                get_link_target(doc_link.href, self.r.url),
                                url))
                    self.scraped_open_metadata_url = url
                    return

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self
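
The published-vs-accepted heuristics in set_version_and_license above can be reduced to one standalone function over extracted PDF text. This is only a sketch: classify_version is a hypothetical name, and the patterns are a subset of the ones in the example, applied in the same order (accepted markers can override published markers):

import re

PUBLISHED_VERSION_PATTERNS = [
    re.compile(u"copyright.{0,6}\d{4}", re.IGNORECASE),
    re.compile(u"all rights reserved", re.IGNORECASE),
    re.compile(u"this is an open access article", re.IGNORECASE),
]

ACCEPTED_VERSION_PATTERNS = [
    re.compile(u"this is a post-peer-review, pre-copyedit version", re.IGNORECASE),
    re.compile(u"the present manuscript as of \d\d \w+ \d{4} has been accepted", re.IGNORECASE),
]

def classify_version(text, default=u"submittedVersion"):
    # published markers first, accepted markers last, mirroring the example above
    version = default
    if any(p.search(text) for p in PUBLISHED_VERSION_PATTERNS):
        version = u"publishedVersion"
    if any(p.search(text) for p in ACCEPTED_VERSION_PATTERNS):
        version = u"acceptedVersion"
    return version

print(classify_version(u"(C) 2016 Example Press. All rights reserved."))
# -> publishedVersion
print(classify_version(u"This is a post-peer-review, pre-copyedit version of an article."))
# -> acceptedVersion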
Example #24
0
def call_base(products):
    if not products:
        # print "empty product list so not calling base"
        return

    titles = []
    # may be more than one product for a given title, so is a dict of lists
    titles_to_products = defaultdict(list)

    for p in products:
        p.license_string = ""
        p.base_dcoa = None
        p.repo_urls = {"urls": []}

        title = p.best_title
        titles_to_products[normalize(title)].append(p)

        title = title.lower()
        # can't just replace all punctuation because ' replaced with ? gets no hits
        title = title.replace('"', "?")
        title = title.replace('#', "?")
        title = title.replace('=', "?")
        title = title.replace('&', "?")
        title = title.replace('%', "?")
        title = title.replace('-', "*")

        # only bother looking up titles that are at least 3 words long
        title_words = title.split()
        if len(title_words) >= 3:
            # only look up the first 12 words
            title_to_query = u" ".join(title_words[0:12])
            titles.append(title_to_query)

    # now do the lookup in base
    titles_string = u"%20OR%20".join(
        [u'%22{}%22'.format(title) for title in titles])
    # print u"{}: calling base with query string of length {}, utf8 bits {}".format(self.id, len(titles_string), 8*len(titles_string.encode('utf-8')))
    url_template = u"https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi?func=PerformSearch&query=(dcoa:1%20OR%20dcoa:2)%20AND%20dctitle:({titles_string})&fields=dctitle,dccreator,dcyear,dcrights,dcprovider,dcidentifier,dcoa,dclink&hits=100000&format=json"
    url = url_template.format(titles_string=titles_string)

    # print u"calling base with {}".format(url)

    start_time = time()
    proxy_url = os.getenv("STATIC_IP_PROXY")
    proxies = {"https": proxy_url}
    r = None
    try:
        r = requests.get(url, proxies=proxies, timeout=6)
        # print u"** querying with {} titles took {}s".format(len(titles), elapsed(start_time))
    except requests.exceptions.ConnectionError:
        print u"connection error in set_fulltext_urls, skipping."
    except requests.Timeout:
        print u"timeout error in set_fulltext_urls, skipping."

    if r is not None and r.status_code != 200:
        print u"problem searching base! status_code={}".format(r.status_code)
        for p in products:
            p.base_dcoa = u"base query error: status_code={}".format(
                r.status_code)

    else:
        try:
            data = r.json()["response"]
            # print "number found:", data["numFound"]
            for doc in data["docs"]:
                base_dcoa = str(doc["dcoa"])
                try:
                    # print "normalize(doc['dctitle'])", normalize(doc["dctitle"]), doc["dctitle"], doc["dcidentifier"]
                    # print "titles", titles
                    matching_products = titles_to_products[normalize(
                        doc["dctitle"])]
                except KeyError:
                    matching_products = []
                for p in matching_products:

                    if base_dcoa == "1":
                        # got a 1 hit.  yay!  overwrite no matter what.
                        if p.fulltext_url:
                            urls_to_choose_from = [p.fulltext_url
                                                   ] + doc["dcidentifier"]
                        else:
                            urls_to_choose_from = doc["dcidentifier"]
                        # print "urls_to_choose_from", urls_to_choose_from
                        p.fulltext_url = pick_best_base_url(
                            urls_to_choose_from)
                        p.evidence = "oa repository (via base-search.net oa url)"
                        p.repo_urls["urls"] = {}
                        p.base_dcoa = base_dcoa
                        if "dcrights" in doc:
                            p.license_string += u"{};".format(doc["dcrights"])

                    elif base_dcoa == "2" and p.base_dcoa != "1":
                        # got a 2 hit.  use only if we don't already have a 1.
                        p.repo_urls["urls"] += doc["dcidentifier"]
                        p.base_dcoa = base_dcoa

        except ValueError:  # includes simplejson.decoder.JSONDecodeError
            print u'decoding JSON failed for base response'
            for p in products:
                p.base_dcoa = u"base lookup error: json response parsing"
        except AttributeError:  # no json
            # print u"no hit with title {}".format(doc["dctitle"])
            # print u"normalized: {}".format(normalize(doc["dctitle"]))
            pass

    for p in products:
        # sort each product's repo urls by base_url_sort_score
        if p.repo_urls["urls"]:
            p.repo_urls["urls"] = sorted(p.repo_urls["urls"],
                                         key=lambda x: base_url_sort_score(x))
        if p.license_string:
            p.license = oa_local.find_normalized_license(p.license_string)
        if p.best_title and (normalize(p.best_title) in BASE_RESULT_OVERRIDE):
            p.fulltext_url = BASE_RESULT_OVERRIDE[normalize(p.best_title)]

    print u"finished base step of set_fulltext_urls with {} titles in {}s".format(
        len(titles_to_products), elapsed(start_time, 2))
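
A compact sketch of just the title-to-query step in call_base above: punctuation that trips up the BASE query parser is swapped for single-character wildcards, hyphens become *, titles shorter than three words are skipped, the rest are capped at twelve words, and everything is joined into one URL-encoded OR clause. build_base_query is an illustrative name, not a function from the project:

def build_base_query(titles):
    query_titles = []
    for title in titles:
        title = title.lower()
        # can't replace every punctuation mark; these are the ones known to break hits
        for char in u'"#=&%':
            title = title.replace(char, u"?")
        title = title.replace(u"-", u"*")
        words = title.split()
        if len(words) >= 3:
            query_titles.append(u" ".join(words[0:12]))
    return u"%20OR%20".join([u'%22{}%22'.format(t) for t in query_titles])

print(build_base_query([u"Deep learning for open-access discovery", u"too short"]))
# -> %22deep learning for open*access discovery%22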
Example #25
0
    def scrape_for_fulltext_link(self):
        url = self.url
        is_journal = u"/doi/" in url or u"10." in url

        if DEBUG_SCRAPING:
            print u"in scrape_for_fulltext_link, getting URL: {}".format(url)

        if u"ncbi.nlm.nih.gov" in url:
            print u"not scraping {} because it is on our do not scrape list.".format(
                url)
            if "ncbi.nlm.nih.gov/pmc/articles/PMC" in url:
                # pmc has fulltext
                self.scraped_open_metadata_url = url
                pmcid_matches = re.findall(".*(PMC\d+).*", url)
                if pmcid_matches:
                    pmcid = pmcid_matches[0]
                    self.scraped_pdf_url = u"https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf".format(
                        pmcid)
                # pmc is on the do-not-scrape list, so don't scrape any further
                return
            else:
                # is an nlm page but not a pmc page, so is not full text
                return

        try:
            with closing(
                    http_get(url, stream=True, read_timeout=10,
                             doi=self.doi)) as r:

                if is_response_too_large(r):
                    print "landing page is too large, skipping"
                    return

                # if our url redirects to a pdf, we're done.
                # = open repo http://hdl.handle.net/2060/20140010374
                if resp_is_pdf_from_header(r):

                    if DEBUG_SCRAPING:
                        print u"the head says this is a PDF. success! [{}]".format(
                            url)
                    self.scraped_pdf_url = url
                    return

                else:
                    if DEBUG_SCRAPING:
                        print u"head says not a PDF for {}.  continuing more checks".format(
                            url)

                # get the HTML tree
                page = r.content

                # set the license if we can find one
                scraped_license = find_normalized_license(page)
                if scraped_license:
                    self.scraped_license = scraped_license

                pdf_download_link = find_pdf_link(page, url)
                if pdf_download_link is not None:
                    if DEBUG_SCRAPING:
                        print u"found a PDF download link: {} {} [{}]".format(
                            pdf_download_link.href, pdf_download_link.anchor,
                            url)

                    pdf_url = get_link_target(pdf_download_link, r.url)
                    if is_journal:
                        # if they are linking to a PDF, we need to follow the link to make sure it's legit
                        if DEBUG_SCRAPING:
                            print u"this is a journal. checking to see the PDF link actually gets a PDF [{}]".format(
                                url)
                        if gets_a_pdf(pdf_download_link, r.url, self.doi):
                            self.scraped_pdf_url = pdf_url
                            self.scraped_open_metadata_url = url
                            return
                    else:
                        self.scraped_pdf_url = pdf_url
                        self.scraped_open_metadata_url = url
                        return

                # try this later because we would rather get a pdf
                # if they are linking to a .docx or similar, this is open.
                # this only works for repos... a ".doc" in a journal is not the article. example:
                # = closed journal http://doi.org/10.1007/s10822-012-9571-0
                if not is_journal:
                    doc_link = find_doc_download_link(page)
                    if doc_link is not None:
                        if DEBUG_SCRAPING:
                            print u"found a .doc download link {} [{}]".format(
                                get_link_target(doc_link, r.url), url)
                        self.scraped_open_metadata_url = url
                        return

        except requests.exceptions.ConnectionError:
            print u"ERROR: connection error on {} in scrape_for_fulltext_link, skipping.".format(
                url)
            return
        except requests.Timeout:
            print u"ERROR: timeout error on {} in scrape_for_fulltext_link, skipping.".format(
                url)
            return
        except requests.exceptions.InvalidSchema:
            print u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link, skipping.".format(
                url)
            return
        except requests.exceptions.RequestException as e:
            print u"ERROR: RequestException error on {} in scrape_for_fulltext_link, skipping.".format(
                url)
            return

        if DEBUG_SCRAPING:
            print u"found no PDF download link.  end of the line. [{}]".format(
                url)

        return self
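
The PMC branch near the top of the example above can be exercised on its own. A sketch, with pmc_pdf_url as a made-up helper name and an arbitrary PMC id, showing how the PMCID is pulled from the URL and turned into a guessed PDF link:

import re

def pmc_pdf_url(url):
    # same idea as the example: grab the PMC id, then build the /pdf address
    pmcid_matches = re.findall(u"(PMC\d+)", url)
    if pmcid_matches:
        return u"https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf".format(pmcid_matches[0])
    return None

print(pmc_pdf_url(u"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3256379"))
# -> https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3256379/pdf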
Example #26
0
    def scrape_for_fulltext_link(self):
        landing_url = self.url

        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} says it is open".format(landing_url))

        start = time()
        try:
            self.r = http_get(landing_url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)
            resolved_landing_url = self.r.url

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
                return

            # example 10.1007/978-3-642-01445-1
            if u"crossref.org/_deleted-doi/" in self.r.url:
                logger.info(u"this is a deleted doi")
                return

            # if our landing_url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(landing_url))
                self.scraped_pdf_url = landing_url
                self.open_version_source_string = "open (via free pdf)"
                # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(u"landing page is not a PDF for {}.  continuing more checks".format(landing_url))

            # get the HTML tree
            page = self.r.content_small()

            # remove script tags
            try:
                soup = BeautifulSoup(page, 'html.parser')
                [script.extract() for script in soup('script')]
                page = str(soup)
            except HTMLParseError as e:
                logger.error(u'error parsing html, skipped script removal: {}'.format(e))

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via free pdf)"

            # now look and see if it is not just free, but open!
            says_open_url_snippet_patterns = [
                ('projecteuclid.org/', u'<strong>Full-text: Open access</strong>'),
                ('sciencedirect.com/', u'<div class="OpenAccessLabel">open access</div>'),
                ('sciencedirect.com/', u'<div class="OpenAccessLabel">open archive</div>'),
            ]

            for (url_snippet, pattern) in says_open_url_snippet_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if url_snippet in resolved_landing_url.lower() and matches:
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            says_open_access_patterns = [
                ("Informa UK Limited", u"/accessOA.png"),
                ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
                ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
                ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
                ("Informa UK Limited", u"/accessOA.png"),
                ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
                ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
            ]
            for (publisher, pattern) in says_open_access_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
                if self.is_same_publisher(publisher) and matches:
                    self.scraped_license = "implied-oa"
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"

            license_patterns = [
                ur"(creativecommons.org/licenses/[a-z\-]+)",
                u"distributed under the terms (.*) which permits",
                u"This is an open access article under the terms (.*) which permits",
                u"This is an open access article published under (.*) which permits",
                u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
            ]

            for pattern in license_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if matches:
                    self.scraped_license = find_normalized_license(matches[0])
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via page says license)"

            if self.is_open:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                        elapsed(start), landing_url))
                return True
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                        elapsed(start), landing_url))
                return False
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
            logger.info(self.error)
            return False
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return False
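
The script-stripping step above (also used in later examples) is easy to check in isolation. A minimal sketch, requiring only beautifulsoup4, that removes script tags before any license regexes run so javascript blobs can't produce false matches:

from bs4 import BeautifulSoup

def strip_scripts(page):
    # drop every <script> element, keep the rest of the markup
    soup = BeautifulSoup(page, "html.parser")
    for script in soup("script"):
        script.extract()
    return str(soup)

html = u'<html><script>var fakeLicense = "cc-by";</script><p>This is an open access article.</p></html>'
print(strip_scripts(html))
# -> <html><p>This is an open access article.</p></html>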
Example #27
0
    def scrape_for_fulltext_link(self):
        url = self.url

        dont_scrape_list = [
                u"ncbi.nlm.nih.gov",
                u"europepmc.org",
                u"/europepmc/",
                u"pubmed",
                u"elar.rsvpu.ru",  #these ones based on complaint in email
                u"elib.uraic.ru",
                u"elar.usfeu.ru",
                u"elar.urfu.ru",
                u"elar.uspu.ru"]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(u"not scraping {} because it is on our do not scrape list.".format(url))
                return

        try:
            self.r = http_get(url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(u"is not a PDF for {}.  continuing more checks".format(url))

            # now before reading the content, bail if it's too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None
            # special exception for citeseer because we want the pdf link where
            # the copy is on the third party repo, not the cached link, if we can get it
            if url and u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
                if matches:
                    pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
                pdf_download_link = DuckLink(u"{}/download".format(url), "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a pdf
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(u"found no PDF download link.  end of the line. [{}]".format(url))

        return self
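
The citeseer special case above boils down to one regex over the landing page. A standalone sketch of that extraction; the HTML fragment is invented, but it has the same shape as the pages the regex targets:

import re

page = u'<h3>Download Links</h3><ul><li><a href="http://repo.example.org/paper.pdf">[PDF]</a></li></ul>'
matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
if matches:
    print(matches[0])
# -> http://repo.example.org/paper.pdf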
Example #28
0
    def scrape_for_fulltext_link(self, find_pdf_link=True):
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"europepmc.org",
            u"/europepmc/",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(
                    u"not scraping {} because it is on our do not scrape list.".
                    format(url))
                return

        try:
            self.r = http_get(url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)
            resolved_url = self.r.url

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"is not a PDF for {}.  continuing more checks".format(
                            url))

            if is_a_word_doc(self.r):
                if DEBUG_SCRAPING:
                    logger.info(
                        u"this is a word doc. success! [{}]".format(url))
                self.scraped_open_metadata_url = url
                return

            # now before reading the content, bail if it's too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # remove script tags
            try:
                soup = BeautifulSoup(page, 'html.parser')
                [script.extract() for script in soup('script')]
                page = str(soup)
            except HTMLParseError as e:
                logger.error(
                    u'error parsing html, skipped script removal: {}'.format(
                        e))

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None
            # special exception for citeseer because we want the pdf link where
            # the copy is on the third party repo, not the cached link, if we can get it
            if url and u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"',
                                     page, re.DOTALL)
                if matches:
                    pdf_download_link = DuckLink(unicode(matches[0], "utf-8"),
                                                 "download")

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            elif page and u"osf-cookie" in unicode(
                    page, "utf-8", errors='replace'):
                pdf_download_link = DuckLink(u"{}/download".format(url),
                                             "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"found a PDF download link: {} {} [{}]".format(
                            pdf_download_link.href, pdf_download_link.anchor,
                            url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(
                        u"checking to see the PDF link actually gets a PDF [{}]"
                        .format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a pdf
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)

            if doc_link is not None:
                absolute_doc_url = get_link_target(doc_link.href, resolved_url)
                if DEBUG_SCRAPING:
                    logger.info(
                        u"found a possible .doc download link [{}]".format(
                            absolute_doc_url))
                if self.gets_a_word_doc(doc_link, self.r.url):
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"we've decided this is a word doc. [{}]".format(
                                absolute_doc_url))
                    self.scraped_open_metadata_url = url
                    return
                else:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"we've decided this ain't a word doc. [{}]".
                            format(absolute_doc_url))

            bhl_link = find_bhl_view_link(resolved_url, page)
            if bhl_link is not None:
                logger.info('found a BHL document link: {}'.format(
                    get_link_target(bhl_link.href, resolved_url)))
                self.scraped_open_metadata_url = url
                return

            if _trust_repo_license(resolved_url) and self.scraped_license:
                logger.info(u'trusting license {}'.format(
                    self.scraped_license))
                self.scraped_open_metadata_url = self.url

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self
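
get_link_target itself is not shown in any of these snippets; from the way it is called it presumably resolves a link's href against the final response URL, roughly like urljoin. A stand-in sketch under that assumption, where resolve_link_target and the URLs are placeholders:

try:
    from urlparse import urljoin        # python 2
except ImportError:
    from urllib.parse import urljoin    # python 3

def resolve_link_target(href, response_url):
    # assumption: behaves like the project's get_link_target for simple cases
    return urljoin(response_url, href)

print(resolve_link_target(u"/bitstream/1721.1/102417/1/paper.pdf",
                          u"https://dspace.example.edu/handle/1721.1/102417"))
# -> https://dspace.example.edu/bitstream/1721.1/102417/1/paper.pdf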