Example #1
    def is_bronze(self):
        if self.display_evidence == 'open (via free pdf)':
            return True

        if is_doi_url(self.best_url):
            return clean_doi(self.best_url) == self.doi and not (self.is_gold or self.is_hybrid)

        return False
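The examples in this listing all lean on the same helpers. A minimal sketch of the contract they appear to assume (the names come from the examples; the bodies below are assumptions, not the real implementations):

import re

class NoDoiException(Exception):
    pass

def is_doi_url(url):
    # assumption: a DOI URL is anything pointing at a doi.org resolver
    return bool(url) and bool(re.search(r"(dx\.)?doi\.org/", url))

def clean_doi(url, return_none_if_error=False):
    # assumption: extract the bare DOI (10.prefix/suffix) from a string,
    # raising NoDoiException (or returning None) when none is found
    match = re.search(r"(10\.\d{4,9}/\S+)", url or u"")
    if not match:
        if return_none_if_error:
            return None
        raise NoDoiException(u"no DOI found in {!r}".format(url))
    return match.group(1).lower()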
Example #2
    def populate(self, pmh_input_record):
        self.updated = datetime.datetime.utcnow().isoformat()
        self.id = pmh_input_record.header.identifier
        self.api_raw = pmh_input_record.raw
        self.record_timestamp = pmh_input_record.header.datestamp
        self.title = oai_tag_match("title", pmh_input_record)
        self.authors = oai_tag_match("creator",
                                     pmh_input_record,
                                     return_list=True)
        self.relations = oai_tag_match("relation",
                                       pmh_input_record,
                                       return_list=True)
        self.oa = oai_tag_match("oa", pmh_input_record)
        self.license = oai_tag_match("rights", pmh_input_record)
        self.sources = oai_tag_match("collname",
                                     pmh_input_record,
                                     return_list=True)
        identifier_matches = oai_tag_match("identifier",
                                           pmh_input_record,
                                           return_list=True)
        self.urls = self.get_good_urls(identifier_matches)
        if not self.urls:
            self.urls = self.get_good_urls(self.relations)

        possible_dois = []

        if self.relations:
            possible_dois += [
                s for s in self.relations if s and '/*ref*/' not in s
            ]
        if identifier_matches:
            possible_dois += [s for s in identifier_matches if s]

        if possible_dois:
            for possible_doi in possible_dois:
                if (is_doi_url(possible_doi)
                        or possible_doi.startswith(u"doi:")
                        or re.findall(ur"10\.\d", possible_doi)):
                    try:
                        doi_candidate = clean_doi(possible_doi)

                        skip_these_doi_snippets = [
                            u'10.17605/osf.io',
                            u'10.14279/depositonce',
                            u'/(issn)',
                            u'10.17169/refubium',
                        ]
                        for doi_snippet in skip_these_doi_snippets:
                            if doi_snippet in doi_candidate:
                                doi_candidate = None
                                break

                        if doi_candidate:
                            self.doi = doi_candidate
                    except NoDoiException:
                        pass
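The skip_these_doi_snippets filter above is a plain substring test; an isolated illustration with hypothetical candidate strings:

skip_these_doi_snippets = [u'10.17605/osf.io', u'/(issn)']
candidates = [u'10.17605/osf.io/abcde', u'10.1234/real.doi']
kept = [c for c in candidates
        if not any(snippet in c for snippet in skip_these_doi_snippets)]
print kept  # [u'10.1234/real.doi']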
Example #3
    def is_bronze(self):
        if self.best_url and not (self.is_gold
                                  or self.is_green) and not self.has_license:
            return True

        if is_doi_url(self.best_url):
            return (clean_doi(self.best_url, return_none_if_error=True)
                    == self.doi and not (self.is_gold or self.is_hybrid))

        return False
Example #4
    def is_hybrid(self):
        if self.display_evidence and self.display_evidence.startswith("open"):
            return True

        if is_doi_url(self.best_url):
            if self.is_gold:
                return False
            if clean_doi(self.best_url) == self.doi:
                return True
        return False
Example #5
    def is_bronze(self):
        if self.best_url and not (self.is_gold or
                                  self.is_green) and not self.has_open_license:
            return True

        if is_doi_url(self.best_url):
            url_doi = normalize_doi(self.best_url, return_none_if_error=True)
            unquoted_doi = normalize_doi(unquote(self.best_url),
                                         return_none_if_error=True)

            return (self.doi in (url_doi, unquoted_doi)
                    and not (self.is_gold or self.is_hybrid or self.is_green))

        return False
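Example 5 checks both the raw and the percent-decoded URL because the slash in a DOI is often percent-encoded when the DOI is embedded in a link. A quick illustration (urllib.unquote is Python 2; use urllib.parse.unquote on Python 3):

from urllib import unquote

best_url = u'https://doi.org/10.1234%2Fexample'  # hypothetical URL
print unquote(best_url)  # https://doi.org/10.1234/example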
Example #6
def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns == "DOI":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:
        # try url
        try:
            id_string = str(orcid_product_dict['url']['value'].encode('utf-8')).lower()
            if is_doi_url(id_string):
                doi = clean_doi(id_string)  # throws error unless valid DOI
        except (TypeError, NoDoiException):
            doi = None
    return doi
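A hedged usage sketch: the function reads orcid_product_dict['url']['value'] plus whatever (namespace, id) pairs get_identifiers_from_biblio_dict yields, so a minimal test double is enough to exercise it. The stub below is an assumption for illustration, not the real helper:

def get_identifiers_from_biblio_dict(orcid_product_dict):
    # stub: yield (namespace, id) pairs as the examples above expect
    return [(u"DOI", u"10.1234/example")]

orcid_product_dict = {'url': {'value': u'https://doi.org/10.1234/example'}}
print get_doi_from_biblio_dict(orcid_product_dict)  # 10.1234/example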
Example #7
    def populate(self, pmh_input_record):
        self.updated = datetime.datetime.utcnow().isoformat()
        self.id = pmh_input_record.header.identifier
        self.api_raw = pmh_input_record.raw
        self.record_timestamp = pmh_input_record.header.datestamp
        self.title = oai_tag_match("title", pmh_input_record)
        self.authors = oai_tag_match("creator",
                                     pmh_input_record,
                                     return_list=True)
        self.relations = oai_tag_match("relation",
                                       pmh_input_record,
                                       return_list=True)
        self.oa = oai_tag_match("oa", pmh_input_record)
        self.license = oai_tag_match("rights", pmh_input_record)
        self.sources = oai_tag_match("collname",
                                     pmh_input_record,
                                     return_list=True)
        identifier_matches = oai_tag_match("identifier",
                                           pmh_input_record,
                                           return_list=True)
        self.urls = self.get_good_urls(identifier_matches)
        if not self.urls:
            self.urls = self.get_good_urls(self.relations)

        possible_dois = []
        if identifier_matches:
            possible_dois += [s for s in identifier_matches if s]
        if self.relations:
            possible_dois += [s for s in self.relations if s]
        if possible_dois:
            for possible_doi in possible_dois:
                if (is_doi_url(possible_doi)
                        or possible_doi.startswith(u"doi:")
                        or re.findall(u"10\./d", possible_doi)):
                    try:
                        self.doi = clean_doi(possible_doi)
                        dont_use_these_doi_snippets = [u"10.17605/osf.io"]
                        for doi_snippet in dont_use_these_doi_snippets:
                            if self.doi and doi_snippet in self.doi:
                                self.doi = None
                    except NoDoiException:
                        pass

        self.doi = self._doi_override_by_id().get(self.id, self.doi)
Example #8
def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns == "DOI":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:
        # try url
        try:
            id_string = str(
                orcid_product_dict['url']['value'].encode('utf-8')).lower()
            if is_doi_url(id_string):
                doi = clean_doi(id_string)  # throws error unless valid DOI
        except (TypeError, NoDoiException):
            doi = None
    return doi
Example #9
def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns.lower() == "doi":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:

        # try url
        for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
            try:
                if is_doi_url(nid):
                    doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    return doi
Example #10
    def call_pmh_endpoint(self,
                          first=None,
                          last=None,
                          chunk_size=10,
                          scrape=False):

        args = {}
        args['metadataPrefix'] = 'oai_dc'

        if "citeseerx" in self.pmh_url:
            proxy_url = os.getenv("STATIC_IP_PROXY")
            proxies = {"https": proxy_url, "http": proxy_url}
        else:
            proxies = {}

        my_sickle = MySickle(self.pmh_url, proxies=proxies, timeout=120)
        logger.info(u"connected to sickle with {} {}".format(
            self.pmh_url, proxies))

        args['from'] = first
        if last:
            args["until"] = last

        records_to_save = []

        logger.info(u"calling ListRecords with {} {}".format(
            self.pmh_url, args))
        try:
            pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args)
            logger.info(u"got pmh_records with {} {}".format(
                self.pmh_url, args))
            pmh_input_record = safe_get_next_record(pmh_records)
        except Exception:
            logger.info(u"no records with {} {}".format(self.pmh_url, args))
            pmh_input_record = None

        while pmh_input_record:

            my_pmh_record = pmh_record.PmhRecord()

            my_pmh_record.id = pmh_input_record.header.identifier
            my_pmh_record.api_raw = pmh_input_record.raw
            my_pmh_record.record_timestamp = pmh_input_record.header.datestamp
            my_pmh_record.title = oai_tag_match("title", pmh_input_record)
            my_pmh_record.authors = oai_tag_match("creator",
                                                  pmh_input_record,
                                                  return_list=True)
            my_pmh_record.oa = oai_tag_match("oa", pmh_input_record)
            my_pmh_record.urls = oai_tag_match("identifier",
                                               pmh_input_record,
                                               return_list=True)
            for fulltext_url in my_pmh_record.urls:
                if fulltext_url and (is_doi_url(fulltext_url)
                                     or fulltext_url.startswith(u"doi:")
                                     or re.findall(u"10\.", fulltext_url)):
                    try:
                        my_pmh_record.doi = clean_doi(fulltext_url)
                    except NoDoiException:
                        pass

            my_pmh_record.license = oai_tag_match("rights", pmh_input_record)
            my_pmh_record.relations = oai_tag_match("relation",
                                                    pmh_input_record,
                                                    return_list=True)
            my_pmh_record.sources = oai_tag_match("collname",
                                                  pmh_input_record,
                                                  return_list=True)
            my_pmh_record.source = self.id

            if is_complete(my_pmh_record):
                db.session.merge(my_pmh_record)
                my_pages = my_pmh_record.mint_pages()
                logger.info(u"made {} pages for id {}".format(
                    len(my_pages), my_pmh_record.id))
                for my_page in my_pages:
                    if scrape:
                        logger.info(u"scraping pages")
                        my_page.scrape()
                    db.session.merge(my_page)
                records_to_save.append(my_pmh_record)
                # logger.info(u":")
                logger.info(u"my_pmh_record {}".format(
                    my_pmh_record.get_good_urls()))
            else:
                logger.info(u"not complete")

            if len(records_to_save) >= chunk_size:
                last_record = records_to_save[-1]
                logger.info(u"last record saved: {} for {}".format(
                    last_record.id, self.id))
                safe_commit(db)
                records_to_save = []

            pmh_input_record = safe_get_next_record(pmh_records)

        # make sure to get the last ones
        if records_to_save:
            last_record = records_to_save[-1]
            logger.info(
                u"saving {} last ones, last record saved: {} for {}".format(
                    len(records_to_save), last_record.id, self.id))
            safe_commit(db)
        logger.info(u"done everything for {}".format(self.id))
Example #11
def scrape_for_fulltext_link(url):
    if DEBUG_SCRAPING:
        print u"getting URL: {}".format(url)

    license = "unknown"
    is_journal = is_doi_url(url) or (u"/doi/" in url)

    if u"ncbi.nlm.nih.gov" in url:
        print u"not scraping {} because is on our do not scrape list.".format(
            url)
        if "ncbi.nlm.nih.gov/pmc/articles/PMC" in url:
            # pmc has fulltext
            return (url, license)
        else:
            # is an nlm page but not a pmc page, so is not full text
            return (None, license)

    if DEBUG_SCRAPING:
        print u"in scrape_for_fulltext_link"

    with closing(http_get(url, stream=True, timeout=10)) as r:

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if resp_is_pdf(r):
            if DEBUG_SCRAPING:
                print u"the head says this is a PDF. success! [{}]".format(url)
            return (url, license)
        else:
            if DEBUG_SCRAPING:
                print u"head says not a PDF.  continuing more checks"

        # get the HTML tree
        page = r.content
        license = find_normalized_license(page)

        # if they are linking to a .docx or similar, this is open.
        # this only works for repos... a ".doc" in a journal is not the article. example:
        # = closed journal http://doi.org/10.1007/s10822-012-9571-0
        if not is_journal:
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    print u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link, r.url), url)
                return (url, license)

        pdf_download_link = find_pdf_link(page, url)
        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                print u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url)

            pdf_url = get_link_target(pdf_download_link, r.url)
            if is_journal:
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    print u"this is a journal. checking to see the PDF link actually gets a PDF [{}]".format(
                        url)
                if gets_a_pdf(pdf_download_link, r.url):
                    return (pdf_url, license)
            else:
                return (pdf_url, license)

    if license != "unknown":
        # = open 10.1136/bmj.i2716 cc-by
        # = open 10.1136/bmj.i1209 cc-by-nc
        # print "FOUND A LICENSE!", license, url
        return (None, license)

    if DEBUG_SCRAPING:
        print u"found no PDF download link [{}]".format(url)
    return (None, license)
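resp_is_pdf is not shown here; a plausible implementation checks the Content-Type header of the streamed response, which is why the caller never needs the body. A sketch, assuming a requests-style response object:

def resp_is_pdf(resp):
    # assumption: inspect the Content-Type header instead of downloading
    # the body, so it works on a streamed response
    content_type = resp.headers.get("content-type", u"")
    return u"application/pdf" in content_type.lower()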
Example #12
    def populate(self, endpoint_id, pmh_input_record, metadata_prefix='oai_dc'):
        self.updated = datetime.datetime.utcnow().isoformat()
        self.id = u'{}:{}'.format(endpoint_id, pmh_input_record.header.identifier)
        self.endpoint_id = endpoint_id
        self.pmh_id = pmh_input_record.header.identifier
        self.api_raw = pmh_input_record.raw
        self.record_timestamp = pmh_input_record.header.datestamp
        self.title = oai_tag_match("title", pmh_input_record)
        self.authors = oai_tag_match("creator", pmh_input_record, return_list=True)
        self.relations = oai_tag_match("relation", pmh_input_record, return_list=True)
        self.oa = oai_tag_match("oa", pmh_input_record)

        if metadata_prefix == 'qdc':
            self.license = oai_tag_match("rights.license", pmh_input_record)
        else:
            self.license = oai_tag_match("rights", pmh_input_record)

        self.sources = oai_tag_match("collname", pmh_input_record, return_list=True)

        identifier_matches = oai_tag_match("identifier", pmh_input_record, return_list=True)
        if self.pmh_id and self.pmh_id.startswith('oai:authors.library.caltech.edu'):
            identifier_matches = []

        if self.pmh_id and self.pmh_id.startswith('oai:deepblue.lib.umich.edu'):
            # lots of identifiers and this item's is first
            identifier_matches.reverse()

        identifier_doi_matches = oai_tag_match("identifier.doi", pmh_input_record, return_list=True)
        self.urls = self.get_good_urls(identifier_matches)

        if not self.urls:
            self.urls = self.get_good_urls(self.relations)

        possible_dois = []

        if self.relations:
            possible_dois += [s for s in self.relations if s and '/*ref*/' not in s and not s.startswith('reference')]

            if self.bare_pmh_id and self.bare_pmh_id.startswith('oai:openarchive.ki.se:'):
                # ticket 22247, relation DOIs are only for this article with this prefix
                possible_dois = [s for s in possible_dois if s.startswith('info:eu-repo/semantics/altIdentifier/doi/')]

        if identifier_matches:
            possible_dois += [s for s in identifier_matches if s]
        if identifier_doi_matches:
            possible_dois += [s for s in identifier_doi_matches if s]

        if possible_dois:
            for possible_doi in possible_dois:
                if (
                    is_doi_url(possible_doi)
                    or possible_doi.startswith(u"doi:")
                    or re.findall(ur"10\.\d", possible_doi)
                ):
                    try:
                        doi_candidate = clean_doi(possible_doi)

                        if not doi_candidate:
                            continue

                        skip_these_doi_snippets = [
                            u'10.17605/osf.io',
                            u'10.14279/depositonce',
                            u'/(issn)',
                            u'10.17169/refubium',
                        ]
                        skip_these_dois = [
                            '10.1002/9781118786352',  # journal
                        ]
                        for doi_snippet in skip_these_doi_snippets:
                            if doi_snippet.lower() in doi_candidate.lower():
                                doi_candidate = None
                                break

                        for skip_doi in skip_these_dois:
                            if skip_doi and doi_candidate and skip_doi.lower() == doi_candidate.lower():
                                doi_candidate = None
                                break

                        if doi_candidate:
                            self.doi = doi_candidate
                    except NoDoiException:
                        pass
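The two blocklists in Example 12 behave differently: skip_these_doi_snippets is a case-insensitive substring test, while skip_these_dois requires an exact case-insensitive match. A sketch that consolidates the filtering into one helper (a refactoring suggestion, not part of the original code):

def accept_doi_candidate(doi_candidate,
                         skip_snippets=(u'10.17605/osf.io',
                                        u'10.14279/depositonce',
                                        u'/(issn)',
                                        u'10.17169/refubium'),
                         skip_dois=(u'10.1002/9781118786352',)):
    # returns the candidate unchanged if it passes both blocklists, else None
    if not doi_candidate:
        return None
    lowered = doi_candidate.lower()
    if any(snippet in lowered for snippet in skip_snippets):
        return None
    if any(lowered == skip_doi.lower() for skip_doi in skip_dois):
        return None
    return doi_candidate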