def is_bronze(self):
    if self.display_evidence == 'open (via free pdf)':
        return True

    if is_doi_url(self.best_url):
        return clean_doi(self.best_url) == self.doi and not (self.is_gold or self.is_hybrid)

    return False

def populate(self, pmh_input_record):
    self.updated = datetime.datetime.utcnow().isoformat()
    self.id = pmh_input_record.header.identifier
    self.api_raw = pmh_input_record.raw
    self.record_timestamp = pmh_input_record.header.datestamp
    self.title = oai_tag_match("title", pmh_input_record)
    self.authors = oai_tag_match("creator", pmh_input_record, return_list=True)
    self.relations = oai_tag_match("relation", pmh_input_record, return_list=True)
    self.oa = oai_tag_match("oa", pmh_input_record)
    self.license = oai_tag_match("rights", pmh_input_record)
    self.sources = oai_tag_match("collname", pmh_input_record, return_list=True)

    identifier_matches = oai_tag_match("identifier", pmh_input_record, return_list=True)
    self.urls = self.get_good_urls(identifier_matches)
    if not self.urls:
        self.urls = self.get_good_urls(self.relations)

    possible_dois = []
    if self.relations:
        possible_dois += [s for s in self.relations if s and '/*ref*/' not in s]
    if identifier_matches:
        possible_dois += [s for s in identifier_matches if s]

    if possible_dois:
        for possible_doi in possible_dois:
            if (is_doi_url(possible_doi)
                    or possible_doi.startswith(u"doi:")
                    or re.findall(ur"10\.\d", possible_doi)):
                try:
                    doi_candidate = clean_doi(possible_doi)

                    skip_these_doi_snippets = [
                        u'10.17605/osf.io',
                        u'10.14279/depositonce',
                        u'/(issn)',
                        u'10.17169/refubium',
                    ]
                    for doi_snippet in skip_these_doi_snippets:
                        if doi_snippet in doi_candidate:
                            doi_candidate = None
                            break

                    if doi_candidate:
                        self.doi = doi_candidate
                except NoDoiException:
                    pass

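# Hedged sketch of what oai_tag_match presumably does (the real helper lives elsewhere in this
# codebase): sickle parses each record's <metadata> block into record.metadata, a dict mapping
# tag names to lists of string values, and populate() pulls single values or whole lists out of it.
def _sketch_oai_tag_match(tagname, pmh_input_record, return_list=False):
    if tagname not in pmh_input_record.metadata:
        return [] if return_list else None
    matches = pmh_input_record.metadata[tagname]
    return matches if return_list else (matches[0] if matches else None)
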
def is_bronze(self):
    if self.best_url and not (self.is_gold or self.is_green) and not self.has_license:
        return True

    if is_doi_url(self.best_url):
        return (clean_doi(self.best_url, return_none_if_error=True) == self.doi
                and not (self.is_gold or self.is_hybrid))

    return False

def is_hybrid(self):
    if self.display_evidence and self.display_evidence.startswith("open"):
        return True

    if is_doi_url(self.best_url):
        if self.is_gold:
            return False
        if clean_doi(self.best_url) == self.doi:
            return True

    return False

def is_bronze(self):
    if self.best_url and not (self.is_gold or self.is_green) and not self.has_open_license:
        return True

    if is_doi_url(self.best_url):
        url_doi = normalize_doi(self.best_url, return_none_if_error=True)
        unquoted_doi = normalize_doi(unquote(self.best_url), return_none_if_error=True)
        return (self.doi in (url_doi, unquoted_doi)
                and not (self.is_gold or self.is_hybrid or self.is_green))

    return False

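# Why the last is_bronze variant normalizes both the raw and the unquoted best_url: DOIs with
# reserved characters are usually percent-encoded in landing-page URLs, so only one of the two
# forms will match the stored DOI string. The URL below is illustrative.
from urllib import unquote  # Python 2; urllib.parse.unquote on Python 3

best_url = u"https://doi.org/10.1175/1520-0469(2000)057%3C1236:TSOTBL%3E2.0.CO;2"
print unquote(best_url)
# https://doi.org/10.1175/1520-0469(2000)057<1236:TSOTBL>2.0.CO;2
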
def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns == "DOI":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:
        # try url
        try:
            id_string = str(orcid_product_dict['url']['value'].encode('utf-8')).lower()
            if is_doi_url(id_string):
                doi = clean_doi(id_string)  # throws error unless valid DOI
        except (TypeError, NoDoiException):
            doi = None

    return doi

def populate(self, pmh_input_record):
    self.updated = datetime.datetime.utcnow().isoformat()
    self.id = pmh_input_record.header.identifier
    self.api_raw = pmh_input_record.raw
    self.record_timestamp = pmh_input_record.header.datestamp
    self.title = oai_tag_match("title", pmh_input_record)
    self.authors = oai_tag_match("creator", pmh_input_record, return_list=True)
    self.relations = oai_tag_match("relation", pmh_input_record, return_list=True)
    self.oa = oai_tag_match("oa", pmh_input_record)
    self.license = oai_tag_match("rights", pmh_input_record)
    self.sources = oai_tag_match("collname", pmh_input_record, return_list=True)

    identifier_matches = oai_tag_match("identifier", pmh_input_record, return_list=True)
    self.urls = self.get_good_urls(identifier_matches)
    if not self.urls:
        self.urls = self.get_good_urls(self.relations)

    possible_dois = []
    if identifier_matches:
        possible_dois += [s for s in identifier_matches if s]
    if self.relations:
        possible_dois += [s for s in self.relations if s]

    if possible_dois:
        for possible_doi in possible_dois:
            if (is_doi_url(possible_doi)
                    or possible_doi.startswith(u"doi:")
                    or re.findall(ur"10\.\d", possible_doi)):
                try:
                    self.doi = clean_doi(possible_doi)
                    dont_use_these_doi_snippets = [u"10.17605/osf.io"]
                    for doi_snippet in dont_use_these_doi_snippets:
                        if self.doi and doi_snippet in self.doi:
                            self.doi = None
                except NoDoiException:
                    pass

    self.doi = self._doi_override_by_id().get(self.id, self.doi)

def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns.lower() == "doi":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:
        # try url
        for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
            try:
                if is_doi_url(nid):
                    doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    return doi

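# Hedged usage sketch for the get_doi_from_biblio_dict variants above. The dict layout is an
# assumption modeled on ORCID work summaries (the live ORCID payload carries more fields), and
# get_identifiers_from_biblio_dict is assumed to yield (namespace, id) pairs from it.
example_orcid_product_dict = {
    'url': {'value': u'https://doi.org/10.1234/example.5678'},
    'external-ids': {'external-id': [
        {'external-id-type': 'doi', 'external-id-value': u'10.1234/example.5678'},
    ]},
}
# get_doi_from_biblio_dict(example_orcid_product_dict) would return u'10.1234/example.5678',
# either from the DOI external-id or, failing that, from the DOI-style url value.
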
def call_pmh_endpoint(self, first=None, last=None, chunk_size=10, scrape=False):
    args = {}
    args['metadataPrefix'] = 'oai_dc'

    if "citeseerx" in self.pmh_url:
        proxy_url = os.getenv("STATIC_IP_PROXY")
        proxies = {"https": proxy_url, "http": proxy_url}
    else:
        proxies = {}

    my_sickle = MySickle(self.pmh_url, proxies=proxies, timeout=120)
    logger.info(u"connected to sickle with {} {}".format(self.pmh_url, proxies))

    args['from'] = first
    if last:
        args["until"] = last

    records_to_save = []

    logger.info(u"calling ListRecords with {} {}".format(self.pmh_url, args))
    try:
        pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args)
        logger.info(u"got pmh_records with {} {}".format(self.pmh_url, args))
        pmh_input_record = safe_get_next_record(pmh_records)
    except Exception as e:
        logger.info(u"no records with {} {}".format(self.pmh_url, args))
        # logger.exception(u"no records with {} {}".format(self.pmh_url, args))
        pmh_input_record = None

    while pmh_input_record:
        my_pmh_record = pmh_record.PmhRecord()

        my_pmh_record.id = pmh_input_record.header.identifier
        my_pmh_record.api_raw = pmh_input_record.raw
        my_pmh_record.record_timestamp = pmh_input_record.header.datestamp
        my_pmh_record.title = oai_tag_match("title", pmh_input_record)
        my_pmh_record.authors = oai_tag_match("creator", pmh_input_record, return_list=True)
        my_pmh_record.oa = oai_tag_match("oa", pmh_input_record)
        my_pmh_record.urls = oai_tag_match("identifier", pmh_input_record, return_list=True)

        # pull a DOI out of any identifier that looks like one
        for fulltext_url in my_pmh_record.urls:
            if fulltext_url and (is_doi_url(fulltext_url)
                                 or fulltext_url.startswith(u"doi:")
                                 or re.findall(u"10\.", fulltext_url)):
                try:
                    my_pmh_record.doi = clean_doi(fulltext_url)
                except NoDoiException:
                    pass

        my_pmh_record.license = oai_tag_match("rights", pmh_input_record)
        my_pmh_record.relations = oai_tag_match("relation", pmh_input_record, return_list=True)
        my_pmh_record.sources = oai_tag_match("collname", pmh_input_record, return_list=True)
        my_pmh_record.source = self.id

        if is_complete(my_pmh_record):
            db.session.merge(my_pmh_record)
            my_pages = my_pmh_record.mint_pages()
            logger.info(u"made {} pages for id {}".format(len(my_pages), my_pmh_record.id))
            for my_page in my_pages:
                if scrape:
                    logger.info(u"scraping pages")
                    my_page.scrape()
                db.session.merge(my_page)
            records_to_save.append(my_pmh_record)
            logger.info(u"my_pmh_record {}".format(my_pmh_record.get_good_urls()))
        else:
            logger.info(u"not complete")

        # commit in chunks so a long harvest doesn't hold everything in one transaction
        if len(records_to_save) >= chunk_size:
            last_record = records_to_save[-1]
            logger.info(u"last record saved: {} for {}".format(last_record.id, self.id))
            safe_commit(db)
            records_to_save = []

        pmh_input_record = safe_get_next_record(pmh_records)

    # make sure to get the last ones
    if records_to_save:
        last_record = records_to_save[-1]
        logger.info(u"saving {} last ones, last record saved: {} for {}".format(
            len(records_to_save), last_record.id, self.id))
        safe_commit(db)

    logger.info(u"done everything for {}".format(self.id))

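# A minimal stand-alone sketch of the same harvest using the sickle library directly, without
# the proxy handling, chunked commits, and page minting above. The endpoint URL and date window
# are examples only; MySickle is presumably a thin wrapper around sickle.Sickle.
from sickle import Sickle

my_sickle = Sickle("http://export.arxiv.org/oai2", timeout=120)
pmh_records = my_sickle.ListRecords(metadataPrefix="oai_dc", ignore_deleted=True,
                                    **{"from": "2017-01-01", "until": "2017-01-02"})
for pmh_input_record in pmh_records:
    print pmh_input_record.header.identifier, pmh_input_record.header.datestamp
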
def scrape_for_fulltext_link(url):
    if DEBUG_SCRAPING:
        print u"getting URL: {}".format(url)

    license = "unknown"
    is_journal = is_doi_url(url) or (u"/doi/" in url)

    if u"ncbi.nlm.nih.gov" in url:
        print u"not scraping {} because is on our do not scrape list.".format(url)
        if "ncbi.nlm.nih.gov/pmc/articles/PMC" in url:
            # pmc has fulltext
            return (url, license)
        else:
            # is an nlm page but not a pmc page, so is not full text
            return (None, license)

    if DEBUG_SCRAPING:
        print u"in scrape_for_fulltext_link"

    with closing(http_get(url, stream=True, timeout=10)) as r:
        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if resp_is_pdf(r):
            if DEBUG_SCRAPING:
                print u"the head says this is a PDF. success! [{}]".format(url)
            return (url, license)
        else:
            if DEBUG_SCRAPING:
                print u"head says not a PDF. continuing more checks"

        # get the HTML tree
        page = r.content
        license = find_normalized_license(page)

        # if they are linking to a .docx or similar, this is open.
        # this only works for repos... a ".doc" in a journal is not the article. example:
        # = closed journal http://doi.org/10.1007/s10822-012-9571-0
        if not is_journal:
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    print u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link, r.url), url)
                return (url, license)

        pdf_download_link = find_pdf_link(page, url)
        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                print u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url)

            pdf_url = get_link_target(pdf_download_link, r.url)
            if is_journal:
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    print u"this is a journal. checking to see the PDF link actually gets a PDF [{}]".format(url)
                if gets_a_pdf(pdf_download_link, r.url):
                    return (pdf_url, license)
            else:
                return (pdf_url, license)

        if license != "unknown":
            # = open 10.1136/bmj.i2716 cc-by
            # = open 10.1136/bmj.i1209 cc-by-nc
            return (None, license)

    if DEBUG_SCRAPING:
        print u"found no PDF download link [{}]".format(url)
    return (None, license)

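# Hedged sketch of the kind of check resp_is_pdf presumably performs on the streamed response
# above; the real helper may also sniff the first bytes of the body for the "%PDF" magic number.
def _sketch_resp_is_pdf(resp):
    return "application/pdf" in resp.headers.get("content-type", "").lower()
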
def populate(self, endpoint_id, pmh_input_record, metadata_prefix='oai_dc'):
    self.updated = datetime.datetime.utcnow().isoformat()
    self.id = u'{}:{}'.format(endpoint_id, pmh_input_record.header.identifier)
    self.endpoint_id = endpoint_id
    self.pmh_id = pmh_input_record.header.identifier
    self.api_raw = pmh_input_record.raw
    self.record_timestamp = pmh_input_record.header.datestamp
    self.title = oai_tag_match("title", pmh_input_record)
    self.authors = oai_tag_match("creator", pmh_input_record, return_list=True)
    self.relations = oai_tag_match("relation", pmh_input_record, return_list=True)
    self.oa = oai_tag_match("oa", pmh_input_record)

    if metadata_prefix == 'qdc':
        self.license = oai_tag_match("rights.license", pmh_input_record)
    else:
        self.license = oai_tag_match("rights", pmh_input_record)

    self.sources = oai_tag_match("collname", pmh_input_record, return_list=True)
    identifier_matches = oai_tag_match("identifier", pmh_input_record, return_list=True)

    if self.pmh_id and self.pmh_id.startswith('oai:authors.library.caltech.edu'):
        identifier_matches = []

    if self.pmh_id and self.pmh_id.startswith('oai:deepblue.lib.umich.edu'):
        # lots of identifiers and this item's is first
        identifier_matches.reverse()

    identifier_doi_matches = oai_tag_match("identifier.doi", pmh_input_record, return_list=True)

    self.urls = self.get_good_urls(identifier_matches)
    if not self.urls:
        self.urls = self.get_good_urls(self.relations)

    possible_dois = []

    if self.relations:
        possible_dois += [s for s in self.relations
                          if s and '/*ref*/' not in s and not s.startswith('reference')]
        if self.bare_pmh_id and self.bare_pmh_id.startswith('oai:openarchive.ki.se:'):
            # ticket 22247, relation DOIs are only for this article with this prefix
            possible_dois = [s for s in possible_dois
                             if s.startswith('info:eu-repo/semantics/altIdentifier/doi/')]

    if identifier_matches:
        possible_dois += [s for s in identifier_matches if s]

    if identifier_doi_matches:
        possible_dois += [s for s in identifier_doi_matches if s]

    if possible_dois:
        for possible_doi in possible_dois:
            if (is_doi_url(possible_doi)
                    or possible_doi.startswith(u"doi:")
                    or re.findall(ur"10\.\d", possible_doi)):
                try:
                    doi_candidate = clean_doi(possible_doi)

                    if not doi_candidate:
                        continue

                    skip_these_doi_snippets = [
                        u'10.17605/osf.io',
                        u'10.14279/depositonce',
                        u'/(issn)',
                        u'10.17169/refubium',
                    ]
                    skip_these_dois = [
                        '10.1002/9781118786352',  # journal
                    ]

                    for doi_snippet in skip_these_doi_snippets:
                        if doi_snippet.lower() in doi_candidate.lower():
                            doi_candidate = None
                            break

                    for skip_doi in skip_these_dois:
                        if skip_doi and doi_candidate and skip_doi.lower() == doi_candidate.lower():
                            doi_candidate = None
                            break

                    if doi_candidate:
                        self.doi = doi_candidate
                except NoDoiException:
                    pass

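# A compact, standalone sketch of the DOI-candidate extraction the populate() variants above
# perform, assuming a simplified stand-in for clean_doi (the real helper normalizes more cases
# and raises NoDoiException rather than returning None).
import re

def _sketch_clean_doi(possible_doi):
    match = re.search(r"10\.\d{4,9}/\S+", possible_doi)
    return match.group(0) if match else None

for s in [u"info:eu-repo/semantics/altIdentifier/doi/10.1234/example.5678",
          u"doi:10.1234/example.5678",
          u"http://hdl.handle.net/2027.42/12345"]:
    print _sketch_clean_doi(s)
# the first two yield u"10.1234/example.5678"; the handle URL yields None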