def reported_noncompliant_url_fragments(dirty_doi):
    if not dirty_doi:
        return []

    lookup_normalized = {}
    for (doi_key, fragment_list) in lookup_raw.iteritems():
        lookup_normalized[clean_doi(doi_key)] = [noncompliant_url_fragment.lower() for noncompliant_url_fragment in fragment_list]

    return lookup_normalized.get(clean_doi(dirty_doi), [])
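For context, `reported_noncompliant_url_fragments` assumes a module-level `lookup_raw` dict mapping reported DOIs to lists of URL fragments; both the keys and the fragments are normalized before lookup. A minimal sketch of that shape, using a hypothetical stand-in for `util.clean_doi` that only lowercases and strips a "doi:" prefix:

def clean_doi(dirty_doi, return_none_if_error=False):
    # stand-in only; the real normalization lives in util.clean_doi
    doi = dirty_doi.strip().lower()
    if doi.startswith("doi:"):
        doi = doi[len("doi:"):]
    return doi

lookup_raw = {
    "10.1234/Example.DOI": ["Example.com/Paywall", "login?target="],  # hypothetical entry
}

print(reported_noncompliant_url_fragments("doi:10.1234/Example.DOI"))
# ['example.com/paywall', 'login?target=']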
def simple_query_tool():
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    clean_dois = [c for c in [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list] if c]

    q = db.session.query(pub.Pub.response_jsonb).filter(pub.Pub.id.in_(clean_dois))
    rows = q.all()

    pub_responses = [row[0] for row in rows if row[0]]
    pub_dois = [r['doi'] for r in pub_responses]
    missing_dois = [d for d in dirty_dois_list if clean_doi(d, return_none_if_error=True) not in pub_dois]
    placeholder_responses = [pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois]
    responses = pub_responses + placeholder_responses

    # save jsonl
    with open("output.jsonl", 'wb') as f:
        for response_jsonb in responses:
            f.write(json.dumps(response_jsonb, sort_keys=True))
            f.write("\n")

    # save csv
    csv_dicts = [pub.csv_dict_from_response_dict(my_dict) for my_dict in responses]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    with open("output.csv", 'wb') as f:
        writer = unicodecsv.DictWriter(f, fieldnames=fieldnames, dialect='excel')
        writer.writeheader()
        for my_dict in csv_dicts:
            writer.writerow(my_dict)

    # prep email
    email_address = body["email"]
    email = create_email(email_address, "Your Unpaywall results", "simple_query_tool", {"profile": {}}, ["output.csv", "output.jsonl"])
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
def set_content_url(self, input):
    has_doi = False
    if input.startswith("10."):
        has_doi = True
    elif self.content_url:
        if self.content_url.startswith("http") and "doi.org/10." in self.content_url:
            has_doi = True
            return
    elif input.startswith("http") and "doi.org/10." in input:
        has_doi = True
    elif self.extract_doi(input):
        has_doi = True

    if not has_doi:
        return

    input = self.extract_doi(input)
    # print "has_doi", has_doi, input[0:10]

    try:
        doi = clean_doi(input)
    except Exception:
        print("no doi found for {}".format(input))
        return

    doi_url = "https://doi.org/{}".format(doi)
    self.content_url = doi_url
def is_bronze(self):
    if self.display_evidence == 'open (via free pdf)':
        return True

    if is_doi_url(self.best_url):
        return clean_doi(self.best_url) == self.doi and not (self.is_gold or self.is_hybrid)

    return False
def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns == "DOI":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:
        # try url
        try:
            id_string = str(orcid_product_dict['url']['value'].encode('utf-8')).lower()
            if is_doi_url(id_string):
                doi = clean_doi(id_string)  # throws error unless valid DOI
        except (TypeError, NoDoiException):
            doi = None

    return doi
def populate(self, pmh_input_record):
    self.updated = datetime.datetime.utcnow().isoformat()
    self.id = pmh_input_record.header.identifier
    self.api_raw = pmh_input_record.raw
    self.record_timestamp = pmh_input_record.header.datestamp
    self.title = oai_tag_match("title", pmh_input_record)
    self.authors = oai_tag_match("creator", pmh_input_record, return_list=True)
    self.relations = oai_tag_match("relation", pmh_input_record, return_list=True)
    self.oa = oai_tag_match("oa", pmh_input_record)
    self.license = oai_tag_match("rights", pmh_input_record)
    self.sources = oai_tag_match("collname", pmh_input_record, return_list=True)

    identifier_matches = oai_tag_match("identifier", pmh_input_record, return_list=True)
    self.urls = self.get_good_urls(identifier_matches)
    if not self.urls:
        self.urls = self.get_good_urls(self.relations)

    possible_dois = []
    if self.relations:
        possible_dois += [s for s in self.relations if s and '/*ref*/' not in s]
    if identifier_matches:
        possible_dois += [s for s in identifier_matches if s]

    if possible_dois:
        for possible_doi in possible_dois:
            if (is_doi_url(possible_doi)
                    or possible_doi.startswith(u"doi:")
                    or re.findall(ur"10\.\d", possible_doi)):
                try:
                    doi_candidate = clean_doi(possible_doi)

                    skip_these_doi_snippets = [
                        u'10.17605/osf.io',
                        u'10.14279/depositonce',
                        u'/(issn)',
                        u'10.17169/refubium',
                    ]
                    for doi_snippet in skip_these_doi_snippets:
                        if doi_snippet in doi_candidate:
                            doi_candidate = None
                            break

                    if doi_candidate:
                        self.doi = doi_candidate
                except NoDoiException:
                    pass
def post_gs_cache(**kwargs):
    my_doi = clean_doi(kwargs["doi"])
    q = Gs.query.filter(Gs.doi == my_doi, Gs.landing_page_url == kwargs["landing_page_url"])
    my_gs = q.first()
    if not my_gs:
        my_gs = Gs(**kwargs)
        db.session.add(my_gs)
        safe_commit(db)
    return my_gs
def get_doi_endpoint(doi):
    my_doi = Doi(clean_doi(doi))
    if my_doi.is_cached_not_expired():
        # responses with many events are cached in the database
        response = my_doi.cached_response()
    else:
        my_doi.get()
        response = my_doi.to_dict()
        my_doi.save_to_cache(response)
    return jsonify(response)
def is_bronze(self):
    if self.best_url and not (self.is_gold or self.is_green) and not self.has_license:
        return True

    if is_doi_url(self.best_url):
        return (clean_doi(self.best_url, return_none_if_error=True) == self.doi
                and not (self.is_gold or self.is_hybrid))

    return False
def get_gs_cache(dirty_doi):
    my_doi = clean_doi(dirty_doi)
    # return the best one we've got, so null urls are last
    my_gs = Gs.query.filter(Gs.doi == my_doi).order_by(
        Gs.landing_page_url.desc().nullslast()).first()
    # if my_gs:
    #     my_gs.num_hits += 1
    #     safe_commit(db)
    return my_gs
def is_hybrid(self):
    # import pdb; pdb.set_trace()
    if self.display_evidence and self.display_evidence.startswith("open"):
        return True

    if is_doi_url(self.best_url):
        if self.is_gold:
            return False
        if clean_doi(self.best_url) == self.doi:
            return True

    return False
def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns.lower() == "doi":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:
        # try url
        for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
            try:
                if is_doi_url(nid):
                    doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    return doi
def reset_vars(self):
    if self.id and self.id.startswith("10."):
        self.id = clean_doi(self.id)

    self.license = None
    self.free_metadata_url = None
    self.free_pdf_url = None
    self.fulltext_url = None
    self.oa_color = None
    self.evidence = None
    self.open_locations = []
    self.closed_urls = []
    self.session_id = None
    self.version = None
def lookup_product(**biblio):
    my_pub = None
    if "doi" in biblio and biblio["doi"]:
        doi = clean_doi(biblio["doi"])
        my_pub = Pub.query.get(doi)
        if my_pub:
            logger.info(u"found {} in pub db table!".format(my_pub.id))
            my_pub.reset_vars()
        else:
            raise NoDoiException
            # my_pub = Crossref(**biblio)
            # logger.info(u"didn't find {} in crossref db table".format(my_pub))

    return my_pub
def scroll_through_all_dois(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    if first:
        base_url = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"
    else:
        base_url = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    number_added = 0

    while has_more_responses:
        has_more_responses = False
        start_time = time()

        url = base_url.format(first=first, last=last, rows=chunk_size, next_cursor=next_cursor)
        logger.info(u"calling url: {}".format(url))

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds. url: {}".format(elapsed(start_time, 2), url))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            return number_added

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)
            if resp_data["items"] and len(resp_data["items"]) == chunk_size:
                has_more_responses = True

        dois_from_api = [clean_doi(api_raw["DOI"]) for api_raw in resp_data["items"]]
        added_pubs = add_new_pubs_from_dois(dois_from_api)
        if dois_from_api:
            logger.info(u"got {} dois from api".format(len(dois_from_api)))
        if added_pubs:
            logger.info(u"{}: saved {} new pubs, including {}".format(first, len(added_pubs), added_pubs[-2:]))

        number_added += len(added_pubs)

        logger.info(u"loop done in {} seconds".format(elapsed(start_time, 2)))

    return number_added
def strip_junk_from_end_of_doi(self, doi):
    doi = re.sub("\s+", "", doi)
    if '">' in doi:
        doi = doi.split('">')[0]
    if "</a>" in doi:
        doi = doi.split("</a>")[0]
    doi = doi.strip(",")  # has to be first, because comma would be last item on line
    doi = doi.strip(".")  # has to be near first, because period would be last item on line
    doi = doi.strip("'")
    doi = doi.strip('"')
    doi = doi.strip("}")
    doi = clean_doi(doi).lower()
    return doi
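As a rough illustration (the input value here is hypothetical), the cleanup in strip_junk_from_end_of_doi is aimed at DOIs scraped out of markup, where anchor tags and trailing punctuation ride along with the identifier:

import re

raw = u' 10.1234/abc123">full text</a>, '  # hypothetical scraped value
raw = re.sub("\s+", "", raw)               # collapse whitespace: u'10.1234/abc123">fulltext</a>,'
raw = raw.split('">')[0]                   # drop the trailing anchor markup: u'10.1234/abc123'
raw = raw.strip(",").strip(".")            # trailing punctuation, if any, goes too
# clean_doi(raw).lower() then canonicalizes whatever is left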
def simple_query_tool():
    body = request.json
    return_type = body.get("return_type", "csv")
    dirty_dois_list = body["dois"]

    clean_dois = [clean_doi(dirty_doi, return_none_if_error=True) for dirty_doi in dirty_dois_list]
    clean_dois = [doi for doi in clean_dois if doi]

    q = db.session.query(pub.Pub.response_jsonb).filter(pub.Pub.id.in_(clean_dois))
    rows = q.all()
    pub_responses = [row[0] for row in rows]

    # save jsonl
    with open("output.jsonl", 'wb') as f:
        for response_jsonb in pub_responses:
            f.write(json.dumps(response_jsonb, sort_keys=True))
            f.write("\n")

    # save csv
    csv_dicts = [pub.csv_dict_from_response_dict(my_dict) for my_dict in pub_responses]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    with open("output.csv", 'wb') as f:
        writer = unicodecsv.DictWriter(f, fieldnames=fieldnames, dialect='excel')
        writer.writeheader()
        for my_dict in csv_dicts:
            writer.writerow(my_dict)

    # prep email
    email_address = body["email"]
    email = create_email(email_address, "Your Unpaywall results", "simple_query_tool", {"profile": {}}, ["output.csv", "output.jsonl"])
    send(email, for_real=True)

    # @todo make sure in the return dict that there is a row for every doi
    # even those not in our db

    return jsonify({"got it": email_address, "dois": clean_dois})
def run_update(parsed_args):
    update = update_registry.get(parsed_args.fn)

    start = time()

    # convenience method for handling a doi
    if parsed_args.doi:
        from pub import Pub
        from util import clean_doi

        my_pub = db.session.query(Pub).filter(Pub.id == clean_doi(parsed_args.doi)).first()
        parsed_args.id = my_pub.id
        logger.info(u"Got database hit for this doi: {}".format(my_pub.id))

    update.run(**vars(parsed_args))

    db.session.remove()
    logger.info(u"finished update in {} seconds".format(elapsed(start)))
def __init__(self, **kwargs):
    self.request_kwargs = kwargs
    self.base_dcoa = None
    self.repo_urls = {"urls": []}
    self.license_string = ""

    self.id = shortuuid.uuid()[0:10]
    self.created = datetime.datetime.utcnow()
    self.updated = datetime.datetime.utcnow()

    for (k, v) in kwargs.iteritems():
        if v:
            value = v.strip()
            setattr(self, k, value)

    if self.doi:
        self.doi = clean_doi(self.doi)
        self.url = u"http://doi.org/{}".format(self.doi)
def populate(self, pmh_input_record):
    self.updated = datetime.datetime.utcnow().isoformat()
    self.id = pmh_input_record.header.identifier
    self.api_raw = pmh_input_record.raw
    self.record_timestamp = pmh_input_record.header.datestamp
    self.title = oai_tag_match("title", pmh_input_record)
    self.authors = oai_tag_match("creator", pmh_input_record, return_list=True)
    self.relations = oai_tag_match("relation", pmh_input_record, return_list=True)
    self.oa = oai_tag_match("oa", pmh_input_record)
    self.license = oai_tag_match("rights", pmh_input_record)
    self.sources = oai_tag_match("collname", pmh_input_record, return_list=True)

    identifier_matches = oai_tag_match("identifier", pmh_input_record, return_list=True)
    self.urls = self.get_good_urls(identifier_matches)
    if not self.urls:
        self.urls = self.get_good_urls(self.relations)

    possible_dois = []
    if identifier_matches:
        possible_dois += [s for s in identifier_matches if s]
    if self.relations:
        possible_dois += [s for s in self.relations if s]

    if possible_dois:
        for possible_doi in possible_dois:
            if (is_doi_url(possible_doi)
                    or possible_doi.startswith(u"doi:")
                    or re.findall(u"10\.\d", possible_doi)):
                try:
                    self.doi = clean_doi(possible_doi)
                    dont_use_these_doi_snippets = [u"10.17605/osf.io"]
                    for doi_snippet in dont_use_these_doi_snippets:
                        if self.doi and doi_snippet in self.doi:
                            self.doi = None
                except NoDoiException:
                    pass

    self.doi = self._doi_override_by_id().get(self.id, self.doi)
def run(parsed_args, job_type):
    start = time()

    if job_type in ("normal", "hybrid"):
        update = update_registry.get("WeeklyStats." + process_name(job_type))
        if parsed_args.doi:
            parsed_args.id = clean_doi(parsed_args.doi)
            parsed_args.doi = None
    else:
        update = update_registry.get("DateRange.get_events")

    update.run(**vars(parsed_args))

    logger.info("finished update in {} seconds".format(elapsed(start)))

    if job_type in ("normal", "hybrid"):
        from event import CedEvent
        my_event = CedEvent.query.get(parsed_args.id)
        pprint(my_event)
def worker_run(self, **kwargs):
    run_class = Pub

    single_obj_id = kwargs.get("id", None)
    chunk_size = kwargs.get("chunk", 100)
    limit = kwargs.get("limit", None)

    if limit is None:
        limit = float("inf")

    if single_obj_id:
        single_obj_id = clean_doi(single_obj_id)
        objects = [run_class.query.filter(run_class.id == single_obj_id).first()]
        extract_pub_pdf_urls(objects)
    else:
        index = 0
        num_updated = 0
        start_time = time()

        while num_updated < limit:
            new_loop_start_time = time()

            objects = self.fetch_queue_chunk(chunk_size)

            if not objects:
                sleep(5)
                continue

            object_ids = [obj.id for obj in objects]
            extract_pub_pdf_urls(objects)

            object_ids_str = u",".join([u"'{}'".format(oid.replace(u"'", u"''")) for oid in object_ids])
            object_ids_str = object_ids_str.replace(u"%", u"%%")  # sql escaping

            sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                queue_table=self.table_name(None), ids=object_ids_str
            )
            run_sql(db, sql_command)

            index += 1
            num_updated += len(objects)
            self.print_update(new_loop_start_time, chunk_size, limit, start_time, index)
def run(parsed_args, job_type):
    start = time()

    if job_type in ("normal", "hybrid"):
        update = update_registry.get("Pub." + process_name(job_type))
        if parsed_args.doi:
            parsed_args.id = clean_doi(parsed_args.doi)
            parsed_args.doi = None
    else:
        update = update_registry.get("DateRange.get_unpaywall_events")
        # update = update_registry.get("DateRange.get_pmh_events")

    update.run(**vars(parsed_args))

    logger.info(u"finished update in {} seconds".format(elapsed(start)))

    resp = None
    if job_type in ("normal", "hybrid"):
        my_pub = Pub.query.get(parsed_args.id)
        resp = my_pub.response_jsonb
        pprint(resp)

    return resp
def get_pub_by_doi(my_doi):
    my_clean_doi = clean_doi(my_doi)
    # print my_clean_doi

    query = db.session.query(PubDoi).filter(
        PubDoi.doi == my_clean_doi).options(orm.undefer_group('full'))
    # print query
    my_pub = query.first()
    # print my_pub

    if not my_pub:
        abort_json(404, u"'{}' is an invalid doi. See https://doi.org/{}".format(
            my_clean_doi, my_clean_doi))

    my_pub_list = PubList(pubs=[my_pub])
    my_pub_list.set_dandelions()
    my_pub_list.set_pictures()
    results = my_pub_list.to_dict_serp_list()
    return jsonify({
        "results": my_pub_list.to_dict_serp_list(),
        "annotations": my_pub_list.to_dict_annotation_metadata(),
    })
def worker_run(self, **kwargs): single_obj_id = kwargs.get("id", None) chunk = kwargs.get("chunk", 100) limit = kwargs.get("limit", 10) run_class = Pub run_method = kwargs.get("method") if single_obj_id: limit = 1 queue_table = None elif run_method == "refresh": queue_table = "pub_refresh_queue" if not limit: limit = 1000 text_query_pattern = """ with refresh_queue as ( select id from {queue_table} where started is null order by priority desc, finished nulls first, started, rand limit {chunk} for update skip locked ) update {queue_table} queue_rows_to_update set started = now() from refresh_queue where refresh_queue.id = queue_rows_to_update.id returning refresh_queue.id;""" text_query = text_query_pattern.format( chunk=chunk, queue_table=queue_table ) logger.info(u"the queue query is:\n{}".format(text_query)) else: queue_table = "pub_queue" if not limit: limit = 1000 text_query_pattern = """WITH update_pub_queue AS ( SELECT id FROM {queue_table} WHERE started is null order by finished asc nulls first LIMIT {chunk} FOR UPDATE SKIP LOCKED ) UPDATE {queue_table} queue_rows_to_update SET started=now() FROM update_pub_queue WHERE update_pub_queue.id = queue_rows_to_update.id RETURNING update_pub_queue.id;""" text_query = text_query_pattern.format( limit=limit, chunk=chunk, queue_table=queue_table ) logger.info(u"the queue query is:\n{}".format(text_query)) index = 0 start_time = time() while True: new_loop_start_time = time() if single_obj_id: single_obj_id = clean_doi(single_obj_id) objects = [run_class.query.filter(run_class.id == single_obj_id).first()] else: logger.info(u"looking for new jobs") job_time = time() row_list = db.engine.execute(text(text_query).execution_options(autocommit=True)).fetchall() object_ids = [row[0] for row in row_list] logger.info(u"got ids, took {} seconds".format(elapsed(job_time))) job_time = time() q = db.session.query(Pub).options(orm.undefer('*')).filter(Pub.id.in_(object_ids)) objects = q.all() logger.info(u"got pub objects in {} seconds".format(elapsed(job_time))) # shuffle them or they sort by doi order random.shuffle(objects) # objects = Pub.query.from_statement(text(text_query)).execution_options(autocommit=True).all() # objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all() # id_rows = db.engine.execute(text(text_query)).fetchall() # ids = [row[0] for row in id_rows] # # job_time = time() # objects = run_class.query.filter(run_class.id.in_(ids)).all() # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(job_time))) if not objects: # logger.info(u"sleeping for 5 seconds, then going again") sleep(5) continue object_ids = [obj.id for obj in objects] self.update_fn(run_class, run_method, objects, index=index) # logger.info(u"finished update_fn") if queue_table: object_ids_str = u",".join([u"'{}'".format(id.replace(u"'", u"''")) for id in object_ids]) object_ids_str = object_ids_str.replace(u"%", u"%%") #sql escaping sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format( queue_table=queue_table, ids=object_ids_str) # logger.info(u"sql command to update finished is: {}".format(sql_command)) run_sql(db, sql_command) # logger.info(u"finished run_sql") # finished is set in update_fn index += 1 if single_obj_id: return else: self.print_update(new_loop_start_time, chunk, limit, start_time, index)
def run_through_dois(filename=None, reverse=None, loggly=False): total_start = time() i = 0 output_dicts = [] fh = open(filename, "r") lines = fh.readlines() if reverse: logger.info(u"reverse!") lines.reverse() i = -1 * len(lines) dois = [] for line in lines: dois.append(line.strip()) # line = line.replace('"', '') # if u"," in line: # split_line = line.split(",") # if loggly: # dois.append(split_line[1]) # else: # dois.append(split_line[0]) # else: # dois.append(line.strip()) # deduplicate, preserving order duplicated_dois = dois dois = [] for doi in duplicated_dois: if doi not in dois: dois.append(doi) logger.info(u"length of deduped doi list: {}".format(len(dois))) for doi in dois: try: my_doi = clean_doi(doi) except NoDoiException: logger.info(u"bad doi: {}".format(doi)) continue if not my_doi: logger.info(u"bad doi: {}".format(doi)) continue my_pub = Oab.query.get(my_doi) if not my_pub: my_pub = Oab() db.session.add(my_pub) my_pub.id = my_doi my_doi_url = "http://doi.org/{}".format(my_doi) my_doi_url_encoded = urllib.quote_plus(my_doi_url) api_url = "https://api.openaccessbutton.org/availability?url={}".format(my_doi_url_encoded) headers = {"content-type": "application/json"} r = requests.get(api_url, headers=headers) if r.status_code == 200: logger.info(u"success with oab! with {}".format(my_doi)) # logger.info(r.json()) my_pub.api = r.json() flag_modified(my_pub, "api") else: logger.info(u"problem with oab, status_code {}".format(r.status_code)) dissemin_url = "http://dissem.in/api/{}".format(my_doi) r = requests.get(dissemin_url, headers=headers) if r.status_code == 200: logger.info(u"success! with dissemin! with {}".format(my_doi)) # logger.info(r.json()) my_pub.dissemin = r.json() flag_modified(my_pub, "dissemin") else: logger.info(u"problem with dissemin, status_code {}".format(r.status_code)) safe_commit(db) i += 1 logger.info(u"finished {} in {} seconds".format(i, elapsed(total_start, 2))) fh.close()
def save_new_dois(self, rows=1000):
    headers = {"Accept": "application/json", "User-Agent": "impactstory.org"}
    base_url_with_last = "http://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&cursor={next_cursor}"
    # but if want all changes, use "indexed" not "created" as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates

    next_cursor = "*"
    has_more_responses = True
    num_so_far = 0
    num_between_commits = 0

    while has_more_responses:
        start_time = time()
        url = base_url_with_last.format(first=self.first_day, last=self.last_day, rows=rows, next_cursor=next_cursor)
        # logger.info(u"calling url: {}".format(url))

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(start_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            return

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)

        if not resp_data["items"] or not next_cursor:
            has_more_responses = False

        for api_raw in resp_data["items"]:
            doi = clean_doi(api_raw["DOI"])
            my_pub = Pub(id=doi, crossref_api_raw_new=api_raw)
            # my_pub.title = my_pub.crossref_title
            # my_pub.normalized_title = normalize_title(my_pub.title)
            # my_pub.update()
            db.session.merge(my_pub)
            num_between_commits += 1
            num_so_far += 1

            if num_between_commits > 100:
                # logger.info(u"committing")
                start_commit = time()
                safe_commit(db)
                logger.info(u"committing done in {} seconds".format(elapsed(start_commit, 2)))
                num_between_commits = 0

        # logger.info(u"at bottom of loop, got {} records".format(len(resp_data["items"])))

    # make sure to get the last ones
    logger.info(u"done everything, saving last ones")
    safe_commit(db)
    return num_so_far
def simple_query_tool():
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    clean_dois = [c for c in [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list] if c]

    q = db.session.query(pub.Pub.response_jsonb).filter(pub.Pub.id.in_(clean_dois))
    rows = q.all()

    pub_responses = [row[0] for row in rows if row[0]]
    pub_dois = [r['doi'] for r in pub_responses]
    missing_dois = [d for d in dirty_dois_list if clean_doi(d, return_none_if_error=True) not in pub_dois]
    placeholder_responses = [pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois]
    responses = pub_responses + placeholder_responses

    formats = body.get("formats", []) or ["jsonl", "csv"]
    files = []

    if "jsonl" in formats:
        # save jsonl
        with open("output.jsonl", 'wb') as f:
            for response_jsonb in responses:
                f.write(json.dumps(response_jsonb, sort_keys=True))
                f.write("\n")
        files.append("output.jsonl")

    csv_dicts = [pub.csv_dict_from_response_dict(my_dict) for my_dict in responses]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]

    if "csv" in formats:
        # save csv
        with open("output.csv", 'wb') as f:
            writer = unicodecsv.DictWriter(f, fieldnames=fieldnames, dialect='excel')
            writer.writeheader()
            for my_dict in csv_dicts:
                writer.writerow(my_dict)
        files.append("output.csv")

    if "xlsx" in formats:
        book = Workbook()
        sheet = book.worksheets[0]
        sheet.title = "results"

        for col_idx, field_name in enumerate(fieldnames):
            sheet.cell(column=col_idx+1, row=1, value=field_name)

        for row_idx, row in enumerate(csv_dicts):
            for col_idx, field_name in enumerate(fieldnames):
                sheet.cell(column=col_idx+1, row=row_idx+2, value=row[field_name])

        book.save(filename="output.xlsx")
        files.append("output.xlsx")

    # prep email
    email_address = body["email"]
    email = create_email(email_address, "Your Unpaywall results", "simple_query_tool", {"profile": {}}, files)
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
def get_new_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000): # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service headers={"Accept": "application/json", "User-Agent": "mailto:[email protected]"} root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}" root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}" root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}" # but if want all changes, use "indexed" not "created" as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}" # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}" next_cursor = "*" has_more_responses = True num_pubs_added_so_far = 0 pubs_this_chunk = [] if week: last = (datetime.date.today() + datetime.timedelta(days=1)) first = (datetime.date.today() - datetime.timedelta(days=7)) elif today: last = (datetime.date.today() + datetime.timedelta(days=1)) first = (datetime.date.today() - datetime.timedelta(days=2)) if not first: first = datetime.date(2016, 4, 1) last = last and last - datetime.timedelta(days=offset_days) first = first and first - datetime.timedelta(days=offset_days) start_time = time() while has_more_responses: if query_doi: url = root_url_doi.format(doi=query_doi) else: if last: url = root_url_with_last.format(first=first.isoformat(), last=last.isoformat(), next_cursor=next_cursor, chunk=chunk_size) else: # query is much faster if don't have a last specified, even if it is far in the future url = root_url_no_last.format(first=first.isoformat(), next_cursor=next_cursor, chunk=chunk_size) logger.info(u"calling url: {}".format(url)) crossref_time = time() resp = requests.get(url, headers=headers) logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2))) if resp.status_code != 200: logger.info(u"error in crossref call, status_code = {}".format(resp.status_code)) resp = None if resp: resp_data = resp.json()["message"] next_cursor = resp_data.get("next-cursor", None) if next_cursor: next_cursor = quote(next_cursor) if not resp_data["items"] or not next_cursor: has_more_responses = False for api_raw in resp_data["items"]: loop_time = time() doi = clean_doi(api_raw["DOI"]) my_pub = build_new_pub(doi, api_raw) # hack so it gets updated soon my_pub.updated = datetime.datetime(1042, 1, 1) pubs_this_chunk.append(my_pub) if len(pubs_this_chunk) >= 100: added_pubs = add_new_pubs(pubs_this_chunk) logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2))) num_pubs_added_so_far += len(added_pubs) # if new_pubs: # id_links = ["http://api.oadoi.org/v2/{}".format(my_pub.id) for my_pub in new_pubs[0:5]] # logger.info(u"last few ids were {}".format(id_links)) pubs_this_chunk = [] logger.info(u"at bottom of loop") # make sure to get the last ones logger.info(u"saving last ones") added_pubs = add_new_pubs(pubs_this_chunk) num_pubs_added_so_far += len(added_pubs) logger.info(u"Added >>{}<< new crossref dois on {}, took {} 
seconds".format( num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
def fulltext_search_title(original_query, query_entities, oa_only, full=True): start_time = time() original_query_escaped = original_query.replace("'", "''") original_query_with_ands = ' & '.join(original_query_escaped.split(" ")) query_to_use = u"({})".format(original_query_with_ands) if oa_only: oa_clause = u" and is_oa=True " else: oa_clause = " " dois = [] rows = [] search_done = False if is_doi(original_query): dois = [clean_doi(original_query)] search_done = True # if "from_" in original_query and "to_" in original_query: # print u"getting recent query" # matches = re.findall("from_(\d{4}.\d{2}.\d{2})_to_(\d{4}.\d{2}.\d{2})", original_query) # from_date = matches[0][0].replace("_", "-") # to_date = matches[0][1].replace("_", "-") # query_string = u""" # select pmid, 0.05*COALESCE(num_events, 0.0)::float as rank # from search_recent_hits_mv # where published_date > :from_date ::timestamp and published_date < :to_date ::timestamp # and num_events is not null # {oa_clause} # order by num_events desc # limit 100 """.format(oa_clause=oa_clause) # rows = db.engine.execute(sql.text(query_string), from_date=from_date, to_date=to_date).fetchall() # print "done getting query getting pmids" if not search_done and query_entities and len(query_entities)==1: query_entity = query_entities[0] query_entity = query_entity.replace("(", " ") query_entity = query_entity.replace(")", " ") query_entity = query_entity.replace("&", " ") print u"have query_entities" query_string = u""" select doi from search_title_dandelion_simple_mv where title=:query_entity and num_events >= 3 {oa_clause} order by num_events desc limit 120""".format(oa_clause=oa_clause) rows = db.engine.execute(sql.text(query_string), query_entity=query_entity).fetchall() print "done getting query getting dois" original_query_escaped = query_entity.replace("'", "''") original_query_with_ands = ' & '.join(original_query_escaped.split(" ")) query_to_use = u"({})".format(original_query_with_ands) if rows: dois = [row[0] for row in rows] print "len dois", len(dois) if not search_done and len(dois) < 25: print "len(dois) < 25, in fulltext_search_title" # if True: # debug # print "doing full text search anyway" # need to do the full search print "len(dois) < 25, in fulltext_search_title" original_query_escaped = original_query.replace("'", "''") original_query_escaped = original_query_escaped.replace("&", "") original_query_escaped = original_query_escaped.replace("(", " ") original_query_escaped = original_query_escaped.replace(")", " ") original_query_with_ands = ' & '.join([w for w in original_query_escaped.split(" ") if w and w != " "]) query_to_use = u"({})".format(original_query_with_ands) if query_entities: entities_escaped = [] for query_entity in query_entities: print query_entity entity_escaped = query_entity entity_escaped = entity_escaped.replace("'", "''") entity_escaped = entity_escaped.replace("&", "") entity_escaped = entity_escaped.replace("(", "") entity_escaped = entity_escaped.replace(")", "") entity_escaped = u" & ".join(entity_escaped.split(u" ")) entities_escaped += [entity_escaped] print "entities_escaped", entities_escaped entity_with_ands = u' & '.join(entities_escaped) print "entity_with_ands", entity_with_ands query_to_use += u" | ({})".format(entity_with_ands) # get ride of bad characters query_to_use = query_to_use.replace("!", "") print u"starting query for {}".format(query_to_use) query_string = u""" select doi, (ts_rank_cd(to_tsvector('english', article_title), to_tsquery(:query), 1) + 
0.05*COALESCE(num_events,0.0)) AS rank FROM ricks_gtr_sort_results WHERE to_tsvector('english', article_title) @@ to_tsquery(:query) and doi is not null {oa_clause} order by rank desc limit 120; """.format(oa_clause=oa_clause) # print query_string rows = db.engine.execute(sql.text(query_string), query=query_to_use).fetchall() print "done getting query of sort data" # print rows dois = [row[0] for row in rows] time_for_dois = elapsed(start_time, 3) print u"done query for dois and sort data: got {} dois".format(len(dois)) time_for_pubs_start_time = time() my_pubs_filtered = [] if dois: if full: query_string = u""" select pmid, doi, article_title, journal_title, pub_types, abstract_length, is_oa, num_events, num_news_events, (ts_rank_cd(to_tsvector('english', article_title), to_tsquery(:query), 1) + 0.05*COALESCE(num_events,0.0)) AS rank from ricks_gtr_sort_results where doi in ({dois_string}) """.format(dois_string=u",".join([u"'{}'".format(str(d)) for d in dois])) # print query_string rows = db.engine.execute(sql.text(query_string), query=query_to_use, dois=dois).fetchall() print "done getting sort data" # print rows # print rows my_pubs_filtered = [] for row in rows: my_dict = { "pmid": row[0], "doi": row[1], "article_title": row[2], "journal_title": row[3], "pub_types": row[4], "abstract_length": row[5], "is_oa": row[6], "num_events": row[7], "num_news_events": row[8], "score": row[9], "query": query_to_use, "query_entities": query_entities } my_dict["adjusted_score"] = adjusted_score(my_dict) my_pubs_filtered.append(my_dict) # my_pubs = db.session.query(Pub).filter(Pub.pmid.in_(pmids)).options(orm.undefer_group('full')).all() # my_pubs = db.session.query(Pub).filter(Pub.pmid.in_(pmids)).\ # options(orm.raiseload(Pub.authors)).\ # options(orm.raiseload(Pub.dandelion_lookup)).\ # options(orm.raiseload(Pub.doi_lookup)).\ # all() else: my_pubs = db.session.query(Pub).filter(Pub.doi.in_(dois)).\ options(orm.raiseload(Pub.authors)).\ options(orm.raiseload(Pub.dandelion_lookup)).\ options(orm.raiseload(Pub.doi_lookup)).\ all() my_pubs_filtered = [p for p in my_pubs if not p.suppress] print "done query for my_pubs" time_for_pubs = elapsed(time_for_pubs_start_time, 3) return (my_pubs_filtered, time_for_dois, time_for_pubs)
def clean_doi(self):
    if not self.doi:
        return None
    return clean_doi(self.doi)
def __init__(self, **kwargs):
    self.updated = datetime.datetime.utcnow()
    if "doi" in kwargs:
        kwargs["doi"] = clean_doi(kwargs["doi"])
    super(Chorus, self).__init__(**kwargs)
def get_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000, get_updates=False): # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"} root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}" if get_updates: root_url_with_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first},until-index-date:{last}&rows={chunk}&cursor={next_cursor}" root_url_no_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first}&rows={chunk}&cursor={next_cursor}" else: root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}" root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}" next_cursor = "*" has_more_responses = True num_pubs_added_so_far = 0 pubs_this_chunk = [] if week: last = (datetime.date.today() + datetime.timedelta(days=1)) first = (datetime.date.today() - datetime.timedelta(days=7)) elif today: last = (datetime.date.today() + datetime.timedelta(days=1)) first = (datetime.date.today() - datetime.timedelta(days=2)) if not first: first = datetime.date(2016, 4, 1) last = last and last - datetime.timedelta(days=offset_days) first = first and first - datetime.timedelta(days=offset_days) start_time = time() insert_pub_fn = add_pubs_or_update_crossref if get_updates else add_new_pubs while has_more_responses: if query_doi: url = root_url_doi.format(doi=query_doi) else: if last: url = root_url_with_last.format(first=first.isoformat(), last=last.isoformat(), next_cursor=next_cursor, chunk=chunk_size) else: # query is much faster if don't have a last specified, even if it is far in the future url = root_url_no_last.format(first=first.isoformat(), next_cursor=next_cursor, chunk=chunk_size) logger.info(u"calling url: {}".format(url)) crossref_time = time() resp = requests.get(url, headers=headers) logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2))) if resp.status_code != 200: logger.info(u"error in crossref call, status_code = {}".format(resp.status_code)) resp = None if resp: resp_data = resp.json()["message"] next_cursor = resp_data.get("next-cursor", None) if next_cursor: next_cursor = quote(next_cursor) if not resp_data["items"] or not next_cursor: has_more_responses = False for api_raw in resp_data["items"]: loop_time = time() doi = clean_doi(api_raw["DOI"]) my_pub = build_new_pub(doi, api_raw) # hack so it gets updated soon my_pub.updated = datetime.datetime(1042, 1, 1) pubs_this_chunk.append(my_pub) if len(pubs_this_chunk) >= 100: added_pubs = insert_pub_fn(pubs_this_chunk) logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2))) num_pubs_added_so_far += len(added_pubs) pubs_this_chunk = [] logger.info(u"at bottom of loop") # make sure to get the last ones logger.info(u"saving last ones") added_pubs = insert_pub_fn(pubs_this_chunk) num_pubs_added_so_far += len(added_pubs) logger.info(u"Added >>{}<< new crossref dois on {}, took {} seconds".format( num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
def get_overrides_dict(): override_dict = defaultdict(dict) # cindy wu example override_dict["10.1038/nature21360"] = { "pdf_url": "https://arxiv.org/pdf/1703.01424.pdf", "version": "submittedVersion" } # example from twitter override_dict["10.1021/acs.jproteome.5b00852"] = { "pdf_url": "http://pubs.acs.org/doi/pdfplus/10.1021/acs.jproteome.5b00852", "host_type_set": "publisher", "version": "publishedVersion" } # have the unpaywall example go straight to the PDF, not the metadata page override_dict["10.1098/rspa.1998.0160"] = { "pdf_url": "https://arxiv.org/pdf/quant-ph/9706064.pdf", "version": "submittedVersion" } # missed, not in BASE, from Maha Bali in email override_dict["10.1080/13562517.2014.867620"] = { "pdf_url": "http://dar.aucegypt.edu/bitstream/handle/10526/4363/Final%20Maha%20Bali%20TiHE-PoD-Empowering_Sept30-13.pdf", "version": "submittedVersion" } # otherwise links to figshare match that only has data, not the article override_dict["110.1126/science.aaf3777"] = {} #otherwise links to a metadata page that doesn't have the PDF because have to request a copy: https://openresearch-repository.anu.edu.au/handle/1885/103608 override_dict["10.1126/science.aad2622"] = { "pdf_url": "https://lra.le.ac.uk/bitstream/2381/38048/6/Waters%20et%20al%20draft_post%20review_v2_clean%20copy.pdf", "version": "submittedVersion" } # otherwise led to http://www.researchonline.mq.edu.au/vital/access/services/Download/mq:39727/DS01 and authorization error override_dict["10.1126/science.aad2622"] = {} # else goes here: http://www.it-c.dk/people/schmidt/papers/complexity.pdf override_dict["10.1007/978-1-84800-068-1_9"] = {} # otherwise led to https://dea.lib.unideb.hu/dea/bitstream/handle/2437/200488/file_up_KMBT36220140226131332.pdf;jsessionid=FDA9F1A60ACA567330A8B945208E3CA4?sequence=1 override_dict["10.1007/978-3-211-77280-5"] = {} # otherwise led to publisher page but isn't open override_dict["10.1016/j.renene.2015.04.017"] = {} # override old-style webpage override_dict["10.1210/jc.2016-2141"] = { "pdf_url": "https://academic.oup.com/jcem/article-lookup/doi/10.1210/jc.2016-2141", "host_type_set": "publisher", "version": "publishedVersion", } # not indexing this location yet, from @rickypo override_dict["10.1207/s15327957pspr0203_4"] = { "pdf_url": "http://www2.psych.ubc.ca/~schaller/528Readings/Kerr1998.pdf", "version": "submittedVersion" } # mentioned in world bank as good unpaywall example override_dict["10.3386/w23298"] = { "pdf_url": "https://economics.mit.edu/files/12774", "version": "submittedVersion" } # from email, has bad citesserx cached version override_dict["10.1007/bf02693740"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.536.6939&rep=rep1&type=pdf", "version": "publishedVersion" } # from email, has bad citesserx cached version override_dict["10.1126/science.1150952"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.168.3796&rep=rep1&type=pdf", "version": "submittedVersion", "host_type_set": "repository" } # from email, has bad citesserx cached version override_dict["10.1515/eqc.2007.295"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.543.7752&rep=rep1&type=pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1038/nature21377"] = { "pdf_url": "http://eprints.whiterose.ac.uk/112179/1/ppnature21377_Dodd_for%20Symplectic.pdf", "version": "submittedVersion" } # from email override_dict["10.1016/j.gtc.2016.09.007"] = { "pdf_url": 
"https://cora.ucc.ie/bitstream/handle/10468/3544/Quigley_Chapter.pdf?sequence=1&isAllowed=y", "version": "acceptedVersion" } # stephen hawking's thesis override_dict["10.17863/cam.11283"] = { "pdf_url": "https://www.repository.cam.ac.uk/bitstream/handle/1810/251038/PR-PHD-05437_CUDL2017-reduced.pdf?sequence=15&isAllowed=y", "version": "publishedVersion" } # from email override_dict["10.1152/advan.00040.2005"] = { "pdf_url": "https://www.physiology.org/doi/pdf/10.1152/advan.00040.2005", "version": "publishedVersion" } # from email override_dict["10.1016/j.chemosphere.2014.07.047"] = { "pdf_url": "https://manuscript.elsevier.com/S0045653514009102/pdf/S0045653514009102.pdf", "version": "submittedVersion" } # from email override_dict["10.4324/9780203900956"] = {} # from email override_dict["10.3810/psm.2010.04.1767"] = { "pdf_url": "http://cupola.gettysburg.edu/cgi/viewcontent.cgi?article=1014&context=healthfac", "version": "publishedVersion" } # from email override_dict["10.1016/S0140-6736(17)33308-1"] = { "pdf_url": "https://www.rug.nl/research/portal/files/64097453/Author_s_version_Gonadotrophins_versus_clomiphene_citrate_with_or_without_intrauterine_insemination_in_women.pdf", "version": "acceptedVersion", "host_type_set": "repository" } # from email override_dict["10.1093/joclec/nhy009"] = { "pdf_url": "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3126848", "host_type_set": "repository" } # from email override_dict["10.1038/s41477-017-0019-3"] = { "pdf_url": "https://www.repository.cam.ac.uk/bitstream/handle/1810/270235/3383_1_merged_1502805167.pdf?sequence=1&isAllowed=y", "version": "acceptedVersion", "host_type_set": "repository" } # from email override_dict["10.1029/wr015i006p01633"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.475.497&rep=rep1&type=pdf", "version": "publishedVersion" } # from email, zenodo override_dict["10.1080/01650521.2018.1460931"] = { "metadata_url": "https://zenodo.org/record/1236622", "host_type_set": "repository", "version": "acceptedVersion" } # from email override_dict["10.3928/01477447-20150804-53"] = {} # from twitter override_dict["10.1103/physreva.97.013421"] = { "pdf_url": "https://arxiv.org/pdf/1711.10074.pdf", "version": "submittedVersion" } # from email override_dict["10.1016/j.amjmed.2005.09.031"] = { "pdf_url": "https://www.amjmed.com/article/S0002-9343(05)00885-5/pdf", "version": "publishedVersion" } # from email override_dict["10.1080/15348458.2017.1327816"] = {} # from chorus override_dict["10.1103/physrevd.94.052011"] = { "pdf_url": "https://link.aps.org/accepted/10.1103/PhysRevD.94.052011", "version": "acceptedVersion", } override_dict["10.1063/1.4962501"] = { "pdf_url": "https://aip.scitation.org/doi/am-pdf/10.1063/1.4962501", "version": "acceptedVersion", "host_type_set": "repository" } # from email, broken citeseer link override_dict["10.2202/1949-6605.1908"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.535.9289&rep=rep1&type=pdf", "version": "publishedVersion" } # from email override_dict["10.1561/1500000012"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.174.8814&rep=rep1&type=pdf", "version": "publishedVersion" } # from email override_dict["10.1137/s0036142902418680"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.144.7627&rep=rep1&type=pdf", "version": "publishedVersion" } # from email override_dict["10.1088/1741-2552/aab4e4"] = { "pdf_url": "http://iopscience.iop.org/article/10.1088/1741-2552/aab4e4/pdf", "version": 
"publishedVersion" } # from email override_dict["10.1145/1031607.1031615"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.540.8125&rep=rep1&type=pdf", "version": "publishedVersion" } # from email override_dict["10.1007/s11227-016-1779-7"] = { "pdf_url": "https://hcl.ucd.ie/system/files/TJS-Hasanov-2016.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1016/s0020-0190(03)00351-x"] = { "pdf_url": "https://kam.mff.cuni.cz/~kolman/papers/noteb.ps", "version": "submittedVersion", "host_type_set": "repository" } # from email override_dict["10.1002/14651858.cd001704.pub4"] = { "pdf_url": "https://core.ac.uk/download/pdf/9440822.pdf", "version": "submittedVersion", "host_type_set": "repository" } # from email override_dict["10.1016/j.tetlet.2015.04.131"] = { "pdf_url": "https://www.sciencedirect.com/sdfe/pdf/download/read/aam/noindex/pii/S0040403915007881", "version": "acceptedVersion", "host_type_set": "publisher" } # from email override_dict["10.1016/j.nima.2016.04.104"] = { "pdf_url": "http://cds.cern.ch/record/2239750/files/1-s2.0-S0168900216303400-main.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1016/s1470-2045(15)00444-1"] = { "pdf_url": "https://www.statsarecool.com/data/uploads/journal-articles/who_declares_reds_meat_carcinogeniclancet_oct_2015.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1056/NEJM199406233302502"] = { "pdf_url": "https://www.nejm.org/doi/full/10.1056/NEJM199406233302502", "version": "publishedVersion", "host_type_set": "publisher" } # from email override_dict["10.1056/NEJMra1201534"] = { "pdf_url": "https://www.nejm.org/doi/pdf/10.1056/NEJMra1201534", "version": "publishedVersion", "host_type_set": "publisher" } # from email override_dict["10.1016/j.cmet.2018.03.012"] = { "pdf_url": "https://www.biorxiv.org/content/biorxiv/early/2018/01/15/245332.full.pdf", "version": "submittedVersion", "host_type_set": "repository" } # from email override_dict["10.1093/sf/65.1.1"] = { "pdf_url": "https://faculty.washington.edu/charles/new%20PUBS/A52.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1088/1751-8121/aabd9c"] = {} # from email override_dict["10.1017/CBO9781139173728.002"] = {} # from email override_dict["10.2174/97816810846711170101"] = {} # from email override_dict["10.1177/1354066196002003001"] = {} # from email override_dict["10.1093/bioinformatics/bty721"] = {} # from email override_dict["10.1088/1361-6528/aac7a4"] = {} # from email override_dict["10.1088/1361-6528/aac645"] = {} # from email override_dict["10.1111/1748-8583.12159"] = {} # from email override_dict["10.1042/BJ20080963"] = {} # from email override_dict["10.1136/bmj.j5007"] = {} # from email override_dict["10.1016/j.phrs.2017.12.007"] = {} # from email override_dict["10.4324/9781315770185"] = {} # from email override_dict["10.1108/PIJPSM-02-2016-0019"] = {} # from email override_dict["10.1016/j.ejca.2017.07.015"] = {} # from email override_dict["10.1080/14655187.2017.1469322"] = {} # from email override_dict["10.1080/02684527.2017.1407549"] = {} # from email override_dict["10.1093/jat/bky025"] = {} # from email override_dict["10.1016/j.midw.2009.07.004"] = {} # from email override_dict["10.1177/247553031521a00105"] = {} # from email override_dict["10.1002/0471445428"] = {} # from email override_dict["10.1007/978-3-642-31232-8"] = {} # ticket 267 
override_dict["10.1016/j.anucene.2014.08.021"] = {} # ticket 199 # pdf has embedded password protection override_dict["10.22381/rcp1720184"] = {} # ticket 256 # journal in doaj but article not available override_dict["10.1016/j.mattod.2018.03.001"] = {} # ticket 277 # pmh record with spurious title: oai:works.swarthmore.edu:fac-psychology-1039 override_dict["10.1016/j.actpsy.2010.01.009"] = {} # ticket 280 # green scrape gets overexcited about a .doc link override_dict["10.1108/09596111211217932"] = {} # ticket 279 # match to wrong pdf, currently suppressed incorrectly by bad pdf check override_dict["10.1238/physica.topical.102a00059"] = {} # from email override_dict["10.1016/S0022-1996(00)00093-3"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.475.3874&rep=rep1&type=pdf", "version": "submittedVersion", "host_type_set": "repository" } # from email override_dict["10.1177/088840649401700203"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1014.8577&rep=rep1&type=pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.7326/L18-0139"] = { "pdf_url": "http://annals.org/data/journals/aim/936928/aime201804170-l180139.pdf", "version": "publishedVersion", "host_type_set": "publisher" } # from email override_dict["10.1007/978-3-319-48881-3_55"] = { "pdf_url": "http://liu.diva-portal.org/smash/get/diva2:1063949/FULLTEXT01.pdf", "version": "acceptedVersion", "host_type_set": "repository" } # from email override_dict["10.1109/ICCVW.2015.86"] = { "pdf_url": "http://liu.diva-portal.org/smash/get/diva2:917646/FULLTEXT01", "version": "acceptedVersion", "host_type_set": "repository" } # from email override_dict["10.1126/science.aap9559"] = { "pdf_url": "http://vermontcomplexsystems.org/share/papershredder/vosoughi2018a.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1109/tpds.2012.97"] = { "pdf_url": "https://www.cnsr.ictas.vt.edu/publication/06171175.pdf", "version": "publishedVersion", "host_type_set": "repository" } # ticket 261 # crossref metadata points to wrong article override_dict["10.4149/BLL_2013_058"] = { "pdf_url": "http://www.elis.sk/download_file.php?product_id=3759&session_id=lnkeo437s8hv5t0r28g6ku93b0", "version": "publishedVersion", "host_type_set": "publisher" } # ticket 200 # we forgot to say the magic word override_dict["10.1007/s11465-016-0392-z"] = { "pdf_url": "https://cora.ucc.ie/bitstream/10468/4112/1/2986.pdf?&isAllowed=y", "version": "publishedVersion", "host_type_set": "repository" } # the use of this is counting on the doi keys being lowercase/cannonical response = {} for k, v in override_dict.iteritems(): response[clean_doi(k)] = v return response
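Callers would then consult the overrides by cleaned DOI, matching the response[clean_doi(k)] = v normalization at the end of get_overrides_dict. A hypothetical lookup might look like this (dirty_doi is a placeholder variable):

overrides = get_overrides_dict()
my_override = overrides.get(clean_doi(dirty_doi), None)
if my_override is not None:
    # an empty dict appears to mean "suppress the bad location we would otherwise report"
    pdf_url = my_override.get("pdf_url")
    version = my_override.get("version")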
def call_pmh_endpoint(self, first=None, last=None, chunk_size=10, scrape=False): args = {} args['metadataPrefix'] = 'oai_dc' if "citeseerx" in self.pmh_url: proxy_url = os.getenv("STATIC_IP_PROXY") proxies = {"https": proxy_url, "http": proxy_url} else: proxies = {} my_sickle = MySickle(self.pmh_url, proxies=proxies, timeout=120) logger.info(u"connected to sickle with {} {}".format( self.pmh_url, proxies)) args['from'] = first if last: args["until"] = last records_to_save = [] logger.info(u"calling ListRecords with {} {}".format( self.pmh_url, args)) try: pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args) logger.info(u"got pmh_records with {} {}".format( self.pmh_url, args)) pmh_input_record = safe_get_next_record(pmh_records) except Exception as e: logger.info(u"no records with {} {}".format(self.pmh_url, args)) # logger.exception(u"no records with {} {}".format(self.pmh_url, args)) pmh_input_record = None while pmh_input_record: my_pmh_record = pmh_record.PmhRecord() my_pmh_record.id = pmh_input_record.header.identifier my_pmh_record.api_raw = pmh_input_record.raw my_pmh_record.record_timestamp = pmh_input_record.header.datestamp my_pmh_record.title = oai_tag_match("title", pmh_input_record) my_pmh_record.authors = oai_tag_match("creator", pmh_input_record, return_list=True) my_pmh_record.oa = oai_tag_match("oa", pmh_input_record) my_pmh_record.urls = oai_tag_match("identifier", pmh_input_record, return_list=True) for fulltext_url in my_pmh_record.urls: if fulltext_url and (is_doi_url(fulltext_url) or fulltext_url.startswith(u"doi:") or re.findall(u"10\.", fulltext_url)): try: my_pmh_record.doi = clean_doi(fulltext_url) except NoDoiException: pass my_pmh_record.license = oai_tag_match("rights", pmh_input_record) my_pmh_record.relations = oai_tag_match("relation", pmh_input_record, return_list=True) my_pmh_record.sources = oai_tag_match("collname", pmh_input_record, return_list=True) my_pmh_record.source = self.id if is_complete(my_pmh_record): db.session.merge(my_pmh_record) my_pages = my_pmh_record.mint_pages() logger.info(u"made {} pages for id {}".format( len(my_pages), my_pmh_record.id)) for my_page in my_pages: if scrape: logger.info(u"scraping pages") my_page.scrape() db.session.merge(my_page) records_to_save.append(my_pmh_record) # logger.info(u":") logger.info(u"my_pmh_record {}".format( my_pmh_record.get_good_urls())) else: logger.info(u"not complete") if len(records_to_save) >= chunk_size: last_record = records_to_save[-1] logger.info(u"last record saved: {} for {}".format( last_record.id, self.id)) safe_commit(db) records_to_save = [] pmh_input_record = safe_get_next_record(pmh_records) # make sure to get the last ones if records_to_save: last_record = records_to_save[-1] logger.info( u"saving {} last ones, last record saved: {} for {}".format( len(records_to_save), last_record.id, self.id)) safe_commit(db) logger.info(u"done everything for {}".format(self.id))
def worker_run(self, **kwargs):
    single_obj_id = kwargs.get("id", None)
    chunk = kwargs.get("chunk", 100)
    limit = kwargs.get("limit", 10)
    run_class = Pub
    run_method = kwargs.get("method")

    if single_obj_id:
        limit = 1
        queue_table = None
    elif run_method == "refresh":
        queue_table = "pub_refresh_queue"
        if not limit:
            limit = 1000
        text_query_pattern = """
            with refresh_queue as (
                select id
                from {queue_table}
                where started is null
                order by priority desc, finished nulls first, started, rand
                limit {chunk}
                for update skip locked
            )
            update {queue_table} queue_rows_to_update
            set started = now()
            from refresh_queue
            where refresh_queue.id = queue_rows_to_update.id
            returning refresh_queue.id;"""
        text_query = text_query_pattern.format(chunk=chunk, queue_table=queue_table)
        logger.info(u"the queue query is:\n{}".format(text_query))
    else:
        queue_table = "pub_queue"
        if not limit:
            limit = 1000
        text_query_pattern = """
            WITH update_pub_queue AS (
                SELECT id
                FROM {queue_table}
                WHERE started is null
                ORDER BY finished asc nulls first
                LIMIT {chunk}
                FOR UPDATE SKIP LOCKED
            )
            UPDATE {queue_table} queue_rows_to_update
            SET started=now()
            FROM update_pub_queue
            WHERE update_pub_queue.id = queue_rows_to_update.id
            RETURNING update_pub_queue.id;"""
        text_query = text_query_pattern.format(limit=limit, chunk=chunk, queue_table=queue_table)
        logger.info(u"the queue query is:\n{}".format(text_query))

    index = 0
    start_time = time()

    while True:
        new_loop_start_time = time()

        if single_obj_id:
            single_obj_id = clean_doi(single_obj_id)
            objects = [run_class.query.filter(run_class.id == single_obj_id).first()]
        else:
            logger.info(u"looking for new jobs")

            job_time = time()
            row_list = db.engine.execute(text(text_query).execution_options(autocommit=True)).fetchall()
            object_ids = [row[0] for row in row_list]
            logger.info(u"got ids, took {} seconds".format(elapsed(job_time)))

            job_time = time()
            q = db.session.query(Pub).options(orm.undefer('*')).filter(Pub.id.in_(object_ids))
            objects = q.all()
            logger.info(u"got pub objects in {} seconds".format(elapsed(job_time)))

            # shuffle them or they sort by doi order
            random.shuffle(objects)

            # objects = Pub.query.from_statement(text(text_query)).execution_options(autocommit=True).all()
            # objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all()
            # id_rows = db.engine.execute(text(text_query)).fetchall()
            # ids = [row[0] for row in id_rows]
            #
            # job_time = time()
            # objects = run_class.query.filter(run_class.id.in_(ids)).all()
            # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(job_time)))

        if not objects:
            # logger.info(u"sleeping for 5 seconds, then going again")
            sleep(5)
            continue

        object_ids = [obj.id for obj in objects]
        self.update_fn(run_class, run_method, objects, index=index)
        # logger.info(u"finished update_fn")

        if queue_table:
            object_ids_str = u",".join([u"'{}'".format(id.replace(u"'", u"''")) for id in object_ids])
            object_ids_str = object_ids_str.replace(u"%", u"%%")  # sql escaping
            sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                queue_table=queue_table, ids=object_ids_str)
            # logger.info(u"sql command to update finished is: {}".format(sql_command))
            run_sql(db, sql_command)
            # logger.info(u"finished run_sql")
        # otherwise finished is set in update_fn

        index += 1
        if single_obj_id:
            return
        else:
            self.print_update(new_loop_start_time, chunk, limit, start_time, index)
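
# A standalone sketch (hypothetical table name and connection string, not
# project code) of the claim-and-mark pattern the queue queries above rely on:
# FOR UPDATE SKIP LOCKED lets several workers pull disjoint batches from the
# same table without blocking each other, and RETURNING hands back exactly the
# ids this worker claimed. Uses SQLAlchemy 1.x-style parameter binding.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://localhost/example_db")  # assumed DSN

claim_sql = text("""
    WITH claimed AS (
        SELECT id FROM work_queue
        WHERE started IS NULL
        ORDER BY finished NULLS FIRST
        LIMIT :chunk
        FOR UPDATE SKIP LOCKED
    )
    UPDATE work_queue queue_rows
    SET started = now()
    FROM claimed
    WHERE claimed.id = queue_rows.id
    RETURNING claimed.id;
""")

with engine.begin() as connection:
    claimed_ids = [row[0] for row in connection.execute(claim_sql, chunk=10)]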
def get_chorus_data(starting_offset=0, agency_id=None):
    requests_session = requests.Session()
    retries = Retry(total=10, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    requests_session.mount('http://', DelayedAdapter(max_retries=retries))
    requests_session.mount('https://', DelayedAdapter(max_retries=retries))

    agencies = get_chorus_agencies()
    for agency in agencies:
        if agency_id:
            if int(agency["Agency_Id"]) != int(agency_id):
                print "skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"])
                continue

        if starting_offset:
            offset = starting_offset
        else:
            offset = 0

        logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"]))

        url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}"
        limit = 50

        total_results = None
        while total_results is None or offset < total_results:
            loop_start = time()
            url = url_template.format(agency_id=agency["Agency_Id"], offset=offset, limit=limit)
            print url

            try:
                r = requests_session.get(url, timeout=360)  # wait for up to 6 minutes
            except Exception as e:
                logger.exception(u"Exception: {}, skipping".format(unicode(e.message).encode("utf-8")))
                r = None

            print u"api call elapsed: {} seconds".format(elapsed(loop_start, 1))
            offset += limit

            if r:
                data = r.json()
                total_results = data["total_results"]
                logger.info(u"Has {} total results, {} remaining".format(
                    total_results, total_results - offset))

                items = data["items"]
                new_objects = []
                for item in items:
                    if item["DOI"]:
                        doi = clean_doi(item["DOI"])
                        new_objects.append(Chorus(id=doi, raw=item))

                ids_already_in_db = [
                    id_tuple[0] for id_tuple in
                    db.session.query(Chorus.id).filter(Chorus.id.in_([obj.id for obj in new_objects])).all()
                ]
                objects_to_add_to_db = [obj for obj in new_objects if obj.id not in ids_already_in_db]
                if objects_to_add_to_db:
                    logger.info(u"adding {} items".format(len(objects_to_add_to_db)))
                    db.session.add_all(objects_to_add_to_db)
                    safe_commit(db)
                else:
                    logger.info(u"all of these items already in db")

            logger.info(u"sleeping for 2 seconds")
            sleep(2)
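
# A minimal sketch of the same retry/backoff setup using the stock HTTPAdapter
# instead of the project's DelayedAdapter. The URL here is a placeholder, and
# the exact sleep between retries depends on the installed urllib3 version.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=10, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session.mount("http://", HTTPAdapter(max_retries=retries))
session.mount("https://", HTTPAdapter(max_retries=retries))

# transient 5xx responses are retried with exponential backoff before this returns
response = session.get("https://example.org/api/items?limit=50&offset=0", timeout=60)
print(response.status_code)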