def simple_query_tool():
    """Look up a batch of DOIs and email the results to the requester.

    Expects a JSON request body with:
      - "dois": a list of raw DOI strings (empties are dropped, dupes collapsed)
      - "email": the address to send the result files to

    Cached responses are read from the pub table; DOIs we don't have get
    placeholder responses. Results are written to output.jsonl and
    output.csv, both of which are emailed to the requester.

    Returns a JSON acknowledgement with the email address and all DOIs.
    """
    body = request.json

    # de-duplicate and drop empty entries from the submitted DOIs
    dirty_dois_list = {d for d in body["dois"] if d}
    clean_dois = [
        c for c
        in [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list]
        if c
    ]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(clean_dois))
    rows = q.all()

    # skip rows whose response_jsonb is NULL in the database
    pub_responses = [row[0] for row in rows if row[0]]
    pub_dois = [r['doi'] for r in pub_responses]

    # anything the lookup didn't return gets a placeholder response
    missing_dois = [
        d for d in dirty_dois_list
        if clean_doi(d, return_none_if_error=True) not in pub_dois
    ]
    placeholder_responses = [
        pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois
    ]
    responses = pub_responses + placeholder_responses

    # save jsonl
    with open("output.jsonl", 'wb') as f:
        for response_jsonb in responses:
            f.write(json.dumps(response_jsonb, sort_keys=True))
            f.write("\n")

    # save csv
    csv_dicts = [
        pub.csv_dict_from_response_dict(my_dict) for my_dict in responses
    ]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    # guard: an empty result set (all DOIs invalid) used to IndexError on [0];
    # fall back to a header-only file
    if csv_dicts:
        fieldnames = sorted(csv_dicts[0].keys())
        # "doi" always goes in the first column
        fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    else:
        fieldnames = ["doi"]
    with open("output.csv", 'wb') as f:
        writer = unicodecsv.DictWriter(f, fieldnames=fieldnames,
                                       dialect='excel')
        writer.writeheader()
        for my_dict in csv_dicts:
            writer.writerow(my_dict)

    # prep email
    email_address = body["email"]
    email = create_email(email_address,
                         "Your Unpaywall results",
                         "simple_query_tool",
                         {"profile": {}},
                         ["output.csv", "output.jsonl"])
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
def add_pubs_from_dois(dois):
    """Fetch Crossref metadata for each DOI and insert the resulting pubs.

    Returns whatever subset of pubs add_new_pubs reports as actually added.
    """
    pending = []
    for doi in dois:
        record = build_new_pub(doi, get_api_for_one_doi(doi))
        # hack so it gets updated soon
        record.updated = datetime.datetime(1042, 1, 1)
        pending.append(record)
    return add_new_pubs(pending)
def simple_query_tool():
    """Look up a batch of DOIs and email the results to the requester.

    Expects a JSON request body with:
      - "dois": a list of raw DOI strings (empties are dropped, dupes collapsed)
      - "email": the address to send the result files to

    Cached responses are read from the pub table; DOIs we don't have get
    placeholder responses. Results are written to output.jsonl and
    output.csv, both of which are emailed to the requester.

    Returns a JSON acknowledgement with the email address and all DOIs.

    NOTE(review): this duplicates another simple_query_tool definition in
    this file; only one binding survives at import time.
    """
    body = request.json

    # de-duplicate and drop empty entries from the submitted DOIs
    dirty_dois_list = {d for d in body["dois"] if d}
    clean_dois = [
        c for c
        in [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list]
        if c
    ]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(clean_dois))
    rows = q.all()

    # fix: filter NULL response_jsonb rows -- a None entry would crash
    # r['doi'] below (the other versions of this function already filter)
    pub_responses = [row[0] for row in rows if row[0]]
    pub_dois = [r['doi'] for r in pub_responses]

    # anything the lookup didn't return gets a placeholder response
    missing_dois = [
        d for d in dirty_dois_list
        if clean_doi(d, return_none_if_error=True) not in pub_dois
    ]
    placeholder_responses = [
        pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois
    ]
    responses = pub_responses + placeholder_responses

    # save jsonl
    with open("output.jsonl", 'wb') as f:
        for response_jsonb in responses:
            f.write(json.dumps(response_jsonb, sort_keys=True))
            f.write("\n")

    # save csv
    csv_dicts = [
        pub.csv_dict_from_response_dict(my_dict) for my_dict in responses
    ]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    # guard against an empty result set so we don't IndexError on [0]
    if csv_dicts:
        fieldnames = sorted(csv_dicts[0].keys())
        # "doi" always goes in the first column
        fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    else:
        fieldnames = ["doi"]
    with open("output.csv", 'wb') as f:
        writer = unicodecsv.DictWriter(f, fieldnames=fieldnames,
                                       dialect='excel')
        writer.writeheader()
        for my_dict in csv_dicts:
            writer.writerow(my_dict)

    # prep email
    email_address = body["email"]
    email = create_email(email_address,
                         "Your Unpaywall results",
                         "simple_query_tool",
                         {"profile": {}},
                         ["output.csv", "output.jsonl"])
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
def simple_query_tool():
    """Look up a batch of DOIs and email the results in the requested formats.

    Expects a JSON request body with:
      - "dois": a list of raw DOI strings (empties are dropped, dupes collapsed)
      - "email": the address to send the result files to
      - "formats" (optional): any of "jsonl", "csv", "xlsx";
        defaults to ["jsonl", "csv"]

    Cached responses are read from the pub table; DOIs we don't have get
    placeholder responses. One output file per requested format is written
    and emailed to the requester.

    Returns a JSON acknowledgement with the email address and all DOIs.
    """
    body = request.json

    # de-duplicate and drop empty entries from the submitted DOIs
    dirty_dois_list = {d for d in body["dois"] if d}
    clean_dois = [
        c for c
        in [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list]
        if c
    ]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(clean_dois))
    rows = q.all()

    # skip rows whose response_jsonb is NULL in the database
    pub_responses = [row[0] for row in rows if row[0]]
    pub_dois = [r['doi'] for r in pub_responses]

    # anything the lookup didn't return gets a placeholder response
    missing_dois = [
        d for d in dirty_dois_list
        if clean_doi(d, return_none_if_error=True) not in pub_dois
    ]
    placeholder_responses = [
        pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois
    ]
    responses = pub_responses + placeholder_responses

    formats = body.get("formats", []) or ["jsonl", "csv"]
    files = []

    if "jsonl" in formats:
        # save jsonl
        with open("output.jsonl", 'wb') as f:
            for response_jsonb in responses:
                f.write(json.dumps(response_jsonb, sort_keys=True))
                f.write("\n")
        files.append("output.jsonl")

    if "csv" in formats or "xlsx" in formats:
        # fix: only do the row-dict/fieldname work when a tabular format was
        # requested (it used to run unconditionally, and IndexError'd on an
        # empty result set even for jsonl-only requests)
        csv_dicts = [
            pub.csv_dict_from_response_dict(my_dict) for my_dict in responses
        ]
        csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
        if csv_dicts:
            fieldnames = sorted(csv_dicts[0].keys())
            # "doi" always goes in the first column
            fieldnames = ["doi"] + [
                name for name in fieldnames if name != "doi"
            ]
        else:
            fieldnames = ["doi"]

    if "csv" in formats:
        # save csv
        with open("output.csv", 'wb') as f:
            writer = unicodecsv.DictWriter(f, fieldnames=fieldnames,
                                           dialect='excel')
            writer.writeheader()
            for my_dict in csv_dicts:
                writer.writerow(my_dict)
        files.append("output.csv")

    if "xlsx" in formats:
        book = Workbook()
        sheet = book.worksheets[0]
        sheet.title = "results"

        # header row, then one row per response (openpyxl cells are 1-based)
        for col_idx, field_name in enumerate(fieldnames):
            sheet.cell(column=col_idx+1, row=1, value=field_name)
        for row_idx, row in enumerate(csv_dicts):
            for col_idx, field_name in enumerate(fieldnames):
                # assumes every csv_dict has the same keys as the first one
                sheet.cell(column=col_idx+1, row=row_idx+2,
                           value=row[field_name])

        book.save(filename="output.xlsx")
        files.append("output.xlsx")

    # prep email
    email_address = body["email"]
    email = create_email(email_address,
                         "Your Unpaywall results",
                         "simple_query_tool",
                         {"profile": {}},
                         files)
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
def get_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000, get_updates=False):
    """Page through the Crossref works API and insert/update pubs.

    Parameters:
        query_doi: if set, fetch exactly this one DOI and ignore the date args.
        first, last: date range (datetime.date) for the Crossref filter;
            `last` may be None to query open-ended (which is faster).
        today / week: convenience flags that overwrite first/last with a
            ~2-day or ~8-day window ending tomorrow.
        offset_days: shifts both first and last back by this many days.
        chunk_size: Crossref page size (`rows` parameter).
        get_updates: if True, filter by index date and upsert via
            add_pubs_or_update_crossref; otherwise filter by created date
            and insert via add_new_pubs.

    Pubs are flushed to the database in batches of 100 while paging, plus a
    final flush after the loop. Returns None; progress goes to the logger.
    """
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"
    # "indexed" catches metadata updates; "created" only catches new DOIs,
    # per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    if get_updates:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first},until-index-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first}&rows={chunk}&cursor={next_cursor}"
    else:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"

    # "*" starts a fresh Crossref deep-page cursor
    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    # today/week overwrite any passed-in dates; `last` is tomorrow so that
    # everything up to "now" is included
    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=7))
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=2))

    if not first:
        first = datetime.date(2016, 4, 1)

    # shift the window back; `x and ...` leaves None untouched
    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()
    insert_pub_fn = add_pubs_or_update_crossref if get_updates else add_new_pubs

    while has_more_responses:
        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # query is much faster if don't have a last specified, even if it is far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()
        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            # NOTE(review): resp=None keeps has_more_responses True, so a
            # persistent non-200 retries this URL forever -- confirm intended
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                # cursor must be url-escaped before being put back in the url
                next_cursor = quote(next_cursor)

            # an empty page or a missing cursor means we've reached the end
            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                # flush to the database every 100 pubs
                if len(pubs_this_chunk) >= 100:
                    added_pubs = insert_pub_fn(pubs_this_chunk)
                    logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)
                    pubs_this_chunk = []

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = insert_pub_fn(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
def get_new_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000):
    """Page through the Crossref works API and insert newly created pubs.

    Parameters:
        query_doi: if set, fetch exactly this one DOI and ignore the date args.
        first, last: date range (datetime.date) for the created-date filter;
            `last` may be None to query open-ended (which is faster).
        today / week: convenience flags that overwrite first/last with a
            ~2-day or ~8-day window ending tomorrow.
        offset_days: shifts both first and last back by this many days.
        chunk_size: Crossref page size (`rows` parameter).

    Pubs are flushed via add_new_pubs in batches of 100 while paging, plus
    a final flush after the loop. Returns None; progress goes to the logger.
    NOTE(review): this file also defines a get_dois_and_data_from_crossref
    with a get_updates flag that subsumes this behavior.
    """
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
    root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"
    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    # but if want all changes, use "indexed" not "created" as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}"
    # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}"

    # "*" starts a fresh Crossref deep-page cursor
    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    # today/week overwrite any passed-in dates; `last` is tomorrow so that
    # everything up to "now" is included
    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=7))
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=2))

    if not first:
        first = datetime.date(2016, 4, 1)

    # shift the window back; `x and ...` leaves None untouched
    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()

    while has_more_responses:
        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # query is much faster if don't have a last specified, even if it is far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()
        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            # NOTE(review): resp=None keeps has_more_responses True, so a
            # persistent non-200 retries this URL forever -- confirm intended
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                # cursor must be url-escaped before being put back in the url
                next_cursor = quote(next_cursor)

            # an empty page or a missing cursor means we've reached the end
            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                # flush to the database every 100 pubs
                if len(pubs_this_chunk) >= 100:
                    added_pubs = add_new_pubs(pubs_this_chunk)
                    logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)
                    # if new_pubs:
                    #     id_links = ["http://api.oadoi.org/v2/{}".format(my_pub.id) for my_pub in new_pubs[0:5]]
                    #     logger.info(u"last few ids were {}".format(id_links))
                    pubs_this_chunk = []

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = add_new_pubs(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
def get_new_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000):
    """Page through the Crossref works API and insert newly created pubs.

    Parameters:
        query_doi: if set, fetch exactly this one DOI and ignore the date args.
        first, last: created-date range as ISO-format strings ("YYYY-MM-DD");
            `last` may be None to query open-ended (which is faster).
        today / week: convenience flags that overwrite first/last with a
            ~2-day or ~8-day window ending tomorrow.
        chunk_size: Crossref page size (`rows` parameter).

    Pubs are flushed via add_new_pubs in batches of 100 while paging, plus
    a final flush after the loop. Returns None; progress goes to the logger.

    NOTE(review): this duplicates another get_new_dois_and_data_from_crossref
    definition in this file; only one binding survives at import time.
    """
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {
        "Accept": "application/json",
        "User-Agent": "mailto:[email protected]"
    }

    root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
    root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"
    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    # but if want all changes, use "indexed" not "created" as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}"
    # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}"

    # "*" starts a fresh Crossref deep-page cursor
    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    # today/week overwrite any passed-in dates; `last` is tomorrow so that
    # everything up to "now" is included
    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
        first = (datetime.date.today() - datetime.timedelta(days=7)).isoformat()
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
        first = (datetime.date.today() - datetime.timedelta(days=2)).isoformat()

    if not first:
        first = "2016-04-01"

    start_time = time()

    while has_more_responses:
        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first,
                                                last=last,
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # query is much faster if don't have a last specified, even if it is far in the future
                url = root_url_no_last.format(first=first,
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()
        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(
            elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(
                resp.status_code))
            # NOTE(review): resp=None keeps has_more_responses True, so a
            # persistent non-200 retries this URL forever -- confirm intended
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                # cursor must be url-escaped before being put back in the url
                next_cursor = quote(next_cursor)

            # an empty page or a missing cursor means we've reached the end
            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                # flush to the database every 100 pubs
                if len(pubs_this_chunk) >= 100:
                    added_pubs = add_new_pubs(pubs_this_chunk)
                    logger.info(
                        u"added {} pubs, loop done in {} seconds".format(
                            len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)
                    pubs_this_chunk = []

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = add_new_pubs(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(
        u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
            num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10],
            elapsed(start_time, 2)))