def call_requests_get(url, headers=None, read_timeout=60, connect_timeout=60, stream=False, publisher=None, session_id=None, ask_slowly=False):
    """GET *url* and follow business-logic redirects (via keep_redirecting) up to 5 hops.

    A fresh requests Session is built for every hop; *ask_slowly* controls how
    many urllib3 retries that session performs on 5xx responses.  Returns the
    final requests Response object (encoding defaulted to utf-8 when missing).
    """
    # never use a mutable default argument: a shared {} would leak header
    # state between unrelated calls
    if headers is None:
        headers = {}

    following_redirects = True
    num_redirects = 0
    while following_redirects:
        requests_session = requests.Session()
        if ask_slowly:
            retries = Retry(total=1, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
        else:
            retries = Retry(total=0, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
        requests_session.mount('http://', DelayedAdapter(max_retries=retries))
        requests_session.mount('https://', DelayedAdapter(max_retries=retries))

        if u"citeseerx.ist.psu.edu/" in url:
            # citeseerx is only reachable over https and through the static-IP proxy
            url = url.replace("http://", "https://")
            proxy_url = os.getenv("STATIC_IP_PROXY")
            proxies = {"https": proxy_url, "http": proxy_url}
        else:
            proxies = {}

        # logger.info(u"getting url {}".format(url))
        # NOTE(review): verify=False disables TLS certificate validation --
        # confirm this is a deliberate tradeoff for misconfigured publisher sites
        r = requests_session.get(url,
                                 headers=headers,
                                 timeout=(connect_timeout, read_timeout),
                                 stream=stream,
                                 proxies=proxies,
                                 allow_redirects=True,
                                 verify=False)

        # from http://jakeaustwick.me/extending-the-requests-response-class/
        for method_name, method in inspect.getmembers(RequestWithFileDownload, inspect.ismethod):
            setattr(requests.models.Response, method_name, method.im_func)

        if r and not r.encoding:
            r.encoding = "utf-8"

        # check to see if we actually want to keep redirecting, using business-logic redirect paths
        following_redirects = False
        num_redirects += 1
        if (r.status_code == 200) and (num_redirects < 5):
            redirect_url = keep_redirecting(r, publisher)
            if redirect_url:
                following_redirects = True
                url = redirect_url

    return r
def get_response_page(url):
    """Fetch *url* expecting JSON, with heavy retries and long timeouts."""
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    request_headers = {
        "Accept": "application/json",
        "User-Agent": "mailto:[email protected]",
    }

    session = requests.Session()
    retry_policy = Retry(total=10,
                         backoff_factor=1,
                         status_forcelist=[413, 429, 500, 502, 503, 504])
    for scheme_prefix in ('http://', 'https://'):
        session.mount(scheme_prefix, DelayedAdapter(max_retries=retry_policy))

    return session.get(url, headers=request_headers, timeout=(180, 180))
def get_chorus_data(starting_offset=0, agency_id=None): requests_session = requests.Session() retries = Retry(total=10, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504]) requests_session.mount('http://', DelayedAdapter(max_retries=retries)) requests_session.mount('https://', DelayedAdapter(max_retries=retries)) agencies = get_chorus_agencies() for agency in agencies: if agency_id: if int(agency["Agency_Id"]) != int(agency_id): print "skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"]) continue if starting_offset: offset = starting_offset else: offset = 0 logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"])) url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}" limit = 50 total_results = None while total_results==None or offset < total_results: loop_start = time() url = url_template.format(agency_id=agency["Agency_Id"], offset=offset, limit=limit) print url try: r = requests_session.get(url, timeout=360) # wait for 3 minutes except Exception, e: logger.exception(u"Exception: {}, skipping".format(unicode(e.message).encode("utf-8"))) r = None print u"api call elapsed: {} seconds".format(elapsed(loop_start, 1)) offset += limit if r: data = r.json() total_results = data["total_results"] logger.info(u"Has {} total results, {} remaining".format( total_results, total_results - offset)) items = data["items"] new_objects = [] for item in items: if item["DOI"]: doi = clean_doi(item["DOI"]) new_objects.append(Chorus(id=doi, raw=item)) ids_already_in_db = [id_tuple[0] for id_tuple in db.session.query(Chorus.id).filter(Chorus.id.in_([obj.id for obj in new_objects])).all()] objects_to_add_to_db = [obj for obj in new_objects if obj.id not in ids_already_in_db] if objects_to_add_to_db: logger.info(u"adding {} items".format(len(objects_to_add_to_db))) db.session.add_all(objects_to_add_to_db) safe_commit(db) else: 
logger.info(u"all of these items already in db") logger.info(u"sleeping for 2 seconds") sleep(2)