def download_attachments(dirpath, doc_json, documentId):
    """
    Download the other attachments for the document
    :param dirpath: path to the directory where the download will be saved
    :param doc_json: the json from a single document api call
    :param documentId: the string of a documentId
    :return: the total number of requests used to download the extra attachments
    """
    total_requests = 0
    try:
        extra_attachments = doc_json["attachments"]
        total_requests += len(extra_attachments)
        for attachment in extra_attachments:
            attachment_formats = attachment["fileFormats"]
            for a_format in attachment_formats:
                # Each format entry is a download URL containing a
                # "contentType" query parameter; slice out the value that
                # follows "contentType=" (12 characters)
                here = str(a_format).index("contentType") + 12
                file_type = str(a_format)[here:]
                result = api_call_manager(add_api_key(str(a_format)))
                download_document(dirpath, documentId, result, file_type)
    except KeyError:
        # Document has no extra attachments
        pass
    except CallFailException:
        logger.error('Error - Call failed')
    return total_requests
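# A minimal sketch of the contentType slicing used above, with a made-up
# attachment URL (not a real API response). Note this slice only yields the
# bare type when contentType is the last query parameter in the URL.
a_format = ("https://api.data.gov/regulations/v3/download"
            "?documentId=EPA-HQ-OAR-0001&contentType=pdf")
here = a_format.index("contentType") + 12  # len("contentType=") == 12
print(a_format[here:])                     # -> "pdf"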
def do_work():
    """
    Working loop
    Get work - Determine type of work - Do work - Return work
    If there is no work in the server, sleep for an hour
    :return:
    """
    while True:
        try:
            work = get_work(client_id)
            requests.get(client_health_url)
            work_json = json.loads(work.content.decode('utf-8'))
        except man.CallFailException:
            time.sleep(3600)
            continue  # work_json was never set; go back and get work again
        if work_json["type"] == "doc":
            r = return_doc(work_json, client_id)
            requests.get(client_health_url)
        elif work_json["type"] == "docs":
            r = return_docs(work_json, client_id)
            requests.get(client_health_url)
        elif work_json["type"] == "none":
            time.sleep(3600)
            requests.get(client_health_url)
        else:
            logger.error('Job type unexpected')
            requests.get(client_health_url + "/fail")
def documents_processor(urls, job_id, client_id):
    """
    Call each url in the list, process the results of the calls and then
    form a json file to send back the results
    :param urls: list of urls that have to be called
    :param job_id: the id of the job that is being worked on currently
    :param client_id: id of the client calling this function
    :return result: the json to be returned to the server after each call is processed
    """
    global workfiles
    workfiles = []
    for url in urls:
        try:
            result = api_call_manager(add_api_key(url))
            process_results(result)
        except Exception:
            logger.error('Error - URL processing failed')
    result = json.loads(json.dumps({
        "job_id": job_id,
        "type": "docs",
        "data": workfiles,
        "client_id": str(client_id),
        "version": version
    }))
    return result
def document_processor(doc_ids):
    """
    This process takes all of the document ids given to it and saves all of
    the data for the documents in a temporary directory.
    :param doc_ids: list of document ids that have to be collected.
    :return: temporary directory that data was written to.
    """
    dirpath = tempfile.TemporaryDirectory()
    for doc_id in doc_ids:
        try:
            result = api_call_manager(add_api_key(make_doc_url(doc_id)))
            total = get_extra_documents(result, dirpath.name, doc_id)
        except CallFailException:
            logger.error('Doc ID error')
    # Return the TemporaryDirectory object itself (not just its name) so the
    # directory is not cleaned up before the caller has read its contents
    return dirpath
def api_call_manager(url):
    """
    If there were no errors in making an API call, get the result
    If a temporary error occurred, sleep for 5 minutes and try again.
    Do this 50 times, and if it continues to fail, raise a CallFailException
    If a permanent error occurs, raise a CallFailException
    If the user's ApiCount is zero, sleep for one hour to refresh the calls
    :param url: the url that will be used to make the API call
    :return: returns the resulting information of the documents
    """
    pause = 0
    while pause < 51:
        try:
            result = call(url)
            return result
        except TemporaryException:
            logger.error('Error: waiting 5 minutes...')
            time.sleep(300)
            pause += 1
        except PermanentException:
            logger.error('Error with the API call')
            break
        except ApiCountZeroException:
            logger.error('Error: ran out of API calls')
            time.sleep(3600)
    logger.error('API call failed...')
    raise CallFailException
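# A minimal usage sketch for api_call_manager, assuming call() returns a
# requests.Response (as process_results' use of result.text suggests) and
# that add_api_key is available in this module. The URL is illustrative.
url = add_api_key("https://api.data.gov/regulations/v3/documents.json?rpp=1000")
try:
    response = api_call_manager(url)
    print(response.text)
except CallFailException:
    # Raised after the temporary-error retries are exhausted,
    # or on a permanent error
    print("Giving up on this URL")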
def return_doc():
    """
    The endpoint the client calls to return documents they received from the
    individual regulations doc calls
    :return: Returns a string saying successful so the client knows the call was successful
    """
    try:
        files = request.files['file'].read()
        json_info = request.form['json']
    except KeyError:
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    files = io.BytesIO(files)
    process_doc(redis_server(), json.loads(json_info), files)
    return 'Successful!'
def monolith():
    """
    Runs the script. This is one monolithic function (aptly named), as the
    script just needs to be run; however, there is a certain point where I
    need to break out of the program if an error occurs, and I wasn't sure
    how exactly sys.exit() would work and whether or not it would mess with
    things outside of / calling this script, so I just made one giant method
    so I can return when needed.
    :return:
    """
    url_base = "https://api.data.gov/regulations/v3/documents.json?rpp=1000"
    r = redis_manager.RedisManager(redis.Redis())
    regulations_key = config.read_value('key')
    current_page = 0

    if regulations_key != "":
        # Gets number of documents available to download
        try:
            record_count = requests.get(
                "https://api.data.gov/regulations/v3/documents.json?api_key="
                + regulations_key + "&countsOnly=1").json()["totalNumRecords"]
        except Exception:
            logger.error('Error occurred with API request')
            print("Error occurred with docs_work_gen regulations API request.")
            return 0

        # Gets the max page we'll go to; each page is 1000 documents
        max_page_hit = record_count // 1000

        # This loop generates lists of URLs, sending out a job and writing
        # them to the work server every 1000 URLs. It will stop and send
        # whatever's left if we hit the max page limit.
        while current_page < max_page_hit:
            url_list = []
            for i in range(1000):
                current_page += 1
                url_full = url_base + "&po=" + str(current_page * 1000)
                url_list.append(url_full)
                if current_page == max_page_hit:
                    break

            # Makes a JSON from the list of URLs and sends it to the queue as a job
            docs_work = [
                ''.join(random.choices(string.ascii_letters + string.digits, k=16)),
                "docs",
                url_list
            ]
            r.add_to_queue(endpoints.generate_json(docs_work))
    else:
        print("No API Key!")
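# A worked sketch of the pagination math above, with a made-up record_count.
# Each page holds rpp=1000 documents, the page offset is po = page * 1000,
# and max_page_hit = record_count // 1000 caps how many pages are requested.
# Note the loop increments current_page before computing the offset, so the
# first offset generated is po=1000.
record_count = 4500
max_page_hit = record_count // 1000      # 4
for page in range(1, max_page_hit + 1):
    print("&po=" + str(page * 1000))     # po=1000, 2000, 3000, 4000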
def process_results(result):
    """
    Loads the json from the results of the api call
    Gets the list of documents from the json
    Creates a new json that contains the documents returned from each api call
    :param result: Result of the api call
    :return: returns True if the processing completed successfully
    """
    docs_json = json.loads(result.text)
    try:
        doc_list = docs_json["documents"]
        work = make_docs(doc_list)
    except TypeError:
        logger.error('Error - bad JSON')
    return True
def return_docs():
    """
    The endpoint the client calls to return the document ids received from
    the regulations docs calls
    :return: Returns a string saying successful so the client knows the call was successful
    """
    try:
        json_info = request.form['json']
        files = request.files['file'].read()
    except KeyError:
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    if json_info is None:
        logger.error('Error - could not post docs')
        return 'Bad Parameter', 400
    files = io.BytesIO(files)
    process_docs(redis_server(), json.loads(json_info), files)
    return 'Successful!'
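# A minimal client-side sketch for posting results back to return_docs.
# The server URL, route, and payload shape are assumptions for illustration;
# the form field names 'json' and 'file' come from the endpoint above.
import io
import json
import requests

payload = {"job_id": "abc123", "client_id": "1", "data": []}  # illustrative
resp = requests.post(
    "http://localhost:8080/return_docs",                      # assumed route
    data={"json": json.dumps(payload)},
    files={"file": io.BytesIO(b"zipped results here")})
print(resp.status_code, resp.text)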
def get_work():
    """
    Endpoint the user will use to get work from the queue
    client_id will be one of the parameters given for logging purposes
    :return: Returns the json containing the job_id, the type of work to be
             done, the work that needs to be done, and the version number
    """
    logger.warning("Successful API Call: %s", 'get_work: get_work')
    if len(request.args) != 1:
        logger.error('Error - number of parameters incorrect')
        return 'Parameter Missing', 400
    client_id = request.args.get('client_id')
    if client_id is None:
        logger.warning("Exception: %s",
                       'get_work: BadParameterException, client id was none')
        logger.error('Error - no client ID')
        return 'Bad Parameter', 400
    json_info = redis_server().get_work()
    return json.dumps(json_info)
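# A minimal client-side sketch for requesting work from get_work. The server
# URL and route are assumptions; client_id must be the only query parameter,
# since the endpoint above rejects requests with any other parameter count.
import requests

resp = requests.get("http://localhost:8080/get_work",  # assumed route
                    params={"client_id": "1"})
if resp.status_code == 200:
    job = resp.json()  # contains job_id, type, data, and version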
def read_value(value):
    """
    Reads a value from the configuration JSON file.
    :param value: Value to be read from the JSON
    :return: Value read from the JSON
    """
    try:
        configurationpath = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), "../../config.json")
        with open(configurationpath, "r") as config_file:
            contents = json.loads(config_file.read())
        result = contents[value]
    except FileNotFoundError:
        logger.error('File Not Found Error')
        return None
    except IOError:
        logger.error('Input/Output Error')
        return None
    except json.JSONDecodeError:
        logger.error('JSON Decode Error')
        return None
    except KeyError:
        logger.error('Key Error')
        return None
    else:
        return result
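# A minimal usage sketch for read_value. The 'key' entry is the one monolith()
# reads above; the config.json contents shown are illustrative.
#
#   config.json: {"key": "DEMO_KEY"}
regulations_key = read_value('key')
if regulations_key is None:
    print("Missing or unreadable config.json")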
class PermanentException(Exception):
    # Class name inferred from the matching log message in api_call_manager;
    # the original snippet showed only the __init__ body.
    def __init__(self):
        logger.error('Error with the API call')
class ApiCountZeroException(Exception):
    # Class name inferred from the matching log message in api_call_manager;
    # the original snippet showed only the __init__ body.
    def __init__(self):
        logger.error('Error - ran out of API calls')
class TemporaryException(Exception):
    # Class name is a guess: it is the remaining exception type used by
    # api_call_manager, and connection errors are typically retried there.
    # The original snippet showed only the __init__ body.
    def __init__(self):
        logger.error('Error connecting to API')