Beispiel #1
0
def download_attachments(dirpath, doc_json, documentId):
    """
    Download every file format of every extra attachment listed for a document.

    Each entry of ``doc_json["attachments"]`` is expected to hold a
    ``"fileFormats"`` list. Every format is converted to a string, fetched via
    ``api_call_manager(add_api_key(...))`` and the response is passed to
    ``download_document`` together with the text that follows ``contentType``
    (and one separator character) in that string.

    The download is best effort:

    * a missing ``"attachments"`` or ``"fileFormats"`` key silently stops the
      remaining downloads,
    * a ``CallFailException`` is logged and stops the remaining downloads,
    * a format string that does not contain ``contentType`` is logged and
      skipped (previously this raised an uncaught ``ValueError``).

    :param dirpath: path to the directory where the download will be saved
    :param doc_json: the json from a single document api call
    :param documentId: the string of a documentId
    :return: the number of attachments listed in ``doc_json`` (0 when the key
             is absent); it is counted before any download is attempted
    """
    total_requests = 0
    try:
        extra_attachments = doc_json["attachments"]
        total_requests += len(extra_attachments)
        for attachment in extra_attachments:
            for a_format in attachment["fileFormats"]:
                format_url = str(a_format)
                _, marker, remainder = format_url.partition("contentType")
                if not marker:
                    logger.error('Error - attachment URL has no contentType')
                    continue
                # Drop the single character that separates "contentType"
                # from its value (presumably "=").
                content_type = remainder[1:]
                result = api_call_manager(add_api_key(format_url))
                download_document(dirpath, documentId, result, content_type)
    except KeyError:
        # Documents without attachments / file formats are expected.
        pass
    except CallFailException:
        logger.error('Error - Call failed')
    return total_requests
Beispiel #2
0
def do_work():
    """
    Working loop: runs forever and never returns.

    Each iteration asks the server for work with ``get_work(client_id)``,
    pings ``client_health_url`` and dispatches on ``work_json["type"]``:

    * ``"doc"``  - handled by ``return_doc``
    * ``"docs"`` - handled by ``return_docs``
    * ``"none"`` - no work available, sleep for an hour
    * anything else is logged and reported to ``client_health_url + "/fail"``

    If fetching the work raises ``man.CallFailException`` the loop sleeps for
    an hour and starts over; it must not fall through to the dispatch, because
    no (or only a stale) ``work_json`` exists at that point.

    :return: never returns
    """

    while True:
        try:
            work = get_work(client_id)
            requests.get(client_health_url)
            work_json = json.loads(work.content.decode('utf-8'))
        except man.CallFailException:
            time.sleep(3600)
            # Nothing was fetched this round: retry instead of dispatching on
            # an undefined or previously processed job.
            continue

        work_type = work_json["type"]
        if work_type == "doc":
            return_doc(work_json, client_id)
            requests.get(client_health_url)
        elif work_type == "docs":
            return_docs(work_json, client_id)
            requests.get(client_health_url)
        elif work_type == "none":
            time.sleep(3600)
            requests.get(client_health_url)
        else:
            logger.error('Job type unexpected')
            requests.get(client_health_url + "/fail")
Beispiel #3
0
def documents_processor(urls, job_id, client_id):
    """
    Call each url in the list, process the results of the calls and build the
    json payload that is sent back to the server.

    The module-level ``workfiles`` list is reset at the start of every call and
    its content becomes the ``"data"`` field of the result (presumably it is
    filled while ``process_results`` runs - that code is not part of this
    function). A url whose call or processing fails is logged and skipped so
    that one bad url does not abort the whole job.

    :param urls: list of urls that have to be called
    :param job_id: the id of the job that is being worked on currently
    :param client_id: id of the client calling this function
    :return result: the json to be returned to the server after each call is processed
    """
    global workfiles
    workfiles = []
    for url in urls:
        try:
            result = api_call_manager(add_api_key(url))
            process_results(result)
        except Exception:
            # Deliberately broad (best effort per url), but no longer a bare
            # ``except`` so KeyboardInterrupt / SystemExit still stop the client.
            logger.error('Error - URL processing failed')
    # Round-trip through JSON so the returned structure only contains
    # JSON-compatible types and serialisation problems surface here.
    result = json.loads(
        json.dumps({
            "job_id": job_id,
            "type": "docs",
            "data": workfiles,
            "client_id": str(client_id),
            "version": version
        }))
    return result
Beispiel #4
0
def document_processor(doc_ids):
    """
    Fetch every document in ``doc_ids`` and store its data in a fresh temporary directory.

    For each id the document url is built, called through ``api_call_manager``
    and the response is handed to ``get_extra_documents`` together with the
    directory path. An id whose call raises ``CallFailException`` is logged and
    skipped.

    :param doc_ids: list of document ids that have to be collected.
    :return: the ``tempfile.TemporaryDirectory`` object (not just its path) that
             the data was written to.
    """
    temp_dir = tempfile.TemporaryDirectory()
    save_path = temp_dir.name
    for doc_id in doc_ids:
        try:
            doc_url = add_api_key(make_doc_url(doc_id))
            response = api_call_manager(doc_url)
            get_extra_documents(response, save_path, doc_id)
        except CallFailException:
            logger.error('Doc ID error')
    return temp_dir
Beispiel #5
0
def api_call_manager(url):
    """
    Make an API call, retrying on recoverable errors.

    * Success: the result of ``call(url)`` is returned immediately.
    * ``TemporaryException``: log, sleep 5 minutes and try again. After the
      51st temporary failure the manager gives up.
    * ``PermanentException``: log and give up immediately.
    * ``ApiCountZeroException``: log, sleep one hour and try again; this does
      not count towards the temporary-failure limit.

    :param url: the url that will be used to make the API call
    :return: returns the resulting information of the documents
    :raises CallFailException: when the manager gives up
    """

    temporary_failures = 0
    while temporary_failures <= 50:
        try:
            return call(url)
        except TemporaryException:
            logger.error('Error: waiting 5 minutes...')
            time.sleep(300)
            temporary_failures += 1
        except PermanentException:
            logger.error('Error with the API call')
            break
        except ApiCountZeroException:
            logger.error('Error: ran out of API calls')
            time.sleep(3600)

    # Reached only after a permanent error or too many temporary ones.
    logger.error('API call failed...')
    raise CallFailException
Beispiel #6
0
def return_doc():
    """
    The endpoint the client calls to return documents they received from the individual regulations doc calls.

    Reads the uploaded ``file`` and the ``json`` form field from the current
    request, parses the json and passes both to ``process_doc``. A missing
    field or a ``json`` value that is not valid JSON is answered with
    ``('Bad Parameter', 400)``.

    :return: Returns a string saying successful so the client knows the call was successful
    """

    try:
        uploaded = request.files['file'].read()
        json_info = request.form['json']
    except Exception:
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    try:
        job_info = json.loads(json_info)
    except json.JSONDecodeError:
        # Malformed client data is a client error, not a server crash.
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    process_doc(redis_server(), job_info, io.BytesIO(uploaded))
    return 'Successful!'
Beispiel #7
0
def monolith():
    """
    Generate the "docs" jobs that cover every page of the documents listing.

    The total number of records is requested once; one listing url is then
    built per page of 1000 records (``&po=`` carries the record offset of the
    page, starting at 0) and the urls are queued on the work server in jobs of
    at most 1000 urls each.

    Previously the first offset generated was 1000, so the first page was never
    queued and nothing at all was queued for fewer than 1000 records.

    Kept as a single function so the script can simply ``return`` when the
    record count cannot be fetched.

    :return: 0 if the record count request failed, otherwise None
    """
    url_base = "https://api.data.gov/regulations/v3/documents.json?rpp=1000"
    r = redis_manager.RedisManager(redis.Redis())
    regulations_key = config.read_value('key')

    if regulations_key == "":
        print("No API Key!")
        return

    # Gets number of documents available to download
    try:
        record_count = requests.get(
            "https://api.data.gov/regulations/v3/documents.json?api_key=" +
            regulations_key + "&countsOnly=1").json()["totalNumRecords"]
    except Exception:
        logger.error('Error occured with API request')
        print("Error occurred with docs_work_gen regulations API request.")
        return 0

    page_size = 1000      # records per listing page (matches rpp=1000 above)
    urls_per_job = 1000   # listing urls bundled into one queued job

    # One url per page; a partially filled last page is included, an empty
    # trailing page is not.
    page_urls = [
        url_base + "&po=" + str(offset)
        for offset in range(0, record_count, page_size)
    ]

    for start in range(0, len(page_urls), urls_per_job):
        url_list = page_urls[start:start + urls_per_job]

        # Makes a JSON from the list of URLs and send it to the queue as a job
        job_id = ''.join(
            random.choices(string.ascii_letters + string.digits, k=16))
        docs_work = [job_id, "docs", url_list]
        r.add_to_queue(endpoints.generate_json(docs_work))
Beispiel #8
0
def process_results(result):
    """
    Load the json from the result of an api call and hand its list of
    documents to ``make_docs``.

    A payload that is not shaped as expected - not a mapping, or a mapping
    without a ``"documents"`` entry - is logged as bad JSON instead of raising.
    Text that is not JSON at all still raises from ``json.loads``.

    :param result: Result of the api call; its ``text`` attribute is parsed
    :return: always True, including when the payload was rejected (kept for
             backward compatibility with existing callers)
    """
    docs_json = json.loads(result.text)
    try:
        doc_list = docs_json["documents"]
        make_docs(doc_list)
    except (TypeError, KeyError):
        logger.error('Error - bad JSON')

    return True
Beispiel #9
0
def return_docs():
    """
    The endpoint the client calls to return the document ids received from the regulations docs calls.

    Reads the ``json`` form field and the uploaded ``file`` from the current
    request, parses the json and passes both to ``process_docs``. A missing
    field or a ``json`` value that is not valid JSON is answered with
    ``('Bad Parameter', 400)``.

    :return: Returns a string saying successful so the client knows the call was successful
    """
    try:
        json_info = request.form['json']
        uploaded = request.files['file'].read()
    except Exception:
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    # NOTE(review): a successful form lookup presumably never yields None;
    # the check is kept from the original as a harmless safeguard.
    if json_info is None:
        logger.error('Error - could not post docs')
        return 'Bad Parameter', 400
    try:
        job_info = json.loads(json_info)
    except json.JSONDecodeError:
        # Malformed client data is a client error, not a server crash.
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    process_docs(redis_server(), job_info, io.BytesIO(uploaded))
    return 'Successful!'
Beispiel #10
0
def get_work():
    """
    Endpoint the user will use to get work from the queue.

    The request must carry exactly one query parameter, ``client_id``, which
    is only used for validation/logging. Otherwise an error message with
    status 400 is returned.

    :return: the json-encoded result of ``redis_server().get_work()``, or a
             ``(message, 400)`` tuple when the parameters are wrong
    """
    logger.warning("Successful API Call: %s", 'get_work: get_work')

    query_args = request.args
    if len(query_args) != 1:
        logger.error('Error - number of parameters incorrect')
        return 'Parameter Missing', 400

    if query_args.get('client_id') is None:
        logger.warning("Exception: %s",
                       'get_work: BadParameterException, client id was none')
        logger.error('Error - no client ID')
        return 'Bad Parameter', 400

    return json.dumps(redis_server().get_work())
Beispiel #11
0
def read_value(value):
    """
    Read a single value from the configuration JSON file.

    The file is ``../../config.json`` relative to the directory of this
    module. Every expected failure (file missing, I/O error, invalid JSON,
    unknown key) is logged and reported as ``None``.

    :param value: Key of the value to be read from the JSON
    :return: Value read from the JSON, or None if it could not be read
    """
    configurationpath = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../config.json")
    try:
        # The context manager closes the file even when parsing fails.
        with open(configurationpath, "r") as config_file:
            contents = json.loads(config_file.read())
        result = contents[value]
    except FileNotFoundError:
        logger.error('File Not Found Error')
        return None
    except IOError:
        logger.error('Input/Output Error')
        return None
    except json.JSONDecodeError:
        logger.error('JSON Decode Error')
        return None
    except KeyError:
        logger.error('Key Error')
        return None
    else:
        return result
Beispiel #12
0
 def __init__(self):
     """Log an error-level message as soon as the instance is created."""
     message = 'Error with the API call'
     logger.error(message)
Beispiel #13
0
 def __init__(self):
     """Log an error-level message as soon as the instance is created."""
     message = 'Error - ran out of API calls'
     logger.error(message)
Beispiel #14
0
 def __init__(self):
     """Log an error-level message as soon as the instance is created."""
     message = 'Error connecting to API'
     logger.error(message)