Esempio n. 1
0
def download_attachments(dirpath, doc_json, documentId):
    """
    Download the extra attachments listed for a document.

    :param dirpath: path to the directory where the download will be saved
    :param doc_json: the json from a single document api call
    :param documentId: the string of a documentId
    :return: the total number of requests used to download the
             extra attachments
    """
    total_requests = 0
    try:
        extra_attachments = doc_json['attachments']
        total_requests += len(extra_attachments)
        for attachment in extra_attachments:
            attachment_formats = attachment['fileFormats']
            for a_format in attachment_formats:
                # Throttle between downloads to stay under API rate limits.
                time.sleep(30)
                format_str = str(a_format)  # convert once, not three times
                # The format URL embeds "contentType=<type>"; +12 skips
                # past 'contentType=' to leave just the type suffix.
                here = format_str.index('contentType') + 12
                content_type = format_str[here:]  # renamed: 'type' shadowed the builtin
                result = api_call_manager(client_add_api_key(format_str))
                download_document(dirpath, documentId, result, content_type)
    except KeyError:
        # No 'attachments' (or 'fileFormats') key: nothing extra to fetch.
        pass
    except CallFailException:
        logger.error('Error - API call failed')
    return total_requests
Esempio n. 2
0
def do_work():
    """
    Single pass of the client working loop:
    get work - determine type of work - do work - return work.
    If there is no work on the server (or the call fails), sleep for
    an hour and return.

    :return: None
    """
    ip = config.client_read_value('ip')
    port = config.client_read_value('port')
    client_id = config.client_read_value('client id')

    server_url = 'http://' + ip + ':' + port

    try:
        work = get_work(server_url, client_id)
        requests.get(client_health_url)  # heartbeat: report liveness
        work_json = json.loads(work.content.decode('utf-8'))
    except man.CallFailException:
        # Server unreachable or call failed: back off for an hour.
        time.sleep(3600)
        return None
    if work_json['type'] == 'doc':
        # Return values of return_doc/return_docs were bound to an
        # unused local 'r' before; they are intentionally discarded.
        return_doc(work_json, server_url, client_id)
        requests.get(client_health_url)
    elif work_json['type'] == 'docs':
        return_docs(work_json, server_url, client_id)
        requests.get(client_health_url)
    elif work_json['type'] == 'none':
        # Nothing queued; sleep before the caller polls again.
        time.sleep(3600)
        requests.get(client_health_url)
    else:
        logger.error('Error - Job type unexpected')
        requests.get(client_health_url + '/fail')
Esempio n. 3
0
def documents_processor(urls, job_id, client_id):
    """
    Call each url in the list, process the results of the calls and then
    form the json payload to send the results back to the server.

    :param urls: list of urls that have to be called
    :param job_id: the id of the job that is being worked on currently
    :param client_id: id of the client calling this function
    :return result: the json to be returned to the server after each call
                    is processed
    """
    global workfiles
    workfiles = []
    for url in urls:
        try:
            result = api_call_manager(add_api_key(url))
            process_results(result)
        except Exception:
            # Was a bare 'except:', which also swallowed SystemExit and
            # KeyboardInterrupt; one bad URL still must not kill the job.
            logger.error('Error - URL processing failed')
    # Build the payload directly; the previous json.dumps/json.loads
    # round-trip added nothing (the sibling documents_processor in this
    # codebase already returns the dict directly).
    return {
        "job_id": job_id,
        "type": "docs",
        "data": workfiles,
        "client_id": str(client_id),
        "version": version
    }
Esempio n. 4
0
def do_work():
    """
    Client working loop:
    get work - determine type of work - do work - return work.
    If there is no work on the server, sleep for an hour.

    :return: None (loops forever)
    """
    while True:
        try:
            work = get_work(client_id)
            requests.get(client_health_url)  # heartbeat: report liveness
            work_json = json.loads(work.content.decode('utf-8'))
        except man.CallFailException:
            time.sleep(3600)
            # BUGFIX: without this 'continue', the loop fell through to
            # the dispatch below and read work_json, which is undefined
            # on a first-iteration failure (NameError) or stale on later
            # ones.
            continue
        if work_json["type"] == "doc":
            # Return values were bound to an unused local 'r' before;
            # they are intentionally discarded.
            return_doc(work_json, client_id)
            requests.get(client_health_url)
        elif work_json["type"] == "docs":
            return_docs(work_json, client_id)
            requests.get(client_health_url)
        elif work_json["type"] == "none":
            # Nothing queued; sleep before polling again.
            time.sleep(3600)
            requests.get(client_health_url)
        else:
            logger.error('Job type unexpected')
            requests.get(client_health_url + "/fail")
Esempio n. 5
0
def documents_processor(urls, job_id, client_id):
    """
    Call every url in the list, feed each response through
    process_results, and assemble the json payload that is sent back
    to the server once all calls have been attempted.

    :param urls: list of urls that have to be called
    :param job_id: the id of the job that is being worked on currently
    :param client_id: id of the client calling this function
    :return result: the json to be returned
            to the server after each call is processed
    """
    global workfiles
    workfiles = []
    for target_url in urls:
        try:
            response = api_call_manager(client_add_api_key(target_url))
            process_results(response)
        except Exception:
            # One failed URL must not abort the rest of the batch.
            logger.error('Error - URL processing failed')

    payload = {
        'job_id': job_id,
        'type': 'docs',
        'data': workfiles,
        'client_id': client_id,
        'version': version
    }
    return payload
Esempio n. 6
0
def monolith():
    """
    Generate document-listing jobs and push them onto the Redis queue.

    Reads the regulations API key from config, asks the API how many
    documents exist, then walks the result pages 1000 documents at a
    time, batching up to 1000 page URLs per queued job.  Kept as one
    function on purpose so a fatal API error can simply ``return``
    instead of calling sys.exit() and affecting whatever imports or
    calls this script.

    :return: None on normal completion, 0 on an API failure
    """
    url_base = 'https://api.data.gov/regulations/v3/documents.json?rpp=1000'
    r = redis_manager.RedisManager()
    regulations_key = config.server_read_value('api key')
    current_page = 0

    if regulations_key != '':
        # Gets number of documents available to download
        try:
            url = \
                'https://api.data.gov/regulations/v3/documents.json?api_key=' \
                + regulations_key + '&countsOnly=1'
            record_count = \
                api_manager.api_call_manager(url).json()['totalNumRecords']
        except api_manager.CallFailException:
            # Typo fixed: was 'Error occured with API request'.
            logger.error('Error occurred with API request')
            print('Error occurred with docs_work_gen regulations API request.')
            return 0

        # Gets the max page we'll go to; each page is 1000 documents
        max_page_hit = record_count // 1000

        # This loop generates lists of URLs, sending out a job and
        # writing them to the work server every 1000 URLs.
        # It will stop and send whatever's left if we hit the max page limit.
        while current_page < max_page_hit:
            url_list = []
            for _ in range(1000):  # index was unused
                current_page += 1
                # NOTE(review): the first URL generated here uses
                # po=1000, so documents 0-999 are never requested —
                # looks like an off-by-one; confirm against the API's
                # 'po' (page offset) semantics before changing.
                url_full = url_base + '&po=' + str(current_page * 1000)

                url_list.append(url_full)

                if current_page == max_page_hit:
                    break

            # Makes a JSON from the list of URLs (tagged with a random
            # 16-char job id) and sends it to the queue as a job
            docs_work = [
                ''.join(
                    random.choices(string.ascii_letters + string.digits,
                                   k=16)), 'docs', url_list
            ]
            r.add_to_queue(endpoints.generate_json(docs_work))
    else:
        print('No API Key!')
Esempio n. 7
0
def read_value(value):
    """
    Read a single value from the configuration JSON file.

    :param value: key to be read from the JSON
    :return: the value stored under that key, or None on any error
    """
    try:
        configurationpath = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), "../../config.json")
        # Use a context manager so the handle is always closed; the
        # original open(...).read() leaked the file object.
        with open(configurationpath, "r") as config_file:
            contents = json.loads(config_file.read())
        result = contents[value]
    except FileNotFoundError:
        logger.error('File Not Found Error')
        return None
    except IOError:
        logger.error('Input/Output Error')
        return None
    except json.JSONDecodeError:
        logger.error('JSON Decode Error')
        return None
    except KeyError:
        logger.error('Key Error')
        return None
    else:
        return result
Esempio n. 8
0
def document_processor(doc_ids):
    """
    Collect the data for every given document id into a temporary
    directory.

    :param doc_ids: list of document ids that have to be collected
    :return: the TemporaryDirectory object the data was written to
             (its contents are removed when the object is finalized)
    """
    dirpath = tempfile.TemporaryDirectory()
    for doc_id in doc_ids:
        try:
            result = api_call_manager(add_api_key(make_doc_url(doc_id)))
            # Return value (request count) was bound to an unused local
            # 'total' before; it is intentionally discarded here.
            get_extra_documents(result, dirpath.name, doc_id)
        except CallFailException:
            # One failing id must not abort the rest of the batch.
            logger.error('Doc ID error')
    return dirpath
Esempio n. 9
0
def return_doc():
    """
    The endpoint the client calls to return documents they received
    from the individual regulations doc calls.

    :return: 'Successful!' so the client knows the call worked, or
             ('Bad Parameter', 400) when the file/json form fields are
             missing or unreadable
    """
    try:
        files = request.files['file'].read()
        json_info = request.form['json']
    except Exception:
        # Was a bare 'except:', which also swallowed SystemExit and
        # KeyboardInterrupt; the 400 response behavior is unchanged.
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    files = io.BytesIO(files)
    process_doc(redis_server(), json.loads(json_info), files)
    return 'Successful!'
Esempio n. 10
0
def process_results(result):
    """
    Load the json from the results of the api call, get its list of
    documents, and hand them to make_docs.

    :param result: result of the api call (a response object with .text)
    :return: True if the processing completed successfully, False if the
             json was malformed
    """
    docs_json = json.loads(result.text)
    try:
        # KeyError added to the handler: well-formed json with no
        # 'documents' key previously escaped and crashed the caller.
        doc_list = docs_json["documents"]
        make_docs(doc_list)  # return value was bound to an unused local
    except (TypeError, KeyError):
        logger.error('Error - bad JSON')
        # BUGFIX: the original fell through and returned True even after
        # logging a failure, contradicting the documented contract.
        return False
    return True
Esempio n. 11
0
def return_docs():
    """
    The endpoint the client calls to return the document ids received
    from the regulations docs calls.

    :return: 'Successful!' so the client knows the call worked, or
             ('Bad Parameter', 400) when the form fields are missing or
             unreadable
    """
    try:
        json_info = request.form['json']
        files = request.files['file'].read()
    except Exception:
        # Was a bare 'except:', which also swallowed SystemExit and
        # KeyboardInterrupt; the 400 response behavior is unchanged.
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    if json_info is None:
        # NOTE(review): request.form['json'] presumably raises on a
        # missing key, making this branch defensive rather than
        # reachable — confirm against the web framework in use.
        logger.error('Error - could not post docs')
        return 'Bad Parameter', 400
    files = io.BytesIO(files)
    process_docs(redis_server(), json.loads(json_info), files)
    return 'Successful!'
Esempio n. 12
0
def get_work():
    """
    Endpoint the user will use to get work from the queue.
    client_id is supplied as the single query parameter, purely for
    logging purposes.

    :return: json containing the job_id, the type of work to be done,
             the work that needs to be done, and the version number;
             or a 400 tuple when the parameters are wrong
    """
    logger.warning("Successful API Call: %s", 'get_work: get_work')

    # Exactly one query parameter is expected.
    if len(request.args) != 1:
        logger.error('Error - number of parameters incorrect')
        return 'Parameter Missing', 400

    requester = request.args.get('client_id')
    if requester is None:
        logger.warning("Exception: %s", 'get_work: BadParameterException, client id was none')
        logger.error('Error - no client ID')
        return 'Bad Parameter', 400

    work_item = redis_server().get_work()
    return json.dumps(work_item)
Esempio n. 13
0
def api_call_manager(url):
    """
    Repeatedly attempt an API call until it succeeds or fails for good.

    On a TemporaryException, log, sleep five minutes and retry — up to
    51 attempts in total.  On a PermanentException, stop retrying at
    once.  On ApiCountZeroException, sleep an hour so the user's call
    budget can refresh (this does not count against the retry limit).
    If no attempt succeeds, raise CallFailException.

    :param url: the url that will be used to make the API call
    :return: the resulting information of the documents
    :raises CallFailException: when every attempt has failed
    """
    failures = 0
    while failures < 51:
        try:
            return call(url)
        except TemporaryException:
            logger.error('API call Error, waiting 5 minutes')
            time.sleep(300)
            failures += 1
        except PermanentException:
            logger.error('API call Error')
            break
        except ApiCountZeroException:
            logger.warning('API calls exhausted')
            time.sleep(3600)
    logger.error('API call failed')
    raise CallFailException
Esempio n. 14
0
def download_doc_formats(dirpath, doc_json, documentId):
    """
    Download the other file formats listed for a document.

    :param dirpath: path to the directory where the download will be saved
    :param doc_json: the json from a single document api call
    :param documentId: the string of a documentId
    :return: the total number of requests used to download the extra formats
    """
    total_requests = 0
    try:
        extra_formats = doc_json["fileFormats"]
        total_requests += len(extra_formats)
        for extra_doc in extra_formats:
            result = api_call_manager(add_api_key(str(extra_doc)))
            # The format URL embeds "contentType=<type>"; +12 skips
            # past 'contentType=' to leave just the type suffix.
            here = extra_doc.index("contentType") + 12
            content_type = extra_doc[here:]  # renamed: 'type' shadowed the builtin
            download_document(dirpath, documentId, result, content_type)
    except KeyError:
        # No "fileFormats" key: nothing extra to download.
        pass
    except CallFailException:
        logger.error('Error - Call failed')
    return total_requests
Esempio n. 15
0
def read_value(value, string, config_path):
    """
    Read a single value from an INI-style configuration file.

    :param value: option name to be read
    :param string: section name the option lives under
    :param config_path: path of the configuration file to read
    :return: the value read, or None on any error
    """
    try:
        config = ConfigParser()
        # NOTE(review): ConfigParser.read() silently skips a missing
        # file (no FileNotFoundError), so a bad path surfaces as a
        # KeyError on the section lookup below.
        config.read(config_path)
        result = config[string][value]
    except FileNotFoundError:
        logger.error('Error - File Not Found')
        return None
    except IOError:
        logger.error('Error - Invalid Input/Output')
        return None
    except KeyError:
        # Was 'API Key Error', which wrongly implied an API-key problem
        # for any missing section/option (the JSON read_value in this
        # codebase logs plain key errors).
        logger.error('Error - Key Not Found')
        return None
    else:
        return result
Esempio n. 16
0
 def __init__(self):
     # NOTE(review): the enclosing class header is outside this view;
     # presumably an exception type that logs when constructed, so a
     # trace remains even if the handler swallows the raise — confirm.
     logger.error('Error connecting to API')
Esempio n. 17
0
 def __init__(self):
     # NOTE(review): class header not visible here; presumably a
     # call-failure exception type that logs at construction time.
     logger.error('Error - API call failed')
Esempio n. 18
0
 def __init__(self):
     # NOTE(review): class header not visible here; presumably the
     # api-count-exhausted exception type, logging at construction.
     logger.error('Error - ran out of API calls')
Esempio n. 19
0
 def __init__(self):
     # NOTE(review): class header not visible here; presumably a
     # generic API-error exception type, logging at construction.
     logger.error('Error with the API call')