Example #1
0
def get_file_list(compressed_file, compressed_file_path, client_id):
    """
    Get the list of files to be processed from a compressed file.

    Extracts the archive into ``compressed_file_path``, collects the names
    of extracted files that start with 'doc.', and copies any '.log' files
    into the client's log directory.

    :param compressed_file: file containing file list to be uncompressed
    :param compressed_file_path: location of the file in string form
    :param client_id: the id of the client that did the job
    :return: tuple of (list of 'doc.' file names, compressed_file_path)
    """
    client_path = config.server_read_value(
        'client path') + 'client-logs/' + client_id + '/'
    # Context manager closes the archive instead of leaking the handle
    with zipfile.ZipFile(compressed_file, 'r') as files:
        files.extractall(compressed_file_path)

    file_list = os.listdir(compressed_file_path)

    final_list = []
    for file in file_list:
        if file.startswith('doc.'):
            final_list.append(file)
        elif file.endswith('.log'):
            # exist_ok collapses the original's duplicated if/else copy
            # branches into a single code path
            os.makedirs(client_path, exist_ok=True)
            shutil.copy(compressed_file_path + file, client_path)
    return final_list, compressed_file_path
Example #2
0
def test_process_docs():
    """Processing a docs job should queue one item and clear progress."""
    database = make_database()
    job = {
        'job_id': '1',
        'type': 'docs',
        'data': [[{'id': 'AHRQ_FRDOC_0001-0037', 'count': 1}]],
        'client_id': 'Alex',
        'version': '0.0.0',
    }
    serialized = json.dumps(job)
    database.add_to_progress(serialized)
    archive = PATH + 'Archive.zip'

    dsf.process_docs(
        database, json.loads(serialized), archive,
        config.server_read_value('regulations path') + 'regulations-data/')
    assert len(database.get_all_items_in_queue()) == 1
    assert len(database.get_all_items_in_progress()) == 0
Example #3
0
def save_client_log(client_id, compressed_file):
    """
    Extract a client's returned archive and copy any '.log' files it
    contains into that client's log directory.

    :param client_id: id of the client whose log files are being saved
    :param compressed_file: zip archive returned by the client
    :return: None
    """
    logger.warning('ms/docs_filter/save_client_log: function called')
    client_path = config.server_read_value(
        'client path') + 'client-logs/' + client_id + '/'

    temp_directory = tempfile.mkdtemp()
    temp_directory_path = str(temp_directory + '/')

    # Context manager closes the archive instead of leaking the handle
    with zipfile.ZipFile(compressed_file, 'r') as files:
        files.extractall(temp_directory_path)

    try:
        # Walk every extracted file and keep only the client logs
        for file in os.listdir(temp_directory_path):
            if file.endswith('.log'):
                logger.warning('ms/docs_filter/save_client_log: found file, ' +
                               str(file) + ', that ends with log')
                # exist_ok collapses the original's duplicated if/else
                # copy/log branches into one code path
                os.makedirs(client_path, exist_ok=True)
                shutil.copy(temp_directory_path + file, client_path)
                logger.warning('ms/docs_filter/save_client_log: '
                               'saving log to client-logs directory')
    finally:
        # The original leaked the temporary directory on every call
        shutil.rmtree(temp_directory, ignore_errors=True)
Example #4
0
def test_doc_job_return_doc_saved(mock_dir, client):
    """A returned doc is saved to disk and its job removed from progress."""
    manager = RedisManager()
    job = {
        "data": [{"id": "AAAA-AAAA-0001-0001", "count": 1}],
        "version": "v0.5",
        "type": "doc",
        "job_id": "1234",
    }
    manager.add_to_queue(json.dumps(job))
    client.get('/get_work?client_id=asdf')

    payload = {
        'file': open(PATH + "test_single_doc.zip", 'rb'),
        'json': json.dumps({
            'job_id': "1234",
            'type': 'doc',
            'client_id': "abcd",
            "version": "0.5",
        }),
    }
    client.post("/return_doc", data=payload)
    assert len(manager.get_all_items_in_queue()) == 0
    assert get_count_of_doc() == 1
    shutil.rmtree(config.server_read_value('regulations path'))
    assert manager.does_job_exist_in_progress('1234') is False
Example #5
0
def monolith():
    """
    Generate docs-download jobs and push them onto the Redis work queue.

    Reads the total regulations document count from the API, then walks
    the result pages (1000 documents per page), batching up to 1000 page
    URLs into each job written to the queue. Kept as one monolithic
    function so an error can simply ``return`` instead of calling
    sys.exit(), which could affect whatever calls this script.

    :return: 0 on API failure, otherwise None
    """
    url_base = 'https://api.data.gov/regulations/v3/documents.json?rpp=1000'
    r = redis_manager.RedisManager()
    regulations_key = config.server_read_value('api key')
    current_page = 0

    if regulations_key != '':
        # Gets number of documents available to download
        try:
            url = \
                'https://api.data.gov/regulations/v3/documents.json?api_key=' \
                + regulations_key + '&countsOnly=1'
            record_count = \
                api_manager.api_call_manager(url).json()['totalNumRecords']
        except api_manager.CallFailException:
            # Fixed typo in log message: 'occured' -> 'occurred'
            logger.error('Error occurred with API request')
            print('Error occurred with docs_work_gen regulations API request.')
            return 0

        # Gets the max page we'll go to; each page is 1000 documents
        max_page_hit = record_count // 1000

        # This loop generates lists of URLs, sending out a job and
        # writing them to the work server every 1000 URLs.
        # It will stop and send whatever's left if we hit the max page limit.
        while current_page < max_page_hit:
            url_list = []
            for i in range(1000):
                current_page += 1
                # NOTE(review): the first offset generated is po=1000, so
                # documents 0-999 are never requested -- confirm this is
                # intentional before changing.
                url_full = url_base + '&po=' + str(current_page * 1000)

                url_list.append(url_full)

                if current_page == max_page_hit:
                    break

            # Makes a JSON from the list of URLs and send
            # it to the queue as a job
            docs_work = [
                ''.join(
                    random.choices(string.ascii_letters + string.digits,
                                   k=16)), 'docs', url_list
            ]
            r.add_to_queue(endpoints.generate_json(docs_work))
    else:
        print('No API Key!')
Example #6
0
def get_count_of_doc():
    """
    Count the regulation document files stored on disk.

    Walks the fixed four-level layout
    ``regulations-data/<name>/<org>/<docket>/<file>`` and counts every
    entry at the fourth level that is a regular file.

    :return: number of document files found
    """
    # Read the configured base path once instead of re-reading it on
    # every iteration of four nested loops, as the original did
    base = config.server_read_value('regulations path') + 'regulations-data/'
    count = 0
    for name in os.listdir(base):
        for org in os.listdir(base + name):
            # 'docket' avoids shadowing the builtin `id`
            for docket in os.listdir(base + name + '/' + org):
                doc_dir = base + name + '/' + org + '/' + docket
                for entry in os.listdir(doc_dir):
                    if os.path.isfile(doc_dir + '/' + entry):
                        count += 1
    return count
Example #7
0
def test_doc_client():
    """Run the doc client end-to-end against fully mocked HTTP endpoints."""
    job_id = ''.join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(16))

    work_url = ('http://' + config.client_read_value('ip') + ':' +
                config.client_read_value('port') +
                '/get_work?client_id=' +
                config.client_read_value('client id'))
    doc_url = 'https://api.data.gov/regulations/v3/document?documentId=TEST-1-0'
    download_url = doc_url + '&attachmentNumber=0&contentType=pdf'
    health_url = 'https://hc-ping.com/457a1034-83d4-4a62-8b69-c71060db3a08'

    job_payload = {
        'job_id': job_id,
        'type': 'doc',
        'data': [{
            'id': 'TEST-1-0',
            'count': 1
        }],
        'version': version
    }
    document_body = json.dumps({'fileFormats': [download_url]})

    with req_mock() as m, \
            patch('requests.post') as p:

        m.get(work_url, status_code=200, text=json.dumps(job_payload))
        m.get(client_add_api_key(doc_url), status_code=200,
              text=document_body)
        m.get(client_add_api_key(download_url),
              status_code=200,
              body=open(
                  config.server_read_value('regulations path') +
                  'TEST-1-0.pdf'))
        m.get(health_url, status_code=200)

        do_work()

        extract_dir = str(tempfile.mkdtemp()) + '/'

        archive = zipfile.ZipFile(
            BytesIO(p.call_args[1]['files']['file'][1].read()), 'r')
        archive.extractall(extract_dir)
        extracted = os.listdir(extract_dir)

        assert 'mirrulations.log' in extracted
        assert 'doc.TEST-1-0.json' in extracted
        assert 'doc.TEST-1-0.pdf' in extracted
Example #8
0
def return_doc():
    """
    The endpoint the client calls to return documents
    they received from the individual regulations doc calls
    :return: Returns a string saying successful so
             the client knows the call was successful
    """
    try:
        uploaded = request.files['file'].read()
        json_info = request.form['json']
    except Exception:
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    save_path = config.server_read_value('regulations path') + \
        'regulations-data/'
    process_doc(redis_server(), json.loads(json_info),
                io.BytesIO(uploaded), save_path)
    return 'Successful!'
Example #9
0
def return_docs():
    """
    The endpoint the client calls to return the
    document ids received from the regulations docs calls
    :return: Returns a string saying successful so
             the client knows the call was successful
    """
    try:
        json_info = request.form['json']
        files = request.files['file'].read()
    except Exception:
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    # Removed the old "if json_info is None" branch: it was unreachable.
    # request.form['json'] raises (caught above) when the key is missing
    # and otherwise always yields a string, never None.
    files = io.BytesIO(files)
    process_docs(redis_server(), json.loads(json_info), files,
                 config.server_read_value('regulations path') +
                 'regulations-data/')
    return 'Successful!'
Example #10
0
def test_check_document_exists():
    """Documents already saved on disk are filtered out of the workload."""
    workload = generate_json_data(PATH + '1_workfile_2_documents.json')
    save_path = config.server_read_value('regulations path') + \
        'regulations-data/'
    filtered = dsf.check_document_exists(workload, save_path)
    assert filtered['data'] == [[{'id': 'AHRQ_FRDOC_0001-0037', 'count': 1}]]