def get_file_list(compressed_file, compressed_file_path, client_id):
    """
    Get the list of files to be processed from a compressed file
    :param compressed_file: file containing the file list to be uncompressed
    :param compressed_file_path: location of the file in string form
    :param client_id: the id of the client that did the job
    :return: A tuple of the 'doc.*' file names in the compressed file
             and the path they were extracted to
    """
    client_path = config.server_read_value(
        'client path') + 'client-logs/' + client_id + '/'
    files = zipfile.ZipFile(compressed_file, 'r')
    files.extractall(compressed_file_path)
    file_list = os.listdir(compressed_file_path)
    final_list = []
    for file in file_list:
        if file.startswith('doc.'):
            final_list.append(file)
        elif file.endswith('.log'):
            # Copy client logs to the client-logs directory,
            # creating it if it does not yet exist
            if not os.path.exists(client_path):
                os.makedirs(client_path)
            shutil.copy(compressed_file_path + file, client_path)
    return final_list, compressed_file_path
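# A minimal usage sketch for get_file_list; the archive name, extraction
# directory, and client id below are illustrative assumptions, not fixtures
# from this repo. Note the trailing slash on the extraction path: the
# function concatenates paths with '+'.
def example_get_file_list():
    doc_files, extract_path = get_file_list(
        'Archive.zip', '/tmp/extract/', 'client1')
    for name in doc_files:
        print(name)  # only entries beginning with 'doc.' are returned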
def test_process_docs():
    redis_server = make_database()
    json_data = json.dumps({
        'job_id': '1',
        'type': 'docs',
        'data': [[{
            "id": "AHRQ_FRDOC_0001-0037",
            "count": 1
        }]],
        'client_id': 'Alex',
        'version': '0.0.0'
    })
    redis_server.add_to_progress(json_data)
    json_data = json.loads(json_data)
    compressed_file = PATH + 'Archive.zip'
    dsf.process_docs(
        redis_server, json_data, compressed_file,
        config.server_read_value('regulations path') + 'regulations-data/')
    queue = redis_server.get_all_items_in_queue()
    progress = redis_server.get_all_items_in_progress()
    assert len(queue) == 1
    assert len(progress) == 0
def save_client_log(client_id, compressed_file):
    """
    Save any log files in a client's returned archive to the
    client-logs directory
    :param client_id: the id of the client that did the job
    :param compressed_file: the zip archive returned by the client
    :return:
    """
    logger.warning('ms/docs_filter/save_client_log: function called')
    client_path = config.server_read_value(
        'client path') + 'client-logs/' + client_id + '/'
    files = zipfile.ZipFile(compressed_file, 'r')
    temp_directory = tempfile.mkdtemp()
    temp_directory_path = str(temp_directory + '/')
    files.extractall(temp_directory_path)
    # Create a list of all the files in the directory
    file_list = os.listdir(temp_directory_path)
    for file in file_list:
        if file.endswith('.log'):
            logger.warning('ms/docs_filter/save_client_log: found file, ' +
                           str(file) + ', that ends with log')
            if not os.path.exists(client_path):
                os.makedirs(client_path)
            shutil.copy(temp_directory_path + file, client_path)
            logger.warning('ms/docs_filter/save_client_log: '
                           'saving log to client-logs directory')
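# save_client_log leaves its mkdtemp() directory behind; below is a hedged
# sketch of the same extract-and-copy pattern with cleanup, using only the
# standard library (copy_logs is a hypothetical helper, not part of this
# module):
def copy_logs(zip_path, dest_dir):
    temp_dir = tempfile.mkdtemp()
    try:
        zipfile.ZipFile(zip_path, 'r').extractall(temp_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        for name in os.listdir(temp_dir):
            if name.endswith('.log'):
                shutil.copy(os.path.join(temp_dir, name), dest_dir)
    finally:
        # Remove the temporary extraction directory when done
        shutil.rmtree(temp_dir)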
def test_doc_job_return_doc_saved(mock_dir, client):
    rm = RedisManager()
    rm.add_to_queue(
        json.dumps({
            "data": [{
                "id": "AAAA-AAAA-0001-0001",
                "count": 1
            }],
            "version": "v0.5",
            "type": "doc",
            "job_id": "1234"
        }))
    # Retrieve the queued job as a client would before returning the result
    client.get('/get_work?client_id=asdf')
    client.post("/return_doc", data={
        'file': open(PATH + "test_single_doc.zip", 'rb'),
        'json': json.dumps({
            'job_id': "1234",
            'type': 'doc',
            'client_id': "abcd",
            "version": "0.5"
        })
    })
    assert len(rm.get_all_items_in_queue()) == 0
    count = get_count_of_doc()
    assert count == 1
    shutil.rmtree(config.server_read_value('regulations path'))
    assert rm.does_job_exist_in_progress('1234') is False
def monolith():
    """
    Runs the script. This is one monolithic function (aptly named):
    the script just needs to run start to finish, but it must be able
    to bail out if an error occurs, and returning from a single
    function avoids the side effects that sys.exit() could have on
    any code calling this script.
    :return:
    """
    url_base = 'https://api.data.gov/regulations/v3/documents.json?rpp=1000'
    r = redis_manager.RedisManager()
    regulations_key = config.server_read_value('api key')
    current_page = 0

    if regulations_key != '':
        # Gets the number of documents available to download
        try:
            url = \
                'https://api.data.gov/regulations/v3/documents.json?api_key=' \
                + regulations_key + '&countsOnly=1'
            record_count = \
                api_manager.api_call_manager(url).json()['totalNumRecords']
        except api_manager.CallFailException:
            logger.error('Error occurred with API request')
            print('Error occurred with docs_work_gen regulations API request.')
            return 0

        # Gets the max page we'll go to; each page is 1000 documents
        max_page_hit = record_count // 1000

        # This loop generates lists of URLs, sending out a job and
        # writing them to the work server every 1000 URLs.
        # It will stop and send whatever's left if we hit the max page limit.
        while current_page < max_page_hit:
            url_list = []
            for _ in range(1000):
                current_page += 1
                url_full = url_base + '&po=' + str(current_page * 1000)
                url_list.append(url_full)
                if current_page == max_page_hit:
                    break

            # Makes a JSON from the list of URLs and sends
            # it to the queue as a job
            docs_work = [
                ''.join(
                    random.choices(string.ascii_letters + string.digits,
                                   k=16)),
                'docs',
                url_list
            ]
            r.add_to_queue(endpoints.generate_json(docs_work))
    else:
        print('No API Key!')
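# Illustrative check of the offset arithmetic above (record_count here is
# made up): with rpp=1000, 2,500 records give max_page_hit == 2 and the
# loop emits po=1000 and po=2000; note that as written it never emits po=0.
def example_offsets():
    record_count = 2500
    max_page_hit = record_count // 1000
    offsets = [page * 1000 for page in range(1, max_page_hit + 1)]
    assert offsets == [1000, 2000]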
def get_count_of_doc():
    base_path = config.server_read_value(
        'regulations path') + 'regulations-data/'
    count = 0
    for name in os.listdir(base_path):
        for org in os.listdir(base_path + name):
            for doc_id in os.listdir(base_path + name + '/' + org):
                doc_path = base_path + name + '/' + org + '/' + doc_id + '/'
                for num in os.listdir(doc_path):
                    if os.path.isfile(doc_path + num):
                        count += 1
    return count
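# An equivalent count via os.walk, assuming regular files only ever appear
# at the leaf (name/org/id/num) level of regulations-data/, as
# get_count_of_doc expects (count_files is a hypothetical helper):
def count_files(base_path):
    return sum(len(filenames) for _, _, filenames in os.walk(base_path))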
def test_doc_client():
    job_id = ''.join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(16))
    server_url = 'http://' \
                 + config.client_read_value('ip') \
                 + ':' \
                 + config.client_read_value('port') \
                 + '/get_work?client_id=' \
                 + config.client_read_value('client id')
    doc_url = 'https://api.data.gov/regulations/v3/document?documentId=TEST-1-0'
    doc_url_with_download = doc_url + '&attachmentNumber=0&contentType=pdf'
    client_health_url = \
        'https://hc-ping.com/457a1034-83d4-4a62-8b69-c71060db3a08'
    server_dict = {
        'job_id': job_id,
        'type': 'doc',
        'data': [{
            'id': 'TEST-1-0',
            'count': 1
        }],
        'version': version
    }
    doc_text = json.dumps({'fileFormats': [doc_url_with_download]})
    with req_mock() as m, \
            patch('requests.post') as p:
        m.get(server_url, status_code=200, text=json.dumps(server_dict))
        m.get(client_add_api_key(doc_url), status_code=200, text=doc_text)
        # The PDF must be opened in binary mode to be served as a body
        m.get(client_add_api_key(doc_url_with_download),
              status_code=200,
              body=open(
                  config.server_read_value('regulations path')
                  + 'TEST-1-0.pdf', 'rb'))
        m.get(client_health_url, status_code=200)
        do_work()
        temp_directory = tempfile.mkdtemp()
        temp_directory_path = str(temp_directory) + '/'
        files = zipfile.ZipFile(
            BytesIO(p.call_args[1]['files']['file'][1].read()), 'r')
        files.extractall(temp_directory_path)
        file_list = os.listdir(temp_directory_path)
        assert 'mirrulations.log' in file_list
        assert 'doc.TEST-1-0.json' in file_list
        assert 'doc.TEST-1-0.pdf' in file_list
def return_doc():
    """
    The endpoint the client calls to return documents they received
    from the individual regulations doc calls
    :return: Returns a string saying successful so the client knows
             the call was successful
    """
    try:
        files = request.files['file'].read()
        json_info = request.form['json']
    except Exception:
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    files = io.BytesIO(files)
    process_doc(redis_server(), json.loads(json_info), files,
                config.server_read_value('regulations path') +
                'regulations-data/')
    return 'Successful!'
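# A hedged client-side sketch of calling /return_doc; the host, port, and
# archive name are assumptions, and the requests package is assumed to be
# available. It mirrors the multipart shape the endpoint reads via
# request.files['file'] and request.form['json'].
def example_return_doc():
    response = requests.post(
        'http://localhost:8080/return_doc',
        files={'file': open('result.zip', 'rb')},
        data={'json': json.dumps({
            'job_id': '1234',
            'type': 'doc',
            'client_id': 'client1',
            'version': '0.5'
        })})
    assert response.text == 'Successful!'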
def return_docs():
    """
    The endpoint the client calls to return the document ids received
    from the regulations docs calls
    :return: Returns a string saying successful so the client knows
             the call was successful
    """
    try:
        json_info = request.form['json']
        files = request.files['file'].read()
    except Exception:
        logger.error('Error - bad parameter')
        return 'Bad Parameter', 400
    files = io.BytesIO(files)
    process_docs(redis_server(), json.loads(json_info), files,
                 config.server_read_value('regulations path') +
                 'regulations-data/')
    return 'Successful!'
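# The 'json' form field return_docs expects follows the shape used in
# test_process_docs above; the values here are illustrative only.
def example_docs_payload():
    return json.dumps({
        'job_id': '1',
        'type': 'docs',
        'data': [[{'id': 'AHRQ_FRDOC_0001-0037', 'count': 1}]],
        'client_id': 'Alex',
        'version': '0.0.0'
    })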
def test_check_document_exists():
    test_data = generate_json_data(PATH + '1_workfile_2_documents.json')
    test_data = dsf.check_document_exists(
        test_data,
        config.server_read_value('regulations path') + 'regulations-data/')
    assert test_data['data'] == [[{'id': 'AHRQ_FRDOC_0001-0037', 'count': 1}]]