class Loops(object):

    continue_processing = False
    processing_started  = False
    lock                = asyncio.Lock()

    def __init__(self):
        self.use_es           = False
        self.config           = Config()
        self.status           = Status()
        self.storage          = Storage()
        self.hash_json        = Hash_Json()
        self.events           = Events_Log(self.config.hd2_status_location)
        self.events_elastic   = Events_Log_Elastic()
        self.hash             = None
        self.report_elastic   = Report_Elastic()
        self.analysis_elastic = Analysis_Elastic()

        self.report_elastic.setup()
        self.analysis_elastic.setup()

        create_folder(self.storage.hd2_processed())
        create_folder(self.storage.hd2_not_processed())

    def IsProcessing(self):
        return Loops.processing_started

    def StopProcessing(self):
        Loops.continue_processing = False

    def HasBeenStopped(self):
        return not Loops.continue_processing

    def git_commit(self):
        git_commit = 'Not available'
        try:
            git_commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode("utf-8").rstrip()
        except Exception:
            pass
        return git_commit

    def ProcessDirectoryWithEndpoint(self, itempath, file_hash, endpoint_index):
        if not os.path.isdir(itempath):
            return False

        log_info(message=f"Starting ProcessDirectoryWithEndpoint on endpoint # {endpoint_index} for file {file_hash}")

        meta_service       = Metadata_Service()
        original_file_path = meta_service.get_original_file_paths(itempath)
        events             = Events_Log(itempath)

        endpoint = "http://" + self.config.endpoints['Endpoints'][endpoint_index]['IP'] + ":" + \
                   self.config.endpoints['Endpoints'][endpoint_index]['Port']
        events.add_log("Processing with: " + endpoint)

        meta_service.set_f2f_plugin_version(itempath, API_VERSION)
        meta_service.set_f2f_plugin_git_commit(itempath, self.git_commit())

        try:
            file_processing = File_Processing(events, self.events_elastic, self.report_elastic, self.analysis_elastic, meta_service)
            if not file_processing.processDirectory(endpoint, itempath):
                events.add_log("CANNOT be processed")
                return False

            log_data = {
                'file'     : original_file_path,
                'status'   : FileStatus.COMPLETED,
                'error'    : 'none',
                'timestamp': datetime.now(),
            }
            log_info('ProcessDirectoryWithEndpoint', data=log_data)

            meta_service.set_error(itempath, "none")
            meta_service.set_status(itempath, FileStatus.COMPLETED)
            self.hash_json.update_status(file_hash, FileStatus.COMPLETED)
            events.add_log("Has been processed")
            return True

        except Exception as error:
            log_data = {
                'file'  : original_file_path,
                'status': FileStatus.FAILED,
                'error' : str(error),
            }
            log_error(message='error in ProcessDirectoryWithEndpoint', data=log_data)

            meta_service.set_error(itempath, str(error))
            meta_service.set_status(itempath, FileStatus.FAILED)
            self.hash_json.update_status(file_hash, FileStatus.FAILED)
            events.add_log("ERROR:" + str(error))
            return False

    def ProcessDirectory(self, thread_data):
        (itempath, file_hash, process_index) = thread_data
        endpoint_index = process_index % self.config.endpoints_count

        if not Loops.continue_processing:
            return False

        tik = datetime.now()
        process_result = self.ProcessDirectoryWithEndpoint(itempath, file_hash, endpoint_index)
        if process_result:
            self.status.add_completed()
            tok   = datetime.now()
            delta = tok - tik
            meta_service = Metadata_Service()
            meta_service.set_hd2_to_hd3_copy_time(itempath, delta.total_seconds())
        else:
            self.status.add_failed()

        return process_result

        # note: removing retries from this method (it should not be handled like this)
        # for idx in range(self.config.endpoints_count):
        #     if self.ProcessDirectoryWithEndpoint(itempath, file_hash, endpoint_index):
        #         return
        #     # The Endpoint failed to process the file
        #     # Retry it with the next one
        #     endpoint_index = (endpoint_index + 1) % self.config.endpoints_count

    def updateHashJson(self):
        self.hash_json.reset()
        meta_service = Metadata_Service()

        for hash_folder in os.listdir(self.storage.hd2_data()):
            metadata_folder = self.storage.hd2_data(hash_folder)
            if not os.path.isdir(metadata_folder):
                continue

            metadata      = meta_service.get_from_file(metadata_folder)
            file_name     = metadata.get_file_name()
            original_hash = metadata.get_original_hash()
            status        = metadata.get_rebuild_status()

            if status != FileStatus.COMPLETED:
                self.hash_json.add_file(original_hash, file_name)

        self.hash_json.save()
        self.status.set_processing_counters(len(self.hash_json.data()))
        return self.hash_json.data()

    def moveProcessedFiles(self):
        json_list = self.hash_json.data()

        for key in json_list:
            source_path = self.storage.hd2_data(key)

            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                destination_path = self.storage.hd2_processed(key)
                if folder_exists(destination_path):
                    folder_delete_all(destination_path)
                shutil.move(source_path, destination_path)

            if (FileStatus.FAILED == json_list[key]["file_status"]):
                meta_service = Metadata_Service()
                meta_service.get_from_file(source_path)
                metadata = meta_service.metadata
                if ("Engine response could not be decoded" == metadata.get_error()) and \
                        metadata.get_original_file_extension() in ['.xml', '.json']:
                    destination_path = self.storage.hd2_not_processed(key)
                    if folder_exists(destination_path):
                        folder_delete_all(destination_path)
                    shutil.move(source_path, destination_path)

    def LoopHashDirectoriesInternal(self, thread_count, do_single):
        if folder_exists(self.storage.hd2_data()) is False:
            log_message = "ERROR: rootdir does not exist: " + self.storage.hd2_data()
            log_error(log_message)
            return False

        if not isinstance(thread_count, int):
            raise TypeError("thread_count must be an integer")
        if not isinstance(do_single, bool):
            raise TypeError("do_single must be a boolean")

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        json_list = self.updateHashJson()

        threads       = list()
        process_index = 0

        log_info(message=f'before Mapping thread_data for {len(json_list)} files')
        thread_data = []
        for key in json_list:
            file_hash = key
            itempath  = self.storage.hd2_data(key)

            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                self.events.add_log("The file processing has already been completed")
                continue

            if not os.path.exists(itempath):
                self.events.add_log(f"ERROR: Path \"{itempath}\" does not exist")
                json_list[key]["file_status"] = FileStatus.FAILED
                continue

            process_index += 1
            thread_data.append((itempath, file_hash, process_index,))

            # # limit the number of parallel threads
            # # if process_index % int(thread_count) == 0:        # todo: refactor this workflow to use multiprocess and queues
            # #     Clean up the threads
            # #     for index, thread in enumerate(threads):       # todo: since at the moment this will block allocating new threads until
            # #         thread.join()                              #       all have finished execution
            #
            # process_index += 1
            # log_info(message=f"in LoopHashDirectoriesInternal process_index={process_index} , thread #{process_index % int(thread_count) }")
            # x = threading.Thread(target=self.ProcessDirectory, args=(itempath, file_hash, process_index,))
            # threads.append(x)
            # x.start()
            #
            # if do_single:
            #     break
            #
            # if not Loops.continue_processing:
            #     break
            # for index, thread in enumerate(threads):
            #     thread.join()

        log_info(message=f'after mapped thread_data, there are {len(thread_data)} mapped items')

        # thread_data = thread_data[:500]
        # log_info(message=f'to start with only processing {len(thread_data)} thread_data items')

        pool    = ThreadPool(thread_count)
        results = pool.map(self.ProcessDirectory, thread_data)
        pool.close()
        pool.join()

        self.moveProcessedFiles()

        self.events.add_log("LoopHashDirectoriesInternal finished")
        return True

    async def LoopHashDirectoriesAsync(self, thread_count, do_single=False):
        await Loops.lock.acquire()
        try:
            Loops.continue_processing = True
            Loops.processing_started  = True

            self.status.set_started()
            self.LoopHashDirectoriesInternal(thread_count, do_single)
        finally:
            Loops.processing_started = False
            Loops.lock.release()
            self.status.set_stopped()
            self.hash_json.save()

    @log_duration
    def LoopHashDirectories(self, thread_count=None):
        # Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(message="ERROR: Attempt to start processing while processing is in progress")
            return False

        self.status.StartStatusThread()
        thread_count = thread_count or self.config.thread_count
        log_info(message="in LoopHashDirectories, about to start main loop")
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(thread_count))
        log_info(message="in LoopHashDirectories, Loop completed")
        self.status.StopStatusThread()
        return True

    @log_duration
    def LoopHashDirectoriesSequential(self):
        # Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error("ERROR: Attempt to start processing while processing is in progress")
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1))
        return True

    @log_duration
    def ProcessSingleFile(self):
        if self.IsProcessing():
            log_error("ERROR: Attempt to start processing while processing is in progress")
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1, True))
        return True
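
# --- Usage sketch (not part of the original module) --------------------------
# A minimal illustration of how the entry points above are intended to be
# driven; the surrounding API/server wiring is an assumption and is not shown
# in this file.
#
#   loops = Loops()
#   loops.LoopHashDirectories(thread_count=4)   # process all pending hash folders via a thread pool
#   loops.ProcessSingleFile()                   # or: process a single pending file (do_single=True path)
#   loops.StopProcessing()                      # ask a running loop to stop after the current file
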
class test_Hash_Json(TestCase):

    test_file = None

    @classmethod
    def setUpClass(cls) -> None:
        cls.test_file      = temp_file(contents='Static text so that we have a static hash')
        cls.test_file_name = file_name(cls.test_file)
        cls.test_file_hash = '500286533bf75d769e9180a19414d1c3502dd52093e7351a0a9b1385d8f8961c'

    @classmethod
    def tearDownClass(cls) -> None:
        file_delete(cls.test_file)

    def setUp(self) -> None:
        self.hash_json = Hash_Json()
        self.storage   = self.hash_json.storage

    def test___init__(self):
        assert abspath(self.hash_json.folder()) == self.storage.hd2_status()

    @patch("multiprocessing.queues.Queue.put_nowait")
    def test_add_file(self, patch_log_error):
        hash_data = self.hash_json.load()
        if hash_data.get(self.test_file_hash):
            del hash_data[self.test_file_hash]

        assert self.hash_json.add_file(self.test_file_hash, self.test_file_name) is True
        self.hash_json.save()
        assert hash_data.get(self.test_file_hash) == {'file_name': self.test_file_name, 'file_status': 'Initial'}

        assert self.hash_json.add_file('AAAA'              , self.test_file_name) is False
        assert self.hash_json.add_file(self.test_file_hash , None               ) is False
        assert self.hash_json.add_file(None                , None               ) is False

        assert patch_log_error.mock_calls == [
            call({'level': 'ERROR', 'message': 'in Hash_Json.add_file bad data provided', 'data': {'file_hash': 'AAAA'              , 'file_name': self.test_file_name}, 'duration': 0, 'from_method': 'add_file', 'from_class': 'Hash_Json'}),
            call({'level': 'ERROR', 'message': 'in Hash_Json.add_file bad data provided', 'data': {'file_hash': self.test_file_hash, 'file_name': None               }, 'duration': 0, 'from_method': 'add_file', 'from_class': 'Hash_Json'}),
            call({'level': 'ERROR', 'message': 'in Hash_Json.add_file bad data provided', 'data': {'file_hash': None               , 'file_name': None               }, 'duration': 0, 'from_method': 'add_file', 'from_class': 'Hash_Json'})]

    def test_get_file_path(self):
        file_path = abspath(self.hash_json.get_file_path())
        assert file_exists(file_path)
        assert file_path == path_combine(self.storage.hd2_status(), Hash_Json.HASH_FILE_NAME)

    def test_load(self):
        data = self.hash_json.load()
        assert type(data) is dict
        assert self.hash_json.data() == data

    def test_data(self):
        assert self.hash_json.data() == self.hash_json._hash_json_data

    def test_is_hash(self):
        test_file = temp_file(contents='aaaa')
        file_hash = Metadata_Utils().file_hash(test_file)            # create hash from file
        text_hash = str_sha256('asd')                                # create hash from string

        assert self.hash_json.is_hash(file_hash         ) is True   # confirm both are valid hashes
        assert self.hash_json.is_hash(text_hash         ) is True
        assert self.hash_json.is_hash(None              ) is False  # testing all sorts of corner cases
        assert self.hash_json.is_hash(''                ) is False  # empty strings
        assert self.hash_json.is_hash('aaaa'            ) is False  # non hash string
        assert self.hash_json.is_hash(file_hash + 'aaaa') is False  # confirm only exact matches work
        assert self.hash_json.is_hash(text_hash + 'aaaa') is False
        assert self.hash_json.is_hash('aaa' + file_hash ) is False
        assert self.hash_json.is_hash(text_hash + '\nb`') is False  # confirm content in new lines is also not a match
        assert self.hash_json.is_hash('a\n' + file_hash ) is False

        file_delete(test_file)

    def test_save(self):
        target_file = temp_file()                                                  # temp file to save data
        assert file_not_exists(target_file)                                        # confirm it doesn't exist
        with patch.object(Hash_Json, 'get_file_path', return_value=target_file):   # patch get_file_path to return temp file path
            assert self.hash_json.get_file_path() == target_file                   # confirm patch is in place
            self.hash_json.save()                                                  # call save()
            assert file_exists(target_file)                                        # confirm temp file now exists
            assert self.hash_json.load() == self.hash_json.data()                  # confirm reloaded data is correct
            assert json_load_file(target_file) == self.hash_json.data()            # also confirm using direct json load of temp file
        assert self.hash_json.get_file_path() != target_file                       # confirm patch is gone (after 'with' ends)
        file_delete(target_file)                                                   # delete temp file

    def test_update_status(self):
        temp_data_file = temp_file()
        with patch.object(Hash_Json, 'get_file_path', return_value=temp_data_file):
            self.hash_json.add_file(self.test_file_hash, self.test_file_name)
            assert self.hash_json.data()[self.test_file_hash]['file_status'] == 'Initial'
            self.hash_json.update_status(self.test_file_hash, 'BBBB')
            self.hash_json.save()
            assert self.hash_json.data()[self.test_file_hash]['file_status'] == 'BBBB'
            assert json_load_file(temp_data_file)[self.test_file_hash]['file_status'] == 'BBBB'
            pprint(self.hash_json.load())

    def test_data_bug(self):                                         # this test confirms the bug
        hashes = self.hash_json.data()
        for hash in self.hash_json.data():
            if len(hash) == 64:                                      # all keys in this object should be a hash
                assert len(hash) == 64
                assert type(hashes[hash]) == dict                    # with all items being a dictionary
                assert list_set(hashes[hash]) == ['file_name', 'file_status']
            else:
                assert hash == "file_list"                           # but the old schema is still present
                assert type(hashes[hash]) == list                    # with the data being a list
                assert list_set(hashes[hash][0]) == ['file_name', 'file_status', 'hash', 'id']
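
# --- Shape of the data checked by test_data_bug (illustrative only) ----------
# Based on the assertions above, hash.json can carry two schemas at once.
# The value contents below are made up; only the key layout comes from the test.
#
#   {
#       "<64-char sha256>": {"file_name": "example.pdf", "file_status": "Initial"},    # current schema
#       "file_list": [                                                                 # legacy schema
#           {"file_name": "...", "file_status": "...", "hash": "...", "id": "..."}
#       ]
#   }
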