def __init__(self, file_hash=None):
    self.config          = Config()
    self.storage         = Storage()
    self.process_status  = Status()
    self.metadata_utils  = Metadata_Utils()
    self.path_hd1        = self.storage.hd1()
    self.data            = self.default_data()
    self.file_hash       = file_hash
def __init__(self):
    if hasattr(self, 'instantiated') is False:          # only set these values first time around
        self.instantiated     = True
        self.storage          = Storage()
        #self._on_save        = []                      # todo: add support for firing up events when data is saved
        self._status_data     = self.default_data()
        self.status_thread_on = False
        self.status_thread    = threading.Thread()
def __init__(self):
    self.config            = Config()
    self.hd1_base_location = self.config.hd1_location
    self.hd2_base_location = self.config.hd2_location
    self.hd3_base_location = self.config.hd3_location
    self.zip_folder        = os.path.join(os.getcwd(), "zip_folder")
    self.storage           = Storage()
    folder_delete_all(self.zip_folder)
    create_folder(self.zip_folder)
def __init__(self):
    self.config        = Config()
    self.meta_service  = Metadata_Service()
    self.status        = Status()
    self.storage       = Storage()
    self.file_name     = None          # set in process() method
    self.current_path  = None
    self.base_folder   = None
    self.dst_folder    = None
    self.dst_file_name = None
    self.status.reset()
def __init__(self):
    self.use_es           = False
    self.config           = Config()
    self.status           = Status()
    self.storage          = Storage()
    self.hash_json        = Hash_Json()
    self.events           = Events_Log(self.config.hd2_status_location)
    self.events_elastic   = Events_Log_Elastic()
    self.hash             = None
    self.report_elastic   = Report_Elastic()
    self.analysis_elastic = Analysis_Elastic()
    self.report_elastic.setup()
    self.analysis_elastic.setup()
    create_folder(self.storage.hd2_processed())
    create_folder(self.storage.hd2_not_processed())
def __init__(self):
    self.config        = Config()
    self.storage       = Storage()
    self.folder        = os.path.join(self.config.hd2_location, "status")
    self.analysis_data = {}
    self.id            = 0
    self.get_from_file()
def __init__(self):
    if hasattr(self, '_hash_json_data') is False:       # only set these values first time around
        self.config          = Config()
        self.storage         = Storage()
        self._hash_json_data = {}
        self.load()
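Hash_Json and Status both guard their __init__ with a hasattr check so that re-constructing the shared instance does not wipe its state. A minimal, self-contained sketch of that guard combined with the singleton __new__ used below (names are illustrative, not part of this codebase):

class Singleton_Example:                                # illustrative name, not from this repo
    _instance = None

    def __new__(cls):                                   # every construction returns the same object
        if cls._instance is None:
            cls._instance = super(Singleton_Example, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        if hasattr(self, 'instantiated') is False:      # only initialise state on the first construction
            self.instantiated = True
            self.data = {}

assert Singleton_Example() is Singleton_Example()       # later constructions keep self.data intact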
class test_Storage(Temp_Config):

    def setUp(self) -> None:
        self.config        = Config()
        self.local_storage = Storage()

    def test_hd1_hd2_hd3(self):
        assert self.local_storage.hd1() == abspath(self.config.hd1_location)
        assert self.local_storage.hd2() == abspath(self.config.hd2_location)
        assert self.local_storage.hd3() == abspath(self.config.hd3_location)

    def test_hd1_add_file(self):
        test_file      = temp_file(contents=random_text())
        test_file_name = file_name(test_file)
        file_in_hd1    = self.storage.hd1_add_file(test_file)
        assert file_exists(file_in_hd1)
        assert file_contents(file_in_hd1) == file_contents(test_file)
        assert self.storage.hd1_file_path(test_file_name) == file_in_hd1

    def test_hd1_files(self):
        new_files = self.add_test_files(count=2)
        hd1_files = self.storage.hd1_files()
        assert len(hd1_files) >= len(new_files)
        assert new_files[0] in hd1_files
        assert new_files[1] in hd1_files

    def test_hd2_metadatas(self):
        self.add_test_files(count=10, text_size=1000, execute_stage_1=True)
        metadatas = self.storage.hd2_metadatas()
        assert list_set(metadatas[0]) == ['error', 'f2f_plugin_git_commit', 'f2f_plugin_version',
                                          'file_name', 'hd1_to_hd2_copy_time', 'hd2_to_hd3_copy_time',
                                          'last_update_time', 'original_file_extension', 'original_file_paths',
                                          'original_file_size', 'original_hash', 'original_hash_calculation_time',
                                          'rebuild_file_duration', 'rebuild_file_extension', 'rebuild_file_path',
                                          'rebuild_file_size', 'rebuild_hash', 'rebuild_server', 'rebuild_status',
                                          'server_version', 'xml_report_status']

    @pytest.mark.skip("needs more work to be solid")
    def test_hd3_files(self):
        count = 1
        self.add_test_files(count=count, execute_stage_1=True)
        loops     = Loops()
        result    = loops.LoopHashDirectories()
        metadatas = self.storage.hd2_metadatas()
        hd3_files = self.storage.hd3_files()
        metadata  = metadatas[0]
        hd3_file  = hd3_files[0]
        assert result is True
        assert len(hd3_files) == count
        assert len(metadatas) == count
        assert b'Glasswall Processed' in file_contents_as_bytes(hd3_file)
        assert metadata.get('rebuild_status') == 'Completed Successfully'
def setUpClass(cls) -> None:
    cls.setup_testing = Setup_Testing()
    cls.log_worker    = start_logging()
    cls.storage       = Storage()
    cls.config        = cls.storage.config
    from cdr_plugin_folder_to_folder.utils.Logging import log_info
    log_info(message='in Temp_Config')
    cls.setup_testing.set_config_to_temp_folder()
def setUp(self) -> None:
    self.sdk_server       = self.config.test_sdk
    self.sdk_port         = '8080'
    self.temp_folder      = temp_folder()
    self.events_log       = Events_Log(self.temp_folder)
    self.events_elastic   = Events_Log_Elastic()
    self.report_elastic   = Report_Elastic()
    self.analysis_elastic = Analysis_Elastic()
    self.file_processing  = File_Processing(events_log       = self.events_log,
                                            events_elastic   = self.events_elastic,
                                            report_elastic   = self.report_elastic,
                                            analysis_elastic = self.analysis_elastic,
                                            meta_service     = self.meta_service)
    self.storage          = Storage()
def __init__(self, events_log, events_elastic, report_elastic, analysis_elastic, meta_service):
    self.meta_service       = meta_service
    self.events_log         = events_log
    self.events_elastic     = events_elastic
    self.storage            = Storage()
    self.config             = Config()
    self.status             = Status()
    self.hash_json          = Hash_Json()
    self.report_elastic     = report_elastic
    self.sdk_api_version    = "Not available"
    self.sdk_engine_version = "Not available"
    self.analysis_json      = Analysis_Json()
    self.analysis_elastic   = analysis_elastic
def test_set_config_to_temp_folder__restore_config(self):
    storage         = Storage()
    config          = storage.config
    original_config = config.values()

    self.setup_testing.set_config_to_temp_folder()
    temp_config = config.values()
    assert parent_folder(config.root_folder) == temp_folder_current()
    assert folder_exists(config.root_folder)
    assert folder_exists(storage.hd1())
    assert folder_exists(storage.hd2_status())
    assert folder_exists(storage.hd2_data())
    assert folder_exists(storage.hd3())
    assert original_config != temp_config

    self.setup_testing.restore_config()
    #self.setup_testing.configure_static_logging()
    assert original_config == config.values()
    assert parent_folder(config.root_folder) != temp_folder_current()
    assert folder_not_exists(temp_config.get('root_folder'))
class Loops(object):
    continue_processing = False
    processing_started  = False
    lock                = asyncio.Lock()

    def __init__(self):
        self.use_es           = False
        self.config           = Config()
        self.status           = Status()
        self.storage          = Storage()
        self.hash_json        = Hash_Json()
        self.events           = Events_Log(self.config.hd2_status_location)
        self.events_elastic   = Events_Log_Elastic()
        self.hash             = None
        self.report_elastic   = Report_Elastic()
        self.analysis_elastic = Analysis_Elastic()
        self.report_elastic.setup()
        self.analysis_elastic.setup()
        create_folder(self.storage.hd2_processed())
        create_folder(self.storage.hd2_not_processed())

    def IsProcessing(self):
        return Loops.processing_started

    def StopProcessing(self):
        Loops.continue_processing = False

    def HasBeenStopped(self):
        return not Loops.continue_processing

    def git_commit(self):
        git_commit = 'Not available'
        try:
            git_commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode("utf-8").rstrip()
        except Exception:
            pass
        return git_commit

    def ProcessDirectoryWithEndpoint(self, itempath, file_hash, endpoint_index):
        if not os.path.isdir(itempath):
            return False

        log_info(message=f"Starting ProcessDirectoryWithEndpoint on endpoint # {endpoint_index} for file {file_hash}")

        meta_service       = Metadata_Service()
        original_file_path = meta_service.get_original_file_paths(itempath)
        events             = Events_Log(itempath)

        endpoint = "http://" + self.config.endpoints['Endpoints'][endpoint_index]['IP'] + ":" + \
                   self.config.endpoints['Endpoints'][endpoint_index]['Port']
        events.add_log("Processing with: " + endpoint)

        meta_service.set_f2f_plugin_version(itempath, API_VERSION)
        meta_service.set_f2f_plugin_git_commit(itempath, self.git_commit())

        try:
            file_processing = File_Processing(events, self.events_elastic, self.report_elastic, self.analysis_elastic, meta_service)
            if not file_processing.processDirectory(endpoint, itempath):
                events.add_log("CANNOT be processed")
                return False

            log_data = {'file'     : original_file_path,
                        'status'   : FileStatus.COMPLETED,
                        'error'    : 'none',
                        'timestamp': datetime.now()}
            log_info('ProcessDirectoryWithEndpoint', data=log_data)
            meta_service.set_error(itempath, "none")
            meta_service.set_status(itempath, FileStatus.COMPLETED)
            self.hash_json.update_status(file_hash, FileStatus.COMPLETED)
            events.add_log("Has been processed")
            return True
        except Exception as error:
            log_data = {'file'  : original_file_path,
                        'status': FileStatus.FAILED,
                        'error' : str(error)}
            log_error(message='error in ProcessDirectoryWithEndpoint', data=log_data)
            meta_service.set_error(itempath, str(error))
            meta_service.set_status(itempath, FileStatus.FAILED)
            self.hash_json.update_status(file_hash, FileStatus.FAILED)
            events.add_log("ERROR:" + str(error))
            return False

    def ProcessDirectory(self, thread_data):
        (itempath, file_hash, process_index) = thread_data
        endpoint_index = process_index % self.config.endpoints_count

        if not Loops.continue_processing:
            return False

        tik = datetime.now()
        process_result = self.ProcessDirectoryWithEndpoint(itempath, file_hash, endpoint_index)
        if process_result:
            self.status.add_completed()
            tok   = datetime.now()
            delta = tok - tik
            meta_service = Metadata_Service()
            meta_service.set_hd2_to_hd3_copy_time(itempath, delta.total_seconds())
        else:
            self.status.add_failed()

        return process_result

        # note: removing retries from this method (it should not be handled like this)
        # for idx in range(self.config.endpoints_count):
        #     if self.ProcessDirectoryWithEndpoint(itempath, file_hash, endpoint_index):
        #         return
        #     # The Endpoint failed to process the file
        #     # Retry it with the next one
        #     endpoint_index = (endpoint_index + 1) % self.config.endpoints_count

    def updateHashJson(self):
        self.hash_json.reset()
        meta_service = Metadata_Service()

        for hash_folder in os.listdir(self.storage.hd2_data()):
            metadata_folder = self.storage.hd2_data(hash_folder)
            if not os.path.isdir(metadata_folder):
                continue

            metadata      = meta_service.get_from_file(metadata_folder)
            file_name     = metadata.get_file_name()
            original_hash = metadata.get_original_hash()
            status        = metadata.get_rebuild_status()

            if status != FileStatus.COMPLETED:
                self.hash_json.add_file(original_hash, file_name)

        self.hash_json.save()
        self.status.set_processing_counters(len(self.hash_json.data()))
        return self.hash_json.data()

    def moveProcessedFiles(self):
        json_list = self.hash_json.data()

        for key in json_list:
            source_path = self.storage.hd2_data(key)

            if FileStatus.COMPLETED == json_list[key]["file_status"]:
                destination_path = self.storage.hd2_processed(key)
                if folder_exists(destination_path):
                    folder_delete_all(destination_path)
                shutil.move(source_path, destination_path)

            if FileStatus.FAILED == json_list[key]["file_status"]:
                meta_service = Metadata_Service()
                meta_service.get_from_file(source_path)
                metadata = meta_service.metadata
                if ("Engine response could not be decoded" == metadata.get_error()) and \
                        metadata.get_original_file_extension() in ['.xml', '.json']:
                    destination_path = self.storage.hd2_not_processed(key)
                    if folder_exists(destination_path):
                        folder_delete_all(destination_path)
                    shutil.move(source_path, destination_path)

    def LoopHashDirectoriesInternal(self, thread_count, do_single):
        if folder_exists(self.storage.hd2_data()) is False:
            log_message = "ERROR: rootdir does not exist: " + self.storage.hd2_data()
            log_error(log_message)
            return False

        if not isinstance(thread_count, int):
            raise TypeError("thread_count must be an integer")
        if not isinstance(do_single, bool):
            raise TypeError("do_single must be a boolean")

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        json_list = self.updateHashJson()

        threads = list()
        process_index = 0

        log_info(message=f'before Mapping thread_data for {len(json_list)} files')
        thread_data = []
        for key in json_list:
            file_hash = key
            itempath  = self.storage.hd2_data(key)

            if FileStatus.COMPLETED == json_list[key]["file_status"]:
                self.events.add_log(f"The file processing has been already completed")
                continue

            if not os.path.exists(itempath):
                self.events.add_log(f"ERROR: Path \"{itempath}\" does not exist")
                json_list[key]["file_status"] = FileStatus.FAILED
                continue

            process_index += 1
            thread_data.append((itempath, file_hash, process_index,))

            # # limit the number of parallel threads
            # if process_index % int(thread_count) == 0:        # todo: refactor this workflow to use multiprocess and queues
            #     # Clean up the threads
            #     for index, thread in enumerate(threads):      # todo: since at the moment this will block allocating new threads until
            #         thread.join()                             #       all have finishing execution
            #
            # process_index += 1
            # log_info(message=f"in LoopHashDirectoriesInternal process_index={process_index} , thread #{process_index % int(thread_count) }")
            # x = threading.Thread(target=self.ProcessDirectory, args=(itempath, file_hash, process_index,))
            # threads.append(x)
            # x.start()
            #
            # if do_single:
            #     break
            #
            # if not Loops.continue_processing:
            #     break

        # for index, thread in enumerate(threads):
        #     thread.join()

        log_info(message=f'after mapped thread_data, there are {len(thread_data)} mapped items')

        #thread_data = thread_data[:500]
        #log_info(message=f'to start with only processing {len(thread_data)} thread_data items')

        pool    = ThreadPool(thread_count)
        results = pool.map(self.ProcessDirectory, thread_data)
        pool.close()
        pool.join()

        self.moveProcessedFiles()

        self.events.add_log("LoopHashDirectoriesInternal finished")
        return True

    async def LoopHashDirectoriesAsync(self, thread_count, do_single=False):
        await Loops.lock.acquire()
        try:
            Loops.continue_processing = True
            Loops.processing_started  = True
            self.status.set_started()
            self.LoopHashDirectoriesInternal(thread_count, do_single)
        finally:
            Loops.processing_started = False
            Loops.lock.release()
            self.status.set_stopped()
            self.hash_json.save()

    @log_duration
    def LoopHashDirectories(self, thread_count=None):
        # Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(message="ERROR: Attempt to start processing while processing is in progress")
            return False

        self.status.StartStatusThread()
        thread_count = thread_count or self.config.thread_count
        log_info(message="in LoopHashDirectories, about to start main loop")
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(thread_count))
        log_info(message="in LoopHashDirectories, Loop completed")
        self.status.StopStatusThread()
        return True

    @log_duration
    def LoopHashDirectoriesSequential(self):
        # Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error("ERROR: Attempt to start processing while processing is in progress")
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1))
        return True

    @log_duration
    def ProcessSingleFile(self):
        if self.IsProcessing():
            log_error("ERROR: Attempt to start processing while processing is in progress")
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1, True))
        return True
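LoopHashDirectories ties the pieces together: it starts the status thread, builds (itempath, file_hash, process_index) tuples, and fans them out over a multiprocessing.pool.ThreadPool. A minimal sketch of that fan-out pattern in isolation (process_item and the sample tuples are hypothetical stand-ins, not the plugin's own code):

from multiprocessing.pool import ThreadPool

def process_item(thread_data):                              # hypothetical worker, mirroring ProcessDirectory's tuple signature
    item_path, file_hash, process_index = thread_data
    return f"{process_index}: processed {file_hash} at {item_path}"

items = [(f"/tmp/hd2/data/{i}", f"hash_{i}", i) for i in range(1, 4)]   # sample data, not real hd2 content

pool    = ThreadPool(2)                                     # in the plugin, thread_count would come from Config
results = pool.map(process_item, items)                     # blocks until every tuple has been processed
pool.close()
pool.join()
print(results)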
def __init__(self):
    self.index_name = 'files_metadata'
    self.id_key     = 'original_hash'
    self.time_field = DEFAULT_TIME_FIELD
    self.enabled    = False
    self.storage    = Storage()
class Metadata_Elastic:

    def __init__(self):
        self.index_name = 'files_metadata'
        self.id_key     = 'original_hash'
        self.time_field = DEFAULT_TIME_FIELD
        self.enabled    = False
        self.storage    = Storage()

    @cache_on_self
    def elastic(self):
        return Elastic(index_name=self.index_name, id_key=self.id_key, time_field=self.time_field)

    def setup(self, delete_existing=False):
        elastic = self.elastic()
        elastic.connect()
        elastic.setup()
        if elastic.enabled:
            elastic.create_index_and_index_pattern(delete_existing=delete_existing)
            self.enabled = True
        return self

    # class methods

    def add_metadata(self, metadata):
        return self.elastic().add(metadata)

    def delete_metadata(self, original_hash):
        return self.elastic().delete(record_id=original_hash)

    @log_duration
    def delete_all_metadata(self):
        #log_debug(message=f"Deleting all data and recreating {self.index_name} index and index pattern")
        return self.setup(delete_existing=True)

    def get_all_metadata(self):
        return self.elastic().search_using_lucene('*')

    def get_metadata(self, original_hash):
        return self.elastic().get_data(record_id=original_hash)

    @log_duration
    def reload_metadatas(self):
        hash_json = Hash_Json().reset()
        hash_data = hash_json.data()
        metadatas = self.storage.hd2_metadatas()
        count     = len(metadatas)
        log_debug(message=f"Reloading {count} currently in hd2/data")
        for metadata in metadatas:
            self.add_metadata(metadata)
            file_hash   = metadata.get('original_hash')
            file_name   = metadata.get('file_name')
            file_status = metadata.get('rebuild_status')
            hash_data[file_hash] = {"file_name"  : file_name,       # todo: refactor this so that it is not done here
                                    "file_status": file_status}     #       (which happened due to the performance hit of the current Hash_Json file)
            # when using:
            #     hash_json.add_file(file_hash=file_hash, file_name=file_name)
            #     hash_json.update_status(index=file_hash, updated_status=file_status)
        hash_json.save()
        return count

    @log_duration
    def reload_hash_json(self):
        hash_json = Hash_Json().reset()
        hash_data = hash_json.data()
        metadatas = self.storage.hd2_metadatas()
        for metadata in metadatas:
            file_hash   = metadata.get('original_hash')
            file_name   = metadata.get('file_name')
            file_status = metadata.get('rebuild_status')
            hash_data[file_hash] = {"file_name"  : file_name,       # todo: refactor this so that it is not done here
                                    "file_status": file_status}     #       (which happened due to the performance hit of the current Hash_Json file)
        hash_json.save()
        return f'Hash_Json reloaded for {len(metadatas)} metadata items'

    def reload_elastic_data(self):
        self.delete_all_metadata()
        count = self.reload_metadatas()
        return f'Elastic {self.index_name} has been reset and {count} metadata items reloaded'

    def reload_kibana_dashboards(self):
        kibana = self.elastic().kibana()
        dashboard_file_names = ['processed-files-v8.ndjson', 'File-Analysis-Threat-Level.ndjson']
        result = []
        for dashboard_file_name in dashboard_file_names:
            result.append(kibana.dashboard_import_from_github(dashboard_file_name=dashboard_file_name))
        return f"reloaded {len(result)} dashboards"
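Metadata_Elastic is a thin wrapper that pushes the per-file metadata dictionaries into an Elasticsearch index keyed on original_hash. A minimal usage sketch, assuming a reachable Elastic instance; the record below is an abbreviated, hypothetical sample rather than real plugin output:

metadata_elastic = Metadata_Elastic().setup()           # connects and creates the index/index pattern when Elastic is enabled

if metadata_elastic.enabled:
    sample = {'original_hash' : 'abc123',                # abbreviated, hypothetical metadata record
              'file_name'     : 'report.pdf',
              'rebuild_status': 'Completed Successfully'}
    metadata_elastic.add_metadata(sample)                # indexed under its original_hash
    print(metadata_elastic.get_metadata('abc123'))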
class File_Distributor:

    def __init__(self):
        self.config            = Config()
        self.hd1_base_location = self.config.hd1_location
        self.hd2_base_location = self.config.hd2_location
        self.hd3_base_location = self.config.hd3_location
        self.zip_folder        = os.path.join(os.getcwd(), "zip_folder")
        self.storage           = Storage()
        folder_delete_all(self.zip_folder)
        create_folder(self.zip_folder)

    # def get_hd1_files(self, num_of_files):
    #     try:
    #         list  = []
    #         count = 0
    #         for folderName, subfolders, filenames in os.walk(self.hd1_base_location):
    #             for filename in filenames:
    #                 self.hd1_path = os.path.join(folderName, filename)
    #                 if os.path.isfile(self.hd1_path):
    #                     list.append(self.hd1_path)
    #                     count = count + 1
    #                 if count == num_of_files:
    #                     break
    #             if count == num_of_files:
    #                 break
    #         target_file_path = self.prepare_zip(list, "hd1.zip")
    #         return target_file_path
    #
    #     except Exception as error:
    #         logger.error(f"File_Distributor: get_hd1_files : {error}")
    #         raise error

    # def get_hd3_files(self, num_of_files):
    #     try:
    #         list  = []
    #         count = 0
    #         for folderName, subfolders, filenames in os.walk(self.hd3_base_location):
    #             for filename in filenames:
    #                 self.hd3_path = os.path.join(folderName, filename)
    #                 if os.path.isfile(self.hd3_path):
    #                     list.append(self.hd3_path)
    #                     count = count + 1
    #                 if count == num_of_files:
    #                     break
    #             if count == num_of_files:
    #                 break
    #         target_file_path = self.prepare_zip(list, "hd3.zip")
    #         return target_file_path
    #
    #     except Exception as error:
    #         logger.error(f"File_Distributor: get_hd3_files : {error}")
    #         raise error

    def get_hd2_status(self):
        try:
            base_path = self.storage.hd2_status()
            if not os.listdir(base_path):
                return -1
            target_file_path = self.prepare_zip(base_path, "hd2_status_files.zip")
            return target_file_path
        except Exception as error:
            logger.error(f"File_Distributor: get_hd2_status : {error}")
            raise error

    def get_hd2_data(self, num_of_files):
        try:
            base_path = self.storage.hd2_data()
            if num_of_files == 0:
                return 0
            if not os.listdir(base_path):
                return -1
            list = []
            if num_of_files == -1:
                for folder in os.listdir(base_path):
                    list.append(os.path.join(base_path, folder))
            else:
                count = 0
                for folder in os.listdir(base_path):
                    list.append(os.path.join(base_path, folder))
                    count = count + 1
                    if count == num_of_files:
                        break
            target_file_path = self.prepare_hd2_hash_folder_zip(list, "hd2_data_files.zip")
            return target_file_path
        except Exception as error:
            logger.error(f"File_Distributor: get_hd2_data : {error}")
            raise error

    def get_hd2_processed(self, num_of_files):
        try:
            base_path = self.storage.hd2_processed()
            if num_of_files == 0:
                return 0
            if not os.listdir(base_path):
                return -1
            list = []
            if num_of_files == -1:
                for folder in os.listdir(base_path):
                    list.append(os.path.join(base_path, folder))
            else:
                count = 0
                for folder in os.listdir(base_path):
                    list.append(os.path.join(base_path, folder))
                    count = count + 1
                    if count == num_of_files:
                        break
            target_file_path = self.prepare_hd2_hash_folder_zip(list, "hd2_processed_files.zip")
            return target_file_path
        except Exception as error:
            logger.error(f"File_Distributor: get_hd2_processed : {error}")
            raise error

    def prepare_hd2_hash_folder_zip(self, path_list, zip_name):
        try:
            self.temp_folder = temp_folder()
            for hash_folder_path in path_list:
                name     = ntpath.basename(hash_folder_path)
                dst_path = os.path.join(self.temp_folder, name)
                if os.path.isdir(hash_folder_path):
                    folder_copy(hash_folder_path, dst_path)
                    hd2_source_file = os.path.join(dst_path, "source")
                    if os.path.isfile(hd2_source_file):
                        file_delete(hd2_source_file)
            target_file_path = os.path.join(self.zip_folder, zip_name)
            zip_files(self.temp_folder, file_pattern='*.*', target_file=target_file_path)
            folder_delete_all(self.temp_folder)
            return target_file_path
        except Exception as error:
            logger.error(f"File_Distributor: prepare_hd2_hash_folder_zip : {error}")
            raise error

    def prepare_zip(self, path, zip_name):
        try:
            self.temp_folder = temp_folder()
            dst_path = os.path.join(self.temp_folder, ntpath.basename(path))
            if os.path.isfile(path):
                file_copy(path, dst_path)
            elif os.path.isdir(path):
                folder_copy(path, dst_path)
            target_file_path = os.path.join(self.zip_folder, zip_name)
            zip_files(self.temp_folder, file_pattern='*.*', target_file=target_file_path)
            folder_delete_all(self.temp_folder)
            return target_file_path
        except Exception as error:
            logger.error(f"File_Distributor: prepare_zip : {error}")
            raise error
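File_Distributor packages slices of the hd2 folders into zip archives, dropping the copied "source" payload from each hash folder before zipping. A minimal sketch of how it might be driven (folder locations come from Config; per the code above, 0 means nothing was requested and -1 means the folder was empty):

distributor = File_Distributor()                        # wipes and recreates ./zip_folder on construction

status_zip = distributor.get_hd2_status()               # -1 if hd2/status is empty, else the path to hd2_status_files.zip
data_zip   = distributor.get_hd2_data(num_of_files=5)   # first 5 hash folders; pass num_of_files=-1 to include every folder
print(status_zip, data_zip)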
def setUp(self) -> None:
    self.config        = Config()
    self.local_storage = Storage()
class Status:
    STATUS_FILE_NAME          = "status.json"
    VAR_COMPLETED             = "completed"
    VAR_CURRENT_STATUS        = "current_status"
    VAR_FAILED                = "failed"
    VAR_FILES_TO_PROCESS      = "files_to_process"
    VAR_FILES_LEFT_TO_PROCESS = "files_left_to_process"
    VAR_FILES_COUNT           = "files_in_hd1_folder"
    VAR_FILES_COPIED          = "files_copied"
    VAR_FILES_TO_BE_COPIED    = "files_left_to_be_copied"
    VAR_IN_PROGRESS           = "in_progress"
    VAR_NUMBER_OF_CPUS        = "number_of_cpus"
    VAR_CPU_UTILIZATION       = "cpu_utilization"
    VAR_RAM_UTILIZATION       = "memory_utilization"
    VAR_NUM_OF_PROCESSES      = "number_of_processes"
    VAR_NUM_OF_THREADS        = "number_of_threads"
    VAR_NETWORK_CONNECTIONS   = "network_connections"
    VAR_DISK_PARTITIONS       = "disk_partitions"

    lock      = threading.Lock()
    _instance = None

    def __new__(cls):                                   # singleton pattern
        if cls._instance is None:
            cls._instance = super(Status, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        if hasattr(self, 'instantiated') is False:      # only set these values first time around
            self.instantiated     = True
            self.storage          = Storage()
            #self._on_save        = []                   # todo: add support for firing up events when data is saved
            self._status_data     = self.default_data()
            self.status_thread_on = False
            self.status_thread    = threading.Thread()

    @classmethod
    def clear_instance(cls):
        cls._instance = None

    def StatusThread(self, update_interval):
        while self.status_thread_on:
            self.get_server_status()
            sleep(update_interval)

    def StartStatusThread(self):
        if self.status_thread_on:
            return
        self.status_thread_on = True
        self.status_thread = threading.Thread(target=self.StatusThread, args=(1,))
        self.status_thread.start()

    def StopStatusThread(self):
        self.status_thread_on = False
        self.status_thread.join()

    def data(self):
        return self._status_data

    def default_data(self):
        return {Status.VAR_CURRENT_STATUS       : FileStatus.NONE,
                Status.VAR_FILES_COUNT          : 0,
                Status.VAR_FILES_COPIED         : 0,
                Status.VAR_FILES_TO_BE_COPIED   : 0,
                Status.VAR_FILES_TO_PROCESS     : 0,
                Status.VAR_FILES_LEFT_TO_PROCESS: 0,
                Status.VAR_COMPLETED            : 0,
                Status.VAR_FAILED               : 0,
                Status.VAR_IN_PROGRESS          : 0,
                Status.VAR_NUMBER_OF_CPUS       : psutil.cpu_count(),
                Status.VAR_CPU_UTILIZATION      : None,
                Status.VAR_RAM_UTILIZATION      : None,
                Status.VAR_NUM_OF_PROCESSES     : None,
                Status.VAR_NUM_OF_THREADS       : None,
                Status.VAR_NETWORK_CONNECTIONS  : None,
                Status.VAR_DISK_PARTITIONS      : len(psutil.disk_partitions())}

    def load_data(self):
        self._status_data = json_load_file(self.status_file_path())
        if self.data() == {}:
            self.reset()
        return self

    def reset(self):
        self._status_data = self.default_data()
        self.save()
        return self

    def save(self):
        if not file_exists(self.status_file_path()):
            folder_create(self.storage.hd2_status())
            file_create(self.status_file_path())
        json_save_file_pretty(self.data(), self.status_file_path())
        return self

    def status_file_path(self):
        return path_combine(self.storage.hd2_status(), Status.STATUS_FILE_NAME)

    def get_server_data(self):
        self._status_data[Status.VAR_NUMBER_OF_CPUS]  = psutil.cpu_count()
        self._status_data[Status.VAR_CPU_UTILIZATION] = psutil.cpu_percent(interval=1, percpu=True)
        self._status_data[Status.VAR_RAM_UTILIZATION] = psutil.virtual_memory().percent

        pids = psutil.pids()
        self._status_data[Status.VAR_NUM_OF_PROCESSES] = len(pids)

        thread_count = 0
        for pid in pids:
            try:
                p = psutil.Process(int(pid))
                process_threads = p.num_threads()
                thread_count += process_threads
            except:
                pass
        self._status_data[Status.VAR_NUM_OF_THREADS] = thread_count

        self._status_data[Status.VAR_NETWORK_CONNECTIONS] = len(psutil.net_connections(kind='tcp'))
        self._status_data[Status.VAR_DISK_PARTITIONS]     = len(psutil.disk_partitions())

    def get_server_status(self):
        Status.lock.acquire()
        try:
            self.get_server_data()
        finally:
            Status.lock.release()
        self.save()
        return self

    def set_processing_status(self, processing_status):
        Status.lock.acquire()
        try:
            data = self.data()
            data[Status.VAR_CURRENT_STATUS] = processing_status
        finally:
            Status.lock.release()
        self.save()
        return self

    def set_started(self):
        return self.set_processing_status(Processing_Status.STARTED)

    def set_stopped(self):
        return self.set_processing_status(Processing_Status.STOPPED)

    def set_phase_1(self):
        return self.set_processing_status(Processing_Status.PHASE_1)

    def set_phase_2(self):
        return self.set_processing_status(Processing_Status.PHASE_2)

    def update_counters(self, updated_status, count=0):
        Status.lock.acquire()
        try:
            data = self.data()
            if updated_status == FileStatus.NONE:
                data[Status.VAR_FILES_COUNT]        = count
                data[Status.VAR_FILES_TO_BE_COPIED] = count
            elif updated_status == FileStatus.INITIAL:
                data[Status.VAR_FILES_COPIED] += 1
                if data[Status.VAR_FILES_TO_BE_COPIED] > 0:
                    data[Status.VAR_FILES_TO_BE_COPIED] -= 1
            elif updated_status == FileStatus.NOT_COPIED:
                if data[Status.VAR_FILES_TO_BE_COPIED] > 0:
                    data[Status.VAR_FILES_TO_BE_COPIED] -= 1
            elif updated_status == FileStatus.IN_PROGRESS:
                data[Status.VAR_IN_PROGRESS] += 1
            elif updated_status == FileStatus.COMPLETED:
                data[Status.VAR_COMPLETED] += 1
                if data[Status.VAR_IN_PROGRESS] > 0:
                    data[Status.VAR_IN_PROGRESS] -= 1
                if data[Status.VAR_FILES_LEFT_TO_PROCESS] > 0:
                    data[Status.VAR_FILES_LEFT_TO_PROCESS] -= 1
            elif updated_status == FileStatus.FAILED:
                data[Status.VAR_FAILED] += 1
                if data[Status.VAR_IN_PROGRESS] > 0:
                    data[Status.VAR_IN_PROGRESS] -= 1
                if data[Status.VAR_FILES_LEFT_TO_PROCESS] > 0:
                    data[Status.VAR_FILES_LEFT_TO_PROCESS] -= 1
            elif updated_status == FileStatus.TO_PROCESS:
                data[Status.VAR_FILES_TO_PROCESS]      += 1
                data[Status.VAR_FILES_LEFT_TO_PROCESS] += 1
        finally:
            Status.lock.release()
        self.save()
        return self

    def set_processing_counters(self, count):
        Status.lock.acquire()
        try:
            data = self.data()
            data[Status.VAR_IN_PROGRESS]           = 0
            data[Status.VAR_FAILED]                = 0
            data[Status.VAR_COMPLETED]             = 0
            data[Status.VAR_FILES_TO_PROCESS]      = count
            data[Status.VAR_FILES_LEFT_TO_PROCESS] = count
        finally:
            Status.lock.release()
        self.save()
        return self

    def add_completed(self):
        return self.update_counters(FileStatus.COMPLETED)

    def add_failed(self):
        return self.update_counters(FileStatus.FAILED)

    def add_file(self):
        return self.update_counters(FileStatus.INITIAL)

    def set_files_count(self, count):
        return self.update_counters(FileStatus.NONE, count)

    def set_not_copied(self):
        return self.update_counters(FileStatus.NOT_COPIED)

    def add_in_progress(self):
        return self.update_counters(FileStatus.IN_PROGRESS)

    def add_to_be_processed(self):
        return self.update_counters(FileStatus.TO_PROCESS)

    def get_completed(self):
        return self.data().get(Status.VAR_COMPLETED)

    def get_current_status(self):
        return self.data().get(Status.VAR_CURRENT_STATUS)

    def get_failed(self):
        return self.data().get(Status.VAR_FAILED)

    def get_files_count(self):
        return self.data().get(Status.VAR_FILES_COUNT)

    def get_files_copied(self):
        return self.data().get(Status.VAR_FILES_COPIED)

    def get_files_to_process(self):
        return self.data().get(Status.VAR_FILES_TO_PROCESS)

    def get_in_progress(self):
        return self.data().get(Status.VAR_IN_PROGRESS)
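Status is a process-wide singleton: every component that constructs Status() shares the same counters, and each update is persisted to status.json under hd2/status. A minimal sketch of the intended call pattern (folder locations come from Config, so no arguments are needed):

status = Status()                                       # same instance everywhere, thanks to the __new__/__init__ guard
status.reset()                                          # write a clean status.json
status.set_files_count(3)                               # phase 1 bookkeeping
status.add_file()                                       # one file copied into hd2
status.add_to_be_processed()
status.add_completed()                                  # phase 2 bookkeeping
print(status.get_completed(), status.get_current_status())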
class Pre_Processor:

    def __init__(self):
        self.config        = Config()
        self.meta_service  = Metadata_Service()
        self.status        = Status()
        self.storage       = Storage()
        self.file_name     = None          # set in process() method
        self.current_path  = None
        self.base_folder   = None
        self.dst_folder    = None
        self.dst_file_name = None
        self.status.reset()
        #self.analysis_json = Analysis_Json()

    @log_duration
    def clear_data_and_status_folders(self):
        data_target      = self.storage.hd2_data()          # todo: refactor this clean up to the storage class
        status_target    = self.storage.hd2_status()
        processed_target = self.storage.hd2_processed()
        folder_delete_all(data_target)
        folder_delete_all(status_target)
        folder_delete_all(processed_target)
        folder_create(data_target)
        folder_create(status_target)
        folder_create(processed_target)
        self.status.reset()

    def file_hash(self, file_path):
        return self.meta_service.file_hash(file_path)

    def prepare_folder(self, folder_to_process):
        if folder_to_process.startswith(self.storage.hd1()):
            return folder_to_process
        dirname = os.path.join(self.storage.hd1(), os.path.basename(folder_to_process))
        if os.path.isdir(dirname):
            folder_delete_all(dirname)
        try:
            folder_copy(folder_to_process, dirname)
        finally:
            return dirname

    def process_folder(self, folder_to_process):
        if not os.path.isdir(folder_to_process):            # todo: add an event log
            return False
        folder_to_process = self.prepare_folder(folder_to_process)

        files_count = 0
        for folderName, subfolders, filenames in os.walk(folder_to_process):
            for filename in filenames:
                file_path = os.path.join(folderName, filename)
                if os.path.isfile(file_path):
                    files_count += 1
        self.status.set_files_count(files_count)

        for folderName, subfolders, filenames in os.walk(folder_to_process):
            for filename in filenames:
                file_path = os.path.join(folderName, filename)
                if os.path.isfile(file_path):
                    self.process(file_path)
        return True

    @log_duration
    def process_files(self):
        self.status.StartStatusThread()
        self.status.set_phase_1()
        self.process_folder(self.storage.hd1())
        self.status.set_phase_2()
        self.status.StopStatusThread()

    @log_duration
    def process(self, file_path):
        tik = datetime.now()
        metadata      = self.meta_service.create_metadata(file_path=file_path)
        file_name     = metadata.get_file_name()
        original_hash = metadata.get_original_hash()
        status        = metadata.get_rebuild_status()
        self.update_status(file_name, original_hash, status)
        tok   = datetime.now()
        delta = tok - tik

        if metadata.is_in_todo():
            hash_folder_path = self.storage.hd2_data(original_hash)
            self.meta_service.set_hd1_to_hd2_copy_time(hash_folder_path, delta.total_seconds())
        else:
            self.status.set_not_copied()

    def update_status(self, file_name, original_hash, status):
        if status == FileStatus.INITIAL:
            self.status.add_file()
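Pre_Processor drives stage 1: it walks hd1, hashes every file into a hash-named folder under hd2/data, and keeps the Status counters in step. A minimal sketch of how it is typically exercised (hd1 is whatever folder Config.hd1_location points at):

pre_processor = Pre_Processor()
pre_processor.clear_data_and_status_folders()           # start from an empty hd2/data and hd2/status
pre_processor.process_files()                           # phase 1: copy and hash every file currently in hd1
print(Status().get_files_count(), "files registered for processing")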
class Metadata:

    def __init__(self, file_hash=None):
        self.config         = Config()
        self.storage        = Storage()
        self.process_status = Status()
        self.metadata_utils = Metadata_Utils()
        self.path_hd1       = self.storage.hd1()
        self.data           = self.default_data()
        self.file_hash      = file_hash
        #self.time_field    =

    def get_from_file(self):                             # todo: refactor out this method
        self.load()
        return self.data

    def load(self):
        with open(self.metadata_file_path()) as json_file:
            self.data = json.load(json_file)
        return self

    def add_file(self, file_path):
        if file_exists(file_path):
            tik = datetime.now()
            self.set_file_hash(self.metadata_utils.file_hash(file_path))
            tok   = datetime.now()
            delta = tok - tik
            self.set_file_hash_calculation_time(delta.total_seconds())
            if self.exists():
                self.get_from_file()
            else:
                self.create(file_path)
            self.add_file_path(file_path)
            self.save()
            return self.file_hash

    def add_file_path(self, file_path: str):
        if self.file_hash:
            file_paths = self.data.get('original_file_paths')
            if 0 == len(file_paths):
                self.process_status.add_to_be_processed()
            if file_path.startswith(self.path_hd1):                 # check if path starts with hd1
                file_path = os.path.relpath(file_path, self.path_hd1)
            if file_path not in file_paths:
                file_paths.append(file_path)
            return file_paths

    def create(self, file_path):
        if self.file_hash:
            folder_create(self.metadata_folder_path())
            file_copy(file_path, self.source_file_path())
            self.set_original_file_size(file_path)
            self.set_original_file_extension(file_path)
            self.set_original_file_name(file_path)

    def default_data(self):
        return {'file_name'                     : None,
                'xml_report_status'             : None,
                'last_update_time'              : None,
                'rebuild_server'                : None,
                'server_version'                : None,
                'error'                         : None,
                'original_file_paths'           : [],
                'original_hash'                 : None,
                'original_hash_calculation_time': None,
                'original_file_extension'       : None,
                'original_file_size'            : None,
                'rebuild_file_path'             : None,
                'rebuild_hash'                  : None,
                'rebuild_status'                : FileStatus.INITIAL,
                'rebuild_file_extension'        : None,
                'rebuild_file_size'             : None,
                'rebuild_file_duration'         : None,
                'f2f_plugin_version'            : None,
                'f2f_plugin_git_commit'         : None,
                'hd1_to_hd2_copy_time'          : None,
                'hd2_to_hd3_copy_time'          : None}

    def delete(self):
        if self.exists():
            folder_delete_all(self.metadata_folder_path())
            return self.exists() is False
        return False

    def exists(self):
        return folder_exists(self.metadata_folder_path())

    def metadata_file_exists(self):
        return file_exists(self.metadata_file_path())

    def metadata_file_path(self):
        if self.file_hash:                               # todo: find a better solution than having to add this to all methods
            return path_combine(self.metadata_folder_path(), DEFAULT_METADATA_FILENAME)

    def metadata_folder_path(self):
        if not self.file_hash:
            return
        path = self.storage.hd2_not_processed(self.file_hash)
        if folder_exists(path):
            return path
        path = self.storage.hd2_processed(self.file_hash)
        if folder_exists(path):
            return path
        # never processed - must be in the 'todo' folder
        path = self.storage.hd2_data(self.file_hash)
        return path

    def is_in_todo(self):
        return folder_exists(self.storage.hd2_data(self.file_hash))

    def is_in_processed(self):
        return folder_exists(self.storage.hd2_processed(self.file_hash))

    def is_in_not_processed(self):
        return folder_exists(self.storage.hd2_not_processed(self.file_hash))

    def save(self):
        if self.exists():
            json_save_file_pretty(python_object=self.data, path=self.metadata_file_path())

    def update_field(self, field, updated_value):        # todo: optimise this if we get performance hits due to multiple updates
        self.data[field]              = updated_value
        self.data['last_update_time'] = datetime_now()
        self.save()

    def set_file_hash(self, file_hash):
        self.file_hash                = file_hash
        self.data['original_hash']    = file_hash
        self.data['last_update_time'] = datetime_now()
        if not self.exists():
            self.save()

    def set_file_hash_calculation_time(self, seconds):
        self.data['original_hash_calculation_time'] = seconds

    def set_original_file_name(self, file_path):
        original_file_name = file_name(file_path)
        self.update_field('file_name', original_file_name)

    def set_original_file_size(self, file_path):
        file_size = os.path.getsize(file_path)
        self.update_field('original_file_size', file_size)

    def set_original_file_extension(self, file_path):
        extension = pathlib.Path(file_path).suffix
        self.update_field('original_file_extension', extension)

    def source_file_path(self):
        if self.file_hash:
            return path_combine(self.metadata_folder_path(), DEFAULT_SOURCE_FILENAME)

    def get_original_hash(self):
        return self.data.get('original_hash')

    def get_file_hash(self):
        return self.file_hash

    def get_file_name(self):
        return self.data.get('file_name')

    def get_rebuild_status(self):
        return self.data.get('rebuild_status')

    def get_original_file_paths(self):
        return self.data.get('original_file_paths')

    def get_last_update_time(self):
        return self.data.get('last_update_time')

    def get_error(self):
        return self.data.get('error')

    def get_original_file_extension(self):
        return self.data.get('original_file_extension')

    def report_file_path(self):
        if self.file_hash:
            return path_combine(self.metadata_folder_path(), DEFAULT_REPORT_FILENAME)

    def report_file_exists(self):
        return file_exists(self.report_file_path())
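Metadata owns one hash-named folder under hd2 (data, processed, or not_processed) plus the JSON file inside it. A minimal sketch of the add_file flow as seen from the outside (the file path is a throwaway example and folder locations come from Config):

metadata  = Metadata()
file_hash = metadata.add_file('/tmp/sample.pdf')        # hashes the file, creates the hash folder, copies the source, saves the JSON
print(file_hash)
print(metadata.get_rebuild_status())                    # FileStatus.INITIAL until stage 2 rebuilds the file
print(metadata.metadata_file_path())                    # path to DEFAULT_METADATA_FILENAME inside the hash folder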