def update_report(self, index, report_json):
    try:
        self.get_from_file()
        self.analysis_data[index]["original_hash"] = index
        self.analysis_data[index]["file_type"] = report_json["gw:GWallInfo"]["gw:DocumentStatistics"]["gw:DocumentSummary"]["gw:FileType"]
        self.analysis_data[index]["file_size"] = report_json["gw:GWallInfo"]["gw:DocumentStatistics"]["gw:DocumentSummary"]["gw:TotalSizeInBytes"]
        self.analysis_data[index]["remediated_item_count"], \
            self.analysis_data[index]["remediate_items_list"] = self.get_remediated_item_details(report_json)
        self.analysis_data[index]["sanitised_item_count"], \
            self.analysis_data[index]["sanitised_items_list"] = self.get_sanitisation_item_details(report_json)
        self.analysis_data[index]["issue_item_count"], \
            self.analysis_data[index]["issue_item_list"] = self.get_issue_item_details(report_json)
        self.write_to_file()
    except Exception as error:
        log_error(message=f"Error in update_report from json data {index} : {error}")
def get_xmlreport(self, endpoint, fileId, dir):
    log_info(message=f"getting XML Report for {fileId} at {endpoint}")
    xmlreport = self.xmlreport_request(endpoint, fileId)
    if not xmlreport:
        raise ValueError('Failed to obtain the XML report')
    try:
        json_obj = xmltodict.parse(xmlreport)
        file_extension = json_obj["gw:GWallInfo"]["gw:DocumentStatistics"]["gw:DocumentSummary"]["gw:FileType"]
        self.meta_service.set_rebuild_file_extension(dir, file_extension)
        json_obj['original_hash'] = os.path.basename(dir)
        json_save_file_pretty(json_obj, os.path.join(dir, "report.json"))
        #self.report_elastic.add_report(json_obj)
        analysis_obj = self.analysis_json.get_file_analysis(os.path.basename(dir), json_obj)
        json_save_file_pretty(analysis_obj, os.path.join(dir, "analysis.json"))
        self.analysis_elastic.add_analysis(analysis_obj)
        return True
    except Exception as error:
        log_error(message=f"Error in parsing xmlreport for {fileId} : {error}")
        return False
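# A minimal, runnable sketch of how xmltodict maps the namespaced engine
# report into the dictionary keys used above. The XML snippet and the
# namespace URI are illustrative, not a real engine response:
import xmltodict

sample_xml = """
<gw:GWallInfo xmlns:gw="http://example.com/gw-namespace">
  <gw:DocumentStatistics>
    <gw:DocumentSummary>
      <gw:FileType>docx</gw:FileType>
      <gw:TotalSizeInBytes>12345</gw:TotalSizeInBytes>
    </gw:DocumentSummary>
  </gw:DocumentStatistics>
</gw:GWallInfo>
"""

report = xmltodict.parse(sample_xml)
# xmltodict keeps namespace prefixes in the key names, hence the "gw:" lookups;
# element text is returned as strings, which is why the code casts sizes with int():
assert report["gw:GWallInfo"]["gw:DocumentStatistics"]["gw:DocumentSummary"]["gw:FileType"] == "docx"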
def get_valid_endpoints(self, endpoint_string):
    self.reset_last_error()
    try:
        valid_endpoints = {'Endpoints': []}
        endpoint_json = json.loads(endpoint_string)
        endpoint_count = len(endpoint_json['Endpoints'])
        for idx in range(endpoint_count):
            server_url = "http://" + endpoint_json['Endpoints'][idx]['IP'] + ":" + \
                         endpoint_json['Endpoints'][idx]['Port']
            response = self.gw_sdk_healthcheck(server_url)
            if response and response.status_code == 200:
                valid_endpoints['Endpoints'].append(endpoint_json['Endpoints'][idx])
        valid_endpoints_count = len(valid_endpoints['Endpoints'])
        if valid_endpoints_count == 0:
            return None
        return json.dumps(valid_endpoints)
    except Exception as e:
        self.last_error_message = f'Configure_Env : get_valid_endpoints : {e}'
        log_error(f'Configure_Env : get_valid_endpoints : {e}')
        raise ValueError(str(e))
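# For reference, a sketch of the endpoint string that get_valid_endpoints()
# and configure_endpoints() expect; the IPs and ports are illustrative.
# Note that 'Port' is a string, since it is concatenated into the URL:
example_endpoint_string = '{"Endpoints": [{"IP": "127.0.0.1", "Port": "8080"}, {"IP": "10.0.0.2", "Port": "8080"}]}'
# Each entry is health-checked via gw_sdk_healthcheck() and only servers that
# answer 200 on api/health/ are kept in the returned JSON string.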
def test__start_logging(self):
    # todo: understand better why this test takes about 1.1 secs to execute
    #       (some of it is caused by the processing process starting, and elastic being setup)
    log_worker = start_logging()                        # trigger logging process
    log_info()                                          # send 4 log messages
    log_warning()
    log_info(message=random_text(), data={'a': 42})
    log_error(message='an error')
def ProcessDirectoryWithEndpoint(self, itempath, file_hash, endpoint_index):
    if not os.path.isdir(itempath):
        return False
    log_info(message=f"Starting ProcessDirectoryWithEndpoint on endpoint # {endpoint_index} for file {file_hash}")

    meta_service = Metadata_Service()
    original_file_path = meta_service.get_original_file_paths(itempath)
    events = Events_Log(itempath)

    endpoint = "http://" + self.config.endpoints['Endpoints'][endpoint_index]['IP'] + ":" + \
               self.config.endpoints['Endpoints'][endpoint_index]['Port']
    events.add_log("Processing with: " + endpoint)

    meta_service.set_f2f_plugin_version(itempath, API_VERSION)
    meta_service.set_f2f_plugin_git_commit(itempath, self.git_commit())

    try:
        file_processing = File_Processing(events, self.events_elastic, self.report_elastic,
                                          self.analysis_elastic, meta_service)
        if not file_processing.processDirectory(endpoint, itempath):
            events.add_log("CANNOT be processed")
            return False

        log_data = {
            'file': original_file_path,
            'status': FileStatus.COMPLETED,
            'error': 'none',
            'timestamp': datetime.now(),
        }
        log_info('ProcessDirectoryWithEndpoint', data=log_data)
        meta_service.set_error(itempath, "none")
        meta_service.set_status(itempath, FileStatus.COMPLETED)
        self.hash_json.update_status(file_hash, FileStatus.COMPLETED)
        events.add_log("Has been processed")
        return True
    except Exception as error:
        log_data = {
            'file': original_file_path,
            'status': FileStatus.FAILED,
            'error': str(error),
        }
        log_error(message='error in ProcessDirectoryWithEndpoint', data=log_data)
        meta_service.set_error(itempath, str(error))
        meta_service.set_status(itempath, FileStatus.FAILED)
        self.hash_json.update_status(file_hash, FileStatus.FAILED)
        events.add_log("ERROR:" + str(error))
        return False
def ProcessSingleFile(self):
    if self.IsProcessing():
        log_error("ERROR: Attempt to start processing while processing is in progress")
        return False
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(self.LoopHashDirectoriesAsync(1, True))
    return True
def LoopHashDirectoriesSequential(self):
    # Allow only a single loop to be run at a time
    if self.IsProcessing():
        log_error("ERROR: Attempt to start processing while processing is in progress")
        return False
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(self.LoopHashDirectoriesAsync(1))
    return True
def env_details(self):
    self.reset_last_error()
    try:
        return {
            "hd1_path": environ.get('HD1_LOCATION'),
            "hd2_path": environ.get('HD2_LOCATION'),
            "hd3_path": environ.get('HD3_LOCATION')
        }
    except Exception as e:
        self.last_error_message = f'Configure_Env : env_details : {e}'
        log_error(f'Configure_Env : env_details : {e}')
        raise ValueError(str(e))
def gw_sdk_healthcheck(self, server_url):
    self.reset_last_error()
    try:
        api_route = "api/health/"
        url = urljoin(server_url, api_route)
        response = requests.request("GET", url, verify=False, timeout=10)
        return response
    except Exception as e:
        self.last_error_message = f'Configure_Env : gw_sdk_healthcheck : {e}'
        log_error(f'Configure_Env : gw_sdk_healthcheck : {e}')
        return None
def add_file(self, file_hash, file_name):
    if self.is_hash(file_hash) and file_name:
        json_value = {"file_name": file_name}
        json_data = {file_hash: json_value}
        self.analysis_data.update(json_data)
        self.write_to_file()
        return True
    log_error(message='in Analysis_Json.add_file bad data provided',
              data={'file_hash': file_hash, 'file_name': file_name})
    return False
def base64request(self, endpoint, api_route, base64enc_file):
    try:
        url = endpoint + "/" + api_route
        payload = json.dumps({"Base64": base64enc_file})
        headers = {'Content-Type': 'application/json'}
        return requests.request("POST", url, headers=headers, data=payload,
                                timeout=int(self.config.request_timeout))
    except Exception as e:
        log_error(str(e))
        raise ValueError(str(e))
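# A minimal, self-contained sketch of the request that base64request() sends.
# The server URL, the api_route value, and send_base64_rebuild itself are
# illustrative assumptions, not names taken from this repo:
import base64, json, requests

def send_base64_rebuild(endpoint, api_route, file_path, timeout=60):
    # encode the file content as base64, matching the {"Base64": ...} payload above
    with open(file_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode()
    url = endpoint + "/" + api_route
    headers = {'Content-Type': 'application/json'}
    return requests.post(url, headers=headers, data=json.dumps({"Base64": encoded}), timeout=timeout)

# Example (assumes a rebuild engine is listening locally on an assumed route):
# response = send_base64_rebuild("http://127.0.0.1:8080", "api/rebuild/base64", "/tmp/sample.docx")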
def LoopHashDirectories(self, thread_count=None):
    # Allow only a single loop to be run at a time
    if self.IsProcessing():
        log_error(message="ERROR: Attempt to start processing while processing is in progress")
        return False

    self.status.StartStatusThread()
    thread_count = thread_count or self.config.thread_count
    log_info(message="in LoopHashDirectories, about to start main loop")
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(self.LoopHashDirectoriesAsync(thread_count))
    log_info(message="in LoopHashDirectories, Loop completed")
    self.status.StopStatusThread()
    return True
def get_file_analysis(self, index, report_json):
    try:
        meta_service = Metadata_Service()
        metadata = meta_service.get_from_file(index)
        self.file_analysis_data = {}
        self.file_analysis_data["file_name"] = metadata.data.get('file_name')
        self.file_analysis_data["rebuild_file_extension"] = metadata.data.get('rebuild_file_extension')
        self.file_analysis_data["rebuild_file_size"] = metadata.data.get('rebuild_file_size')
        self.file_analysis_data["original_hash"] = index
        self.file_analysis_data["rebuild_hash"] = metadata.data.get('rebuild_hash')
        self.file_analysis_data["file_type"] = report_json["gw:GWallInfo"]["gw:DocumentStatistics"]["gw:DocumentSummary"]["gw:FileType"]
        self.file_analysis_data["file_size"] = int(
            report_json["gw:GWallInfo"]["gw:DocumentStatistics"]["gw:DocumentSummary"]["gw:TotalSizeInBytes"])
        self.file_analysis_data["remediated_item_count"], \
            self.file_analysis_data["remediate_items_list"] = self.get_remediated_item_details(report_json)
        self.file_analysis_data["sanitised_item_count"], \
            self.file_analysis_data["sanitised_items_list"] = self.get_sanitisation_item_details(report_json)
        self.file_analysis_data["issue_item_count"], \
            self.file_analysis_data["issue_item_list"] = self.get_issue_item_details(report_json)
        self.file_analysis_data["threat_analysis"] = self.get_threat_analysis(
            self.file_analysis_data["sanitised_items_list"])
        return self.file_analysis_data
    except Exception as error:
        log_error(message=f"Error in get_file_analysis from json data {index} : {error}")
def configure(self, hd1_path=None, hd2_path=None, hd3_path=None):
    self.reset_last_error()
    try:
        dotenv_file = dotenv.find_dotenv()
        if hd1_path:
            if path.exists(hd1_path):
                environ['HD1_LOCATION'] = hd1_path
                dotenv.set_key(dotenv_file, "HD1_LOCATION", environ["HD1_LOCATION"])
            else:
                self.last_error_message = f"hd1_path did not exist: {hd1_path}"
                log_error(message="hd1_path did not exist", data={"path": hd1_path})
                return -1
        if hd2_path:
            if not path.exists(hd2_path):
                folder_create(hd2_path)
                folder_create(path_combine(hd2_path, DEFAULT_HD2_DATA_NAME))
                folder_create(path_combine(hd2_path, DEFAULT_HD2_STATUS_NAME))
            environ['HD2_LOCATION'] = hd2_path
            dotenv.set_key(dotenv_file, "HD2_LOCATION", environ["HD2_LOCATION"])
        if hd3_path:
            if not path.exists(hd3_path):
                folder_create(hd3_path)
            environ['HD3_LOCATION'] = hd3_path
            dotenv.set_key(dotenv_file, "HD3_LOCATION", environ["HD3_LOCATION"])
        self.config.load_values()
        return self.env_details()
    except Exception as e:
        self.last_error_message = f'Configure_Env : configure : {e}'
        log_error(f'Configure_Env : configure : {e}')
        raise ValueError(str(e))
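# For reference, a sketch of the .env entries that configure() and
# configure_endpoints() write via dotenv.set_key(). The paths are
# illustrative; the role of each folder is inferred from do_rebuild(),
# which maps hd1 originals to hd3 output and stages work under hd2:
#
#   HD1_LOCATION=/data/hd1   # original incoming files
#   HD2_LOCATION=/data/hd2   # staging area (data/ and status/ subfolders)
#   HD3_LOCATION=/data/hd3   # rebuilt output files
#   ENDPOINTS={"Endpoints": [{"IP": "127.0.0.1", "Port": "8080"}]}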
def configure_endpoints(self, endpoint_string):
    self.reset_last_error()
    try:
        dotenv_file = dotenv.find_dotenv()
        valid_endpoint_string = self.get_valid_endpoints(endpoint_string)
        if valid_endpoint_string:
            environ['ENDPOINTS'] = valid_endpoint_string
            logger.info(f"ENDPOINTS : {environ['ENDPOINTS']}")
            dotenv.set_key(dotenv_file, "ENDPOINTS", environ["ENDPOINTS"])
            self.config.load_values()
            return json.loads(environ['ENDPOINTS'])
        else:
            self.last_error_message = f"No valid endpoint found in: {endpoint_string}"
            log_error(message="No valid endpoint found", data={"endpoints": endpoint_string})
            return -1
    except Exception as e:
        self.last_error_message = f'Configure_Env : configure_endpoints : {e}'
        log_error(f'Configure_Env : configure_endpoints : {e}')
        raise ValueError(str(e))
def add_file(self, file_hash, file_name):
    if self.is_hash(file_hash) and file_name:
        with Hash_Json.lock:
            json_value = {
                "file_name": file_name,
                "file_status": FileStatus.INITIAL
            }
            json_data = {file_hash: json_value}
            self.data().update(json_data)
        return True
    log_error(message='in Hash_Json.add_file bad data provided',
              data={'file_hash': file_hash, 'file_name': file_name})
    return False
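# For reference, a sketch of the per-hash structure that Hash_Json.add_file()
# builds and that LoopHashDirectoriesInternal() later reads. The hash, file
# name, and the serialized value of FileStatus.INITIAL are illustrative
# assumptions:
#
#   {
#     "9f86d081884c7d65...": {
#       "file_name": "invoice.pdf",
#       "file_status": "Initial"    # updated to COMPLETED / FAILED as processing runs
#     }
#   }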
def do_rebuild(self, endpoint, hash, source_path, dir):
    log_info(message=f"Starting rebuild for file {hash} on endpoint {endpoint}")
    with Duration() as duration:
        event_data = {
            "endpoint": endpoint,
            "hash": hash,
            "source_path": source_path,
            "dir": dir
        }  # todo: see if we can use a variable that holds the params data
        self.add_event_log('Starting File rebuild', event_data)
        self.meta_service.set_rebuild_server(dir, endpoint)

        encodedFile = FileService.base64encode(source_path)
        if not encodedFile:
            message = f"Failed to encode the file: {hash}"
            log_error(message=message)
            self.add_event_log(message)
            self.meta_service.set_error(dir, message)
            return False

        response = self.rebuild(endpoint, encodedFile)
        result = response.text
        if not result:
            message = f"Failed to rebuild the file : {hash}"
            log_error(message=message)
            self.add_event_log(message)
            self.meta_service.set_error(dir, message)
            return False

        try:
            for path in self.meta_service.get_original_file_paths(dir):
                #rebuild_file_path = path
                if path.startswith(self.config.hd1_location):
                    rebuild_file_path = path.replace(self.config.hd1_location, self.config.hd3_location)
                else:
                    rebuild_file_path = os.path.join(self.config.hd3_location, path)

                folder_create(parent_folder(rebuild_file_path))                     # make sure parent folder exists
                final_rebuild_file_path = self.save_file(result, rebuild_file_path) # returns actual file saved (which could be .html)

                # todo: improve the performance of these updates since each will trigger a save
                file_size = os.path.getsize(final_rebuild_file_path)                # calculate rebuilt file size
                rebuild_hash = self.meta_service.file_hash(final_rebuild_file_path) # calculate hash of final_rebuild_file_path
                self.meta_service.set_rebuild_file_size(dir, file_size)
                self.meta_service.set_rebuild_file_path(dir, final_rebuild_file_path)  # capture final_rebuild_file_path
                self.meta_service.set_rebuild_hash(dir, rebuild_hash)                  # capture it

            if not FileService.base64decode(result):
                message = "Engine response could not be decoded"
                log_error(message=message, data=f"{result}")
                self.meta_service.set_error(dir, message)
                return False
        except Exception as error:
            message = f"Error Saving file for {hash} : {error}"
            log_error(message=message)
            self.meta_service.set_xml_report_status(dir, "No Report")
            self.meta_service.set_error(dir, message)
            return False

        headers = response.headers
        fileIdKey = "X-Adaptation-File-Id"

        # get XML report
        if fileIdKey in headers:
            if self.get_xmlreport(endpoint, headers[fileIdKey], dir):
                self.add_event_log('The XML report has been saved')
                self.meta_service.set_xml_report_status(dir, "Obtained")
            else:
                self.meta_service.set_xml_report_status(dir, "No XML Report")
        else:
            self.meta_service.set_xml_report_status(dir, "Failed to obtain")
            message = f'No X-Adaptation-File-Id header found in the response for {hash}'
            log_error(message)
            self.add_event_log(message)
            self.meta_service.set_error(dir, message)
            return False
            #raise ValueError("No X-Adaptation-File-Id header found in the response")

        # todo: add when server side supports this
        # SDKEngineVersionKey = "X-SDK-Engine-Version"
        # SDKAPIVersionKey    = "X-SDK-Api-Version"
        #
        # if SDKEngineVersionKey in headers:
        #     self.sdk_engine_version = headers[SDKEngineVersionKey]
        # if SDKAPIVersionKey in headers:
        #     self.sdk_api_version = headers[SDKAPIVersionKey]
        #
        # self.meta_service.set_server_version(dir, "Engine:" + self.sdk_engine_version + " API:" + self.sdk_api_version)

        log_info(message=f"rebuild ok for file {hash} on endpoint {endpoint} took {duration.seconds()} seconds")
    return True
def LoopHashDirectoriesInternal(self, thread_count, do_single):
    if folder_exists(self.storage.hd2_data()) is False:
        log_message = "ERROR: rootdir does not exist: " + self.storage.hd2_data()
        log_error(log_message)
        return False

    if not isinstance(thread_count, int):
        raise TypeError("thread_count must be an integer")
    if not isinstance(do_single, bool):
        raise TypeError("do_single must be a boolean")

    log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
    self.events.add_log(log_message)
    log_info(log_message)

    json_list = self.updateHashJson()

    threads = list()
    process_index = 0

    log_info(message=f'before Mapping thread_data for {len(json_list)} files')
    thread_data = []
    for key in json_list:
        file_hash = key
        itempath = self.storage.hd2_data(key)

        if (FileStatus.COMPLETED == json_list[key]["file_status"]):
            self.events.add_log(f"The file processing has already been completed")
            continue

        if not os.path.exists(itempath):
            self.events.add_log(f"ERROR: Path \"{itempath}\" does not exist")
            json_list[key]["file_status"] = FileStatus.FAILED
            continue

        process_index += 1
        thread_data.append((itempath, file_hash, process_index,))

        # # limit the number of parallel threads
        #
        # if process_index % int(thread_count) == 0:       # todo: refactor this workflow to use multiprocess and queues
        #     # Clean up the threads
        #     for index, thread in enumerate(threads):     # todo: since at the moment this will block allocating new threads
        #         thread.join()                            #       until all have finished execution
        #
        # process_index += 1
        # log_info(message=f"in LoopHashDirectoriesInternal process_index={process_index} , thread #{process_index % int(thread_count) }")
        # x = threading.Thread(target=self.ProcessDirectory, args=(itempath, file_hash, process_index,))
        # threads.append(x)
        # x.start()
        #
        # if do_single:
        #     break
        #
        # if not Loops.continue_processing:
        #     break

    # for index, thread in enumerate(threads):
    #     thread.join()

    log_info(message=f'after mapped thread_data, there are {len(thread_data)} mapped items')
    #thread_data = thread_data[:500]
    #log_info(message=f'to start with only processing {len(thread_data)} thread_data items')

    pool = ThreadPool(thread_count)
    results = pool.map(self.ProcessDirectory, thread_data)
    pool.close()
    pool.join()

    self.moveProcessedFiles()
    self.events.add_log("LoopHashDirectoriesInternal finished")
    return True
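# A minimal, self-contained sketch of the ThreadPool fan-out pattern used
# above: multiprocessing.pool.ThreadPool maps a worker function over a list
# of work tuples using a fixed number of threads. The worker name and work
# items below are illustrative:
from multiprocessing.pool import ThreadPool

def process_directory(work_item):
    # each mapped call receives one (itempath, file_hash, process_index) tuple
    itempath, file_hash, process_index = work_item
    print(f"#{process_index}: processing {file_hash} at {itempath}")
    return True

work = [("/hd2/data/aaa", "aaa", 1), ("/hd2/data/bbb", "bbb", 2)]
pool = ThreadPool(2)                          # two worker threads, like thread_count
results = pool.map(process_directory, work)   # blocks until all items are processed
pool.close()
pool.join()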