Example #1
 def __init__(self, file_hash=None):
     self.config = Config()
     self.storage = Storage()
     self.process_status = Status()
     self.metadata_utils = Metadata_Utils()
     self.path_hd1 = self.storage.hd1()
     self.data = self.default_data()
     self.file_hash = file_hash
 def __init__(self):
     if not hasattr(self, 'instantiated'):               # only set these values first time around
         self.instantiated = True
         self.storage = Storage()
         #self._on_save      = []                             # todo: add support for firing up events when data is saved
         self._status_data = self.default_data()
         self.status_thread_on = False
         self.status_thread = threading.Thread()
    def __init__(self):
        self.config             = Config()
        self.hd1_base_location  = self.config.hd1_location
        self.hd2_base_location  = self.config.hd2_location
        self.hd3_base_location  = self.config.hd3_location
        self.zip_folder         = os.path.join(os.getcwd(),"zip_folder")
        self.storage            = Storage()

        folder_delete_all(self.zip_folder)
        create_folder(self.zip_folder)
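
The last two calls above reset the working zip_folder on every construction: delete whatever a previous run left behind, then recreate the folder empty. A minimal sketch of the same reset idiom in isolation (the folder name is illustrative, not from the plugin):

work_folder = os.path.join(os.getcwd(), "scratch")     # illustrative path
folder_delete_all(work_folder)                         # remove the folder and anything inside it
create_folder(work_folder)                             # recreate it empty, ready for this run
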
Example #4
    def __init__(self):
        self.config = Config()
        self.meta_service = Metadata_Service()
        self.status = Status()
        self.storage = Storage()
        self.file_name = None  # set in process() method
        self.current_path = None
        self.base_folder = None
        self.dst_folder = None
        self.dst_file_name = None

        self.status = Status()
        self.status.reset()
Example #5
 def __init__(self):
     self.use_es = False
     self.config = Config()
     self.status = Status()
     self.storage = Storage()
     self.hash_json = Hash_Json()
     self.events = Events_Log(self.config.hd2_status_location)
     self.events_elastic = Events_Log_Elastic()
     self.hash = None
     self.report_elastic = Report_Elastic()
     self.analysis_elastic = Analysis_Elastic()
     self.report_elastic.setup()
     self.analysis_elastic.setup()
     create_folder(self.storage.hd2_processed())
     create_folder(self.storage.hd2_not_processed())
Example #6
 def __init__(self):
     self.config = Config()
     self.storage = Storage()
     self.folder = os.path.join(self.config.hd2_location, "status")
     self.analysis_data = {}
     self.id = 0
     self.get_from_file()
Example #7
 def __init__(self):
     if not hasattr(self, '_hash_json_data'):            # only set these values first time around
         self.config = Config()
         self.storage = Storage()
         self._hash_json_data = {}
         self.load()
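
The hasattr guard above is one half of a singleton: __new__ (shown in full in the Status class later in this section) returns the same object every time, while __init__ still runs on every call, so the one-time setup has to be protected. A self-contained sketch of the idiom, with illustrative names:

class Singleton_Sketch:
    _instance = None

    def __new__(cls):
        if cls._instance is None:                        # the first call creates the instance ...
            cls._instance = super(Singleton_Sketch, cls).__new__(cls)
        return cls._instance                             # ... every later call reuses it

    def __init__(self):
        if not hasattr(self, 'instantiated'):            # __init__ runs on every call, so guard the setup
            self.instantiated = True
            self.data = {}

assert Singleton_Sketch() is Singleton_Sketch()          # both calls return the same object
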
Example #8
class test_Storage(Temp_Config):

    def setUp(self) -> None:
        self.config        = Config()
        self.local_storage = Storage()

    def test_hd1_hd2_hd3(self):
        assert self.local_storage.hd1() == abspath(self.config.hd1_location)
        assert self.local_storage.hd2() == abspath(self.config.hd2_location)
        assert self.local_storage.hd3() == abspath(self.config.hd3_location)

    def test_hd1_add_file(self):
        test_file      = temp_file(contents=random_text())
        test_file_name = file_name(test_file)
        file_in_hd1    = self.storage.hd1_add_file(test_file)
        assert file_exists(file_in_hd1)
        assert file_contents(file_in_hd1) == file_contents(test_file)
        assert self.storage.hd1_file_path(test_file_name) == file_in_hd1

    def test_hd1_files(self):
        new_files = self.add_test_files(count=2)
        hd1_files = self.storage.hd1_files()
        assert len(hd1_files) >= len(new_files)
        assert new_files[0] in hd1_files
        assert new_files[1] in hd1_files

    def test_hd2_metadatas(self):
        self.add_test_files(count=10, text_size=1000, execute_stage_1=True)
        metadatas = self.storage.hd2_metadatas()
        assert list_set(metadatas[0]) == [ 'error', 'f2f_plugin_git_commit', 'f2f_plugin_version', 'file_name', 'hd1_to_hd2_copy_time', 'hd2_to_hd3_copy_time', 'last_update_time', 'original_file_extension', 'original_file_paths', 'original_file_size', 'original_hash', 'original_hash_calculation_time', 'rebuild_file_duration', 'rebuild_file_extension', 'rebuild_file_path', 'rebuild_file_size', 'rebuild_hash', 'rebuild_server', 'rebuild_status', 'server_version', 'xml_report_status']

    @pytest.mark.skip("needs more work to be solid")
    def test_hd3_files(self):
        count = 1
        self.add_test_files(count=count, execute_stage_1=True)
        loops     = Loops()
        result    = loops.LoopHashDirectories()
        metadatas = self.storage.hd2_metadatas()
        hd3_files = self.storage.hd3_files()
        metadata  = metadatas[0]
        hd3_file  = hd3_files[0]

        assert result is True
        assert len(hd3_files) == count
        assert len(metadatas) == count
        assert b'Glasswall Processed' in file_contents_as_bytes(hd3_file)
        assert metadata.get('rebuild_status') == 'Completed Successfully'
 @classmethod
 def setUpClass(cls) -> None:
     cls.setup_testing = Setup_Testing()
     cls.log_worker = start_logging()
     cls.storage = Storage()
     cls.config = cls.storage.config
     from cdr_plugin_folder_to_folder.utils.Logging import log_info
     log_info(message='in Temp_Config')
     cls.setup_testing.set_config_to_temp_folder()
    def setUp(self) -> None:

        self.sdk_server      = self.config.test_sdk
        self.sdk_port        = '8080'
        self.temp_folder     = temp_folder()
        self.events_log      = Events_Log(self.temp_folder)
        self.events_elastic  = Events_Log_Elastic()
        self.report_elastic  = Report_Elastic()
        self.analysis_elastic = Analysis_Elastic()
        self.file_processing = File_Processing(events_log=self.events_log, events_elastic=self.events_elastic, report_elastic=self.report_elastic, analysis_elastic=self.analysis_elastic, meta_service=self.meta_service)
        self.storage         = Storage()
Example #11
    def __init__(self, events_log, events_elastic, report_elastic,
                 analysis_elastic, meta_service):
        self.meta_service = meta_service
        self.events_log = events_log
        self.events_elastic = events_elastic
        self.storage = Storage()
        self.config = Config()
        self.status = Status()
        self.hash_json = Hash_Json()
        self.report_elastic = report_elastic
        self.sdk_api_version = "Not available"
        self.sdk_engine_version = "Not available"

        self.analysis_json = Analysis_Json()
        self.analysis_elastic = analysis_elastic
Example #12
    def test_set_config_to_temp_folder__restore_config(self):
        storage         = Storage()
        config          = storage.config
        original_config = config.values()
        self.setup_testing.set_config_to_temp_folder()
        temp_config     = config.values()

        assert parent_folder(config.root_folder  ) == temp_folder_current()
        assert folder_exists(config.root_folder  )
        assert folder_exists(storage.hd1()       )
        assert folder_exists(storage.hd2_status())
        assert folder_exists(storage.hd2_data()  )
        assert folder_exists(storage.hd3()       )
        assert original_config != temp_config

        self.setup_testing.restore_config()
        #self.setup_testing.configure_static_logging()
        assert original_config == config.values()
        assert parent_folder(config.root_folder) != temp_folder_current()
        assert folder_not_exists(temp_config.get('root_folder'))
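
This test leans on Setup_Testing.set_config_to_temp_folder() to point the shared Config at a throw-away folder and on restore_config() to put the original values back, which is why original_config and temp_config differ while the test runs. A hedged sketch of that save-and-restore shape (everything below is an assumption that mirrors the assertions above, not the plugin's real implementation):

class Config_Sandbox_Sketch:                                # illustrative helper, not the real Setup_Testing
    def __init__(self, config):
        self.config          = config
        self.original_values = None

    def set_config_to_temp_folder(self):
        self.original_values = dict(self.config.values())  # keep a copy of the current values
        self.config.root_folder = temp_folder()            # assumed: repointing root_folder moves hd1/hd2/hd3 under a temp folder

    def restore_config(self):
        for key, value in (self.original_values or {}).items():
            setattr(self.config, key, value)                # push the saved values back onto Config
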
Example #13
class Loops(object):

    continue_processing = False
    processing_started = False
    lock = asyncio.Lock()

    def __init__(self):
        self.use_es = False
        self.config = Config()
        self.status = Status()
        self.storage = Storage()
        self.hash_json = Hash_Json()
        self.events = Events_Log(self.config.hd2_status_location)
        self.events_elastic = Events_Log_Elastic()
        self.hash = None
        self.report_elastic = Report_Elastic()
        self.analysis_elastic = Analysis_Elastic()
        self.report_elastic.setup()
        self.analysis_elastic.setup()
        create_folder(self.storage.hd2_processed())
        create_folder(self.storage.hd2_not_processed())

    def IsProcessing(self):
        return Loops.processing_started

    def StopProcessing(self):
        Loops.continue_processing = False

    def HasBeenStopped(self):
        return not Loops.continue_processing

    def git_commit(self):
        git_commit = 'Not available'
        try:
            git_commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode("utf-8").rstrip()
        except Exception:
            pass

        return git_commit

    def ProcessDirectoryWithEndpoint(self, itempath, file_hash,
                                     endpoint_index):

        if not os.path.isdir(itempath):
            return False

        log_info(message=f"Starting ProcessDirectoryWithEndpoint on endpoint #{endpoint_index} for file {file_hash}")
        meta_service = Metadata_Service()
        original_file_path = meta_service.get_original_file_paths(itempath)
        events = Events_Log(itempath)

        endpoint = "http://" + self.config.endpoints['Endpoints'][
            endpoint_index]['IP'] + ":" + self.config.endpoints['Endpoints'][
                endpoint_index]['Port']
        events.add_log("Processing with: " + endpoint)

        meta_service.set_f2f_plugin_version(itempath, API_VERSION)
        meta_service.set_f2f_plugin_git_commit(itempath, self.git_commit())

        try:
            file_processing = File_Processing(events, self.events_elastic,
                                              self.report_elastic,
                                              self.analysis_elastic,
                                              meta_service)
            if not file_processing.processDirectory(endpoint, itempath):
                events.add_log("CANNOT be processed")
                return False

            log_data = {
                'file': original_file_path,
                'status': FileStatus.COMPLETED,
                'error': 'none',
                'timestamp': datetime.now(),
            }
            log_info('ProcessDirectoryWithEndpoint', data=log_data)
            meta_service.set_error(itempath, "none")
            meta_service.set_status(itempath, FileStatus.COMPLETED)
            self.hash_json.update_status(file_hash, FileStatus.COMPLETED)
            events.add_log("Has been processed")
            return True
        except Exception as error:
            log_data = {
                'file': original_file_path,
                'status': FileStatus.FAILED,
                'error': str(error),
            }
            log_error(message='error in ProcessDirectoryWithEndpoint',
                      data=log_data)
            meta_service.set_error(itempath, str(error))
            meta_service.set_status(itempath, FileStatus.FAILED)
            self.hash_json.update_status(file_hash, FileStatus.FAILED)
            events.add_log("ERROR:" + str(error))
            return False

    def ProcessDirectory(self, thread_data):
        (itempath, file_hash, process_index) = thread_data
        endpoint_index = process_index % self.config.endpoints_count
        if not Loops.continue_processing:
            return False
        tik = datetime.now()
        process_result = self.ProcessDirectoryWithEndpoint(
            itempath, file_hash, endpoint_index)

        if process_result:
            self.status.add_completed()

            tok = datetime.now()
            delta = tok - tik

            meta_service = Metadata_Service()
            meta_service.set_hd2_to_hd3_copy_time(itempath,
                                                  delta.total_seconds())
        else:
            self.status.add_failed()

        return process_result

        # note: removing retries from this method (it should not be handled like this
        #for idx in range(self.config.endpoints_count):
        #    if self.ProcessDirectoryWithEndpoint(itempath, file_hash, endpoint_index):
        #        return
        #    # The Endpoint failed to process the file
        #    # Retry it with the next one
        #    endpoint_index = (endpoint_index + 1) % self.config.endpoints_count

    def updateHashJson(self):
        self.hash_json.reset()
        meta_service = Metadata_Service()

        for hash_folder in os.listdir(self.storage.hd2_data()):

            metadata_folder = self.storage.hd2_data(hash_folder)

            if not os.path.isdir(metadata_folder):
                continue

            metadata = meta_service.get_from_file(metadata_folder)
            file_name = metadata.get_file_name()
            original_hash = metadata.get_original_hash()
            status = metadata.get_rebuild_status()

            if status != FileStatus.COMPLETED:
                self.hash_json.add_file(original_hash, file_name)

        self.hash_json.save()
        self.status.set_processing_counters(len(self.hash_json.data()))
        return self.hash_json.data()

    def moveProcessedFiles(self):
        json_list = self.hash_json.data()

        for key in json_list:

            source_path = self.storage.hd2_data(key)

            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                destination_path = self.storage.hd2_processed(key)

                if folder_exists(destination_path):
                    folder_delete_all(destination_path)

                shutil.move(source_path, destination_path)

            if (FileStatus.FAILED == json_list[key]["file_status"]):

                meta_service = Metadata_Service()
                meta_service.get_from_file(source_path)
                metadata = meta_service.metadata
                if ("Engine response could not be decoded" == metadata.get_error()) and \
                    metadata.get_original_file_extension() in ['.xml', '.json']:
                    destination_path = self.storage.hd2_not_processed(key)

                    if folder_exists(destination_path):
                        folder_delete_all(destination_path)

                    shutil.move(source_path, destination_path)

    def LoopHashDirectoriesInternal(self, thread_count, do_single):

        if folder_exists(self.storage.hd2_data()) is False:
            log_message = "ERROR: rootdir does not exist: " + self.storage.hd2_data(
            )
            log_error(log_message)
            return False

        if not isinstance(thread_count, int):
            raise TypeError("thread_count must be a integer")

        if not isinstance(do_single, bool):
            raise TypeError("thread_count must be a integer")

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        json_list = self.updateHashJson()

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        threads = list()

        process_index = 0

        log_info(
            message=f'before Mapping thread_data for {len(json_list)} files')
        thread_data = []
        for key in json_list:
            file_hash = key

            itempath = self.storage.hd2_data(key)
            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                self.events.add_log("The file processing has already been completed")
                continue

            if not os.path.exists(itempath):
                self.events.add_log(
                    f"ERROR: Path \"{itempath}\" does not exist")
                json_list[key]["file_status"] = FileStatus.FAILED
                continue

            process_index += 1
            thread_data.append((
                itempath,
                file_hash,
                process_index,
            ))
            # # limit the number of parallel threads
            #
            # if process_index % int(thread_count) == 0:                      # todo: refactor this workflow to use multiprocess and queues
            #     # Clean up the threads
            #     for index, thread in enumerate(threads):                    # todo: since at the moment this will block allocating new threads until
            #         thread.join()                                           #       all have finishing execution
            #
            # process_index += 1
            # log_info(message=f"in LoopHashDirectoriesInternal process_index={process_index} , thread #{process_index % int(thread_count) }")
            # x = threading.Thread(target=self.ProcessDirectory, args=(itempath, file_hash, process_index,))
            # threads.append(x)
            # x.start()
            #
            # if do_single:
            #     break
            #
            # if not Loops.continue_processing:
            #     break

        # for index, thread in enumerate(threads):
        #     thread.join()

        log_info(message=f'after mapping thread_data, there are {len(thread_data)} mapped items')
        #thread_data = thread_data[:500]
        #log_info(message=f'to start with only processing {len(thread_data)} thread_data items')
        pool = ThreadPool(thread_count)
        results = pool.map(self.ProcessDirectory, thread_data)
        pool.close()
        pool.join()

        self.moveProcessedFiles()

        self.events.add_log("LoopHashDirectoriesInternal finished")
        return True

    async def LoopHashDirectoriesAsync(self, thread_count, do_single=False):
        await Loops.lock.acquire()
        try:
            Loops.continue_processing = True
            Loops.processing_started = True
            self.status.set_started()
            self.LoopHashDirectoriesInternal(thread_count, do_single)
        finally:
            Loops.processing_started = False
            Loops.lock.release()
            self.status.set_stopped()
            self.hash_json.save()

    @log_duration
    def LoopHashDirectories(self, thread_count=None):
        #Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(message="ERROR: Attempt to start processing while processing is in progress")
            return False

        self.status.StartStatusThread()
        thread_count = thread_count or self.config.thread_count
        log_info(message="in LoopHashDirectories, about to start main loop")
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(thread_count))
        log_info(message="in LoopHashDirectories, Loop completed")
        self.status.StopStatusThread()
        return True

    @log_duration
    def LoopHashDirectoriesSequential(self):
        #Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1))
        return True

    @log_duration
    def ProcessSingleFile(self):
        if self.IsProcessing():
            log_error(
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1, True))
        return True
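
Taken together, Loops drives stage 2 of the pipeline: updateHashJson builds the work list from hd2/data, a ThreadPool maps ProcessDirectory over every pending hash folder, and moveProcessedFiles then shifts completed folders into hd2/processed. A minimal driver sketch based on the methods above (the thread count is illustrative):

loops = Loops()
if not loops.IsProcessing():                         # only one loop may run at a time
    ok = loops.LoopHashDirectories(thread_count=4)   # blocks until the whole run has finished
    print("run completed" if ok else "a run was already in progress")
# from another thread, an in-flight run can be asked to stop early:
# loops.StopProcessing()
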
 def __init__(self):
     self.index_name = 'files_metadata'
     self.id_key = 'original_hash'
     self.time_field = DEFAULT_TIME_FIELD
     self.enabled = False
     self.storage = Storage()
class Metadata_Elastic:
    def __init__(self):
        self.index_name = 'files_metadata'
        self.id_key = 'original_hash'
        self.time_field = DEFAULT_TIME_FIELD
        self.enabled = False
        self.storage = Storage()

    @cache_on_self
    def elastic(self):
        return Elastic(index_name=self.index_name,
                       id_key=self.id_key,
                       time_field=self.time_field)

    def setup(self, delete_existing=False):
        elastic = self.elastic()
        elastic.connect()
        elastic.setup()
        if elastic.enabled:
            elastic.create_index_and_index_pattern(
                delete_existing=delete_existing)
            self.enabled = True
        return self

    # class methods

    def add_metadata(self, metadata):
        return self.elastic().add(metadata)

    def delete_metadata(self, original_hash):
        return self.elastic().delete(record_id=original_hash)

    @log_duration
    def delete_all_metadata(self):
        #log_debug(message=f"Deleting all data and recreating {self.index_name} index and index pattern")
        return self.setup(delete_existing=True)

    def get_all_metadata(self):
        return self.elastic().search_using_lucene('*')

    def get_metadata(self, original_hash):
        return self.elastic().get_data(record_id=original_hash)

    @log_duration
    def reload_metadatas(self):
        hash_json = Hash_Json().reset()
        hash_data = hash_json.data()
        metadatas = self.storage.hd2_metadatas()
        count = len(metadatas)
        log_debug(message=f"Reloading {count} currently in hd2/data")
        for metadata in metadatas:
            self.add_metadata(metadata)
            file_hash = metadata.get('original_hash')
            file_name = metadata.get('file_name')
            file_status = metadata.get('rebuild_status')
            # todo: refactor this so that it is not done here (which happened due to the performance
            #       hit of the current Hash_Json file) when using:
            #         hash_json.add_file(file_hash=file_hash, file_name=file_name)
            #         hash_json.update_status(index=file_hash, updated_status=file_status)
            hash_data[file_hash] = {"file_name": file_name, "file_status": file_status}
        hash_json.save()
        return count

    @log_duration
    def reload_hash_json(self):
        hash_json = Hash_Json().reset()
        hash_data = hash_json.data()
        metadatas = self.storage.hd2_metadatas()
        for metadata in metadatas:
            file_hash = metadata.get('original_hash')
            file_name = metadata.get('file_name')
            file_status = metadata.get('rebuild_status')
            # todo: refactor this so that it is not done here (which happened due to the performance
            #       hit of the current Hash_Json file)
            hash_data[file_hash] = {"file_name": file_name, "file_status": file_status}
        hash_json.save()
        return f'Hash_Json reloaded for {len(metadatas)} metadata items'

    def reload_elastic_data(self):
        self.delete_all_metadata()
        count = self.reload_metadatas()

        return f'Elastic {self.index_name} has been reset and {count} metadata items reloaded'

    def reload_kibana_dashboards(self):
        kibana = self.elastic().kibana()
        dashboard_file_names = [
            'processed-files-v8.ndjson', 'File-Analysis-Threat-Level.ndjson'
        ]
        result = []
        for dashboard_file_name in dashboard_file_names:
            result.append(
                kibana.dashboard_import_from_github(
                    dashboard_file_name=dashboard_file_name))
        return f"reloaded {len(result)} dashboards"
class File_Distributor:
    def __init__(self):
        self.config             = Config()
        self.hd1_base_location  = self.config.hd1_location
        self.hd2_base_location  = self.config.hd2_location
        self.hd3_base_location  = self.config.hd3_location
        self.zip_folder         = os.path.join(os.getcwd(),"zip_folder")
        self.storage            = Storage()

        folder_delete_all(self.zip_folder)
        create_folder(self.zip_folder)

    # def get_hd1_files(self,num_of_files):
    #     try:
    #         list=[]
    #         count=0
    #         for folderName, subfolders, filenames in os.walk(self.hd1_base_location):
    #             for filename in filenames:
    #                     self.hd1_path =  os.path.join(folderName, filename)
    #                     if os.path.isfile(self.hd1_path):
    #                         list.append(self.hd1_path)
    #                         count=count+1
    #                     if count == num_of_files :
    #                         break
    #             if count == num_of_files:
    #                 break
    #         target_file_path=self.prepare_zip(list,"hd1.zip")
    #         return target_file_path
    #
    #     except Exception as error:
    #         logger.error(f"File_Distributor: get_hd1_files : {error}")
    #         raise error

    # def get_hd3_files(self, num_of_files):
    #     try:
    #         list = []
    #         count = 0
    #         for folderName, subfolders, filenames in os.walk(self.hd3_base_location):
    #             for filename in filenames:
    #                 self.hd3_path = os.path.join(folderName, filename)
    #                 if os.path.isfile(self.hd3_path):
    #                     list.append(self.hd3_path)
    #                     count = count + 1
    #                 if count == num_of_files:
    #                     break
    #             if count == num_of_files:
    #                 break
    #         target_file_path=self.prepare_zip(list,"hd3.zip")
    #         return target_file_path
    #
    #     except Exception as error:
    #         logger.error(f"File_Distributor: get_hd3_files : {error}")
    #         raise error

    def get_hd2_status(self):
        try:
            base_path = self.storage.hd2_status()
            if not os.listdir(base_path):
                return -1

            target_file_path = self.prepare_zip(base_path, "hd2_status_files.zip")
            return target_file_path
        except Exception as error:
            logger.error(f"File_Distributor: get_hd2_status : {error}")
            raise error

    def get_hd2_data(self, num_of_files):
        try:
            base_path = self.storage.hd2_data()
            if num_of_files == 0:
                return 0
            if not os.listdir(base_path):
                return -1

            list = []
            if num_of_files == -1:
                for folder in os.listdir(base_path):
                    list.append(os.path.join(base_path, folder))
            else:
                count = 0
                for folder in os.listdir(base_path):
                    list.append(os.path.join(base_path,folder))
                    count=count+1
                    if count == num_of_files:
                        break

            target_file_path = self.prepare_hd2_hash_folder_zip(list,"hd2_data_files.zip")
            return target_file_path

        except Exception as error:
            logger.error(f"File_Distributor: get_hd2_data : {error}")
            raise error

    def get_hd2_processed(self, num_of_files):
        try:
            base_path = self.storage.hd2_processed()
            if num_of_files == 0:
                return 0
            if not os.listdir(base_path):
                return -1

            list  = []
            if num_of_files == -1:
                for folder in os.listdir(base_path):
                    list.append(os.path.join(base_path, folder))
            else:
                count = 0
                for folder in os.listdir(base_path):
                    list.append(os.path.join(base_path, folder))
                    count = count + 1
                    if count == num_of_files:
                        break

            target_file_path = self.prepare_hd2_hash_folder_zip(list, "hd2_processed_files.zip")
            return target_file_path

        except Exception as error:
            logger.error(f"File_Distributor: get_hd2_processed : {error}")
            raise error

    def prepare_hd2_hash_folder_zip(self,path_list, zip_name):
        try:
            self.temp_folder = temp_folder()

            for hash_folder_path in path_list:
                name = ntpath.basename(hash_folder_path)
                dst_path = os.path.join(self.temp_folder, name)

                if os.path.isdir(hash_folder_path):
                    folder_copy(hash_folder_path, dst_path)

                    hd2_source_file = os.path.join(dst_path, "source")
                    if os.path.isfile(hd2_source_file):
                        file_delete(hd2_source_file)

            target_file_path = os.path.join(self.zip_folder, zip_name)
            zip_files(self.temp_folder, file_pattern='*.*', target_file = target_file_path)
            folder_delete_all(self.temp_folder)

            return target_file_path

        except Exception as error:
            logger.error(f"File_Distributor: prepare_zip : {error}")
            raise error

    def prepare_zip(self, path, zip_name):
        try:
            self.temp_folder = temp_folder()
            dst_path = os.path.join(self.temp_folder, ntpath.basename(path))

            if os.path.isfile(path):
                file_copy(path, dst_path)
            elif os.path.isdir(path):
                folder_copy(path, dst_path)

            target_file_path = os.path.join(self.zip_folder, zip_name)

            zip_files(self.temp_folder, file_pattern='*.*', target_file = target_file_path)

            folder_delete_all(self.temp_folder)

            return target_file_path

        except Exception as error:
            logger.error(f"File_Distributor: prepare_zip : {error}")
            raise error
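
File_Distributor packages parts of the hd2 tree into zip files under ./zip_folder. Note the sentinel return values in the methods above: 0 means zero files were requested and -1 means the source folder is empty, so callers have to check for both before treating the result as a path. A hedged usage sketch:

distributor = File_Distributor()
result = distributor.get_hd2_data(num_of_files=5)    # zip up to 5 hash folders from hd2/data
if result in (0, -1):
    print('nothing to package')                      # 0: nothing requested, -1: hd2/data is empty
else:
    print('zip created at ' + result)
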
Example #17
 def setUp(self) -> None:
     self.config        = Config()
     self.local_storage = Storage()
class Status:

    STATUS_FILE_NAME = "status.json"
    VAR_COMPLETED = "completed"
    VAR_CURRENT_STATUS = "current_status"
    VAR_FAILED = "failed"
    VAR_FILES_TO_PROCESS = "files_to_process"
    VAR_FILES_LEFT_TO_PROCESS = "files_left_to_process"
    VAR_FILES_COUNT = "files_in_hd1_folder"
    VAR_FILES_COPIED = "files_copied"
    VAR_FILES_TO_BE_COPIED = "files_left_to_be_copied"
    VAR_IN_PROGRESS = "in_progress"
    VAR_NUMBER_OF_CPUS = "number_of_cpus"
    VAR_CPU_UTILIZATION = "cpu_utilization"
    VAR_RAM_UTILIZATION = "memory_utilization"
    VAR_NUM_OF_PROCESSES = "number_of_processes"
    VAR_NUM_OF_THREADS = "number_of_threads"
    VAR_NETWORK_CONNECTIONS = "network_connections"
    VAR_DISK_PARTITIONS = "disk_partitions"

    lock = threading.Lock()

    _instance = None

    def __new__(cls):  # singleton pattern
        if cls._instance is None:
            cls._instance = super(Status, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        if not hasattr(self, 'instantiated'):              # only set these values first time around
            self.instantiated = True
            self.storage = Storage()
            #self._on_save      = []                             # todo: add support for firing up events when data is saved
            self._status_data = self.default_data()
            self.status_thread_on = False
            self.status_thread = threading.Thread()

    @classmethod
    def clear_instance(cls):
        cls._instance = None

    def StatusThread(self, update_interval):
        while self.status_thread_on:
            self.get_server_status()
            sleep(update_interval)

    def StartStatusThread(self):
        if self.status_thread_on:
            return

        self.status_thread_on = True
        self.status_thread = threading.Thread(target=self.StatusThread,
                                              args=(1, ))
        self.status_thread.start()

    def StopStatusThread(self):
        self.status_thread_on = False
        self.status_thread.join()

    def data(self):
        return self._status_data

    def default_data(self):
        return {
            Status.VAR_CURRENT_STATUS: FileStatus.NONE,
            Status.VAR_FILES_COUNT: 0,
            Status.VAR_FILES_COPIED: 0,
            Status.VAR_FILES_TO_BE_COPIED: 0,
            Status.VAR_FILES_TO_PROCESS: 0,
            Status.VAR_FILES_LEFT_TO_PROCESS: 0,
            Status.VAR_COMPLETED: 0,
            Status.VAR_FAILED: 0,
            Status.VAR_IN_PROGRESS: 0,
            Status.VAR_NUMBER_OF_CPUS: psutil.cpu_count(),
            Status.VAR_CPU_UTILIZATION: None,
            Status.VAR_RAM_UTILIZATION: None,
            Status.VAR_NUM_OF_PROCESSES: None,
            Status.VAR_NUM_OF_THREADS: None,
            Status.VAR_NETWORK_CONNECTIONS: None,
            Status.VAR_DISK_PARTITIONS: len(psutil.disk_partitions()),
        }

    def load_data(self):
        self._status_data = json_load_file(self.status_file_path())
        if self.data() == {}:
            self.reset()
        return self

    def reset(self):
        self._status_data = self.default_data()
        self.save()
        return self

    def save(self):
        if not file_exists(self.status_file_path()):
            folder_create(self.storage.hd2_status())
            file_create(self.status_file_path())

        json_save_file_pretty(self.data(), self.status_file_path())
        return self

    def status_file_path(self):
        return path_combine(self.storage.hd2_status(), Status.STATUS_FILE_NAME)

    def get_server_data(self):
        self._status_data[Status.VAR_NUMBER_OF_CPUS] = psutil.cpu_count()

        self._status_data[Status.VAR_CPU_UTILIZATION] = psutil.cpu_percent(
            interval=1, percpu=True)
        self._status_data[Status.VAR_RAM_UTILIZATION] = psutil.virtual_memory().percent

        pids = psutil.pids()
        self._status_data[Status.VAR_NUM_OF_PROCESSES] = len(pids)

        thread_count = 0
        for pid in pids:
            try:
                p = psutil.Process(int(pid))
                process_threads = p.num_threads()
                thread_count += process_threads
            except Exception:        # the process may have exited between pids() and Process()
                pass

        self._status_data[Status.VAR_NUM_OF_THREADS] = thread_count

        self._status_data[Status.VAR_NETWORK_CONNECTIONS] = len(
            psutil.net_connections(kind='tcp'))

        self._status_data[Status.VAR_DISK_PARTITIONS] = len(
            psutil.disk_partitions())

    def get_server_status(self):
        Status.lock.acquire()
        try:
            self.get_server_data()
        finally:
            Status.lock.release()
            self.save()

        return self

    def set_processing_status(self, processing_status):
        Status.lock.acquire()
        try:
            data = self.data()
            data[Status.VAR_CURRENT_STATUS] = processing_status
        finally:
            Status.lock.release()
            self.save()

        return self

    def set_started(self):
        return self.set_processing_status(Processing_Status.STARTED)

    def set_stopped(self):
        return self.set_processing_status(Processing_Status.STOPPED)

    def set_phase_1(self):
        return self.set_processing_status(Processing_Status.PHASE_1)

    def set_phase_2(self):
        return self.set_processing_status(Processing_Status.PHASE_2)

    def update_counters(self, updated_status, count=0):
        Status.lock.acquire()
        try:
            data = self.data()

            if updated_status == FileStatus.NONE:
                data[Status.VAR_FILES_COUNT] = count
                data[Status.VAR_FILES_TO_BE_COPIED] = count

            elif updated_status == FileStatus.INITIAL:
                data[Status.VAR_FILES_COPIED] += 1
                if data[Status.VAR_FILES_TO_BE_COPIED] > 0:
                    data[Status.VAR_FILES_TO_BE_COPIED] -= 1

            elif updated_status == FileStatus.NOT_COPIED:
                if data[Status.VAR_FILES_TO_BE_COPIED] > 0:
                    data[Status.VAR_FILES_TO_BE_COPIED] -= 1

            elif updated_status == FileStatus.IN_PROGRESS:
                data[Status.VAR_IN_PROGRESS] += 1

            elif updated_status == FileStatus.COMPLETED:
                data[Status.VAR_COMPLETED] += 1
                if data[Status.VAR_IN_PROGRESS] > 0:
                    data[Status.VAR_IN_PROGRESS] -= 1
                if data[Status.VAR_FILES_LEFT_TO_PROCESS] > 0:
                    data[Status.VAR_FILES_LEFT_TO_PROCESS] -= 1

            elif updated_status == FileStatus.FAILED:
                data[Status.VAR_FAILED] += 1
                if data[Status.VAR_IN_PROGRESS] > 0:
                    data[Status.VAR_IN_PROGRESS] -= 1
                if data[Status.VAR_FILES_LEFT_TO_PROCESS] > 0:
                    data[Status.VAR_FILES_LEFT_TO_PROCESS] -= 1

            elif updated_status == FileStatus.TO_PROCESS:
                data[Status.VAR_FILES_TO_PROCESS] += 1
                data[Status.VAR_FILES_LEFT_TO_PROCESS] += 1

        finally:
            Status.lock.release()
            self.save()

        return self

    def set_processing_counters(self, count):
        Status.lock.acquire()
        try:
            data = self.data()

            data[Status.VAR_IN_PROGRESS] = 0
            data[Status.VAR_FAILED] = 0
            data[Status.VAR_COMPLETED] = 0

            data[Status.VAR_FILES_TO_PROCESS] = count
            data[Status.VAR_FILES_LEFT_TO_PROCESS] = count

        finally:
            Status.lock.release()
            self.save()

        return self

    def add_completed(self):
        return self.update_counters(FileStatus.COMPLETED)

    def add_failed(self):
        return self.update_counters(FileStatus.FAILED)

    def add_file(self):
        return self.update_counters(FileStatus.INITIAL)

    def set_files_count(self, count):
        return self.update_counters(FileStatus.NONE, count)

    def set_not_copied(self):
        return self.update_counters(FileStatus.NOT_COPIED)

    def add_in_progress(self):
        return self.update_counters(FileStatus.IN_PROGRESS)

    def add_to_be_processed(self):
        return self.update_counters(FileStatus.TO_PROCESS)

    def get_completed(self):
        return self.data().get(Status.VAR_COMPLETED)

    def get_current_status(self):
        return self.data().get(Status.VAR_CURRENT_STATUS)

    def get_failed(self):
        return self.data().get(Status.VAR_FAILED)

    def get_files_count(self):
        return self.data().get(Status.VAR_FILES_COUNT)

    def get_files_copied(self):
        return self.data().get(Status.VAR_FILES_COPIED)

    def get_files_to_process(self):
        return self.data().get(Status.VAR_FILES_TO_PROCESS)

    def get_in_progress(self):
        return self.data().get(Status.VAR_IN_PROGRESS)
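
Because Status is a singleton protected by a class-level lock, every component that constructs Status() shares the same counters and the same status.json on disk. A small sketch of that shared-state behaviour (counter values assume a freshly reset instance):

status_a = Status()
status_b = Status()
assert status_a is status_b                  # __new__ always hands back the same instance

status_a.set_files_count(10)                 # counters updated through one reference ...
status_a.add_completed()
print(status_b.get_completed())              # ... are visible through any other (1 on a fresh instance)
print(status_b.get_files_count())            # 10
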
Example #19
class Pre_Processor:
    def __init__(self):
        self.config = Config()
        self.meta_service = Metadata_Service()
        self.status = Status()
        self.storage = Storage()
        self.file_name = None  # set in process() method
        self.current_path = None
        self.base_folder = None
        self.dst_folder = None
        self.dst_file_name = None

        self.status = Status()
        self.status.reset()

        #self.analysis_json = Analysis_Json()

    @log_duration
    def clear_data_and_status_folders(self):
        data_target = self.storage.hd2_data()             # todo: refactor this clean up to the storage class
        status_target = self.storage.hd2_status()
        processed_target = self.storage.hd2_processed()
        folder_delete_all(data_target)
        folder_delete_all(status_target)
        folder_delete_all(processed_target)
        folder_create(data_target)
        folder_create(status_target)
        folder_create(processed_target)
        self.status.reset()

    def file_hash(self, file_path):
        return self.meta_service.file_hash(file_path)

    def prepare_folder(self, folder_to_process):
        if folder_to_process.startswith(self.storage.hd1()):
            return folder_to_process

        dirname = os.path.join(self.storage.hd1(),
                               os.path.basename(folder_to_process))
        if os.path.isdir(dirname):
            folder_delete_all(dirname)
        try:
            folder_copy(folder_to_process, dirname)
        finally:
            return dirname           # note: returning from finally suppresses any exception raised by folder_copy

    def process_folder(self, folder_to_process):
        if not os.path.isdir(folder_to_process):
            # todo: add an event log
            return False

        folder_to_process = self.prepare_folder(folder_to_process)

        files_count = 0

        for folderName, subfolders, filenames in os.walk(folder_to_process):
            for filename in filenames:
                file_path = os.path.join(folderName, filename)
                if os.path.isfile(file_path):
                    files_count += 1

        self.status.set_files_count(files_count)

        for folderName, subfolders, filenames in os.walk(folder_to_process):
            for filename in filenames:
                file_path = os.path.join(folderName, filename)
                if os.path.isfile(file_path):
                    self.process(file_path)

        return True

    @log_duration
    def process_files(self):
        self.status.StartStatusThread()
        self.status.set_phase_1()
        self.process_folder(self.storage.hd1())
        self.status.set_phase_2()
        self.status.StopStatusThread()

    @log_duration
    def process(self, file_path):
        tik = datetime.now()

        metadata = self.meta_service.create_metadata(file_path=file_path)
        file_name = metadata.get_file_name()
        original_hash = metadata.get_original_hash()
        status = metadata.get_rebuild_status()
        self.update_status(file_name, original_hash, status)

        tok = datetime.now()
        delta = tok - tik

        if metadata.is_in_todo():
            hash_folder_path = self.storage.hd2_data(original_hash)
            self.meta_service.set_hd1_to_hd2_copy_time(hash_folder_path,
                                                       delta.total_seconds())
        else:
            self.status.set_not_copied()

    def update_status(self, file_name, original_hash, status):
        if status == FileStatus.INITIAL:
            self.status.add_file()
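
Pre_Processor implements stage 1: it walks a source folder, copies it under hd1 if necessary, hashes every file into hd2/data via Metadata_Service and keeps the shared Status counters up to date. A minimal driver sketch (the input path is illustrative):

pre_processor = Pre_Processor()
pre_processor.clear_data_and_status_folders()          # start from empty hd2/data, hd2/status and hd2/processed
pre_processor.process_folder('/path/to/incoming')      # illustrative path; copies into hd1 and hashes every file
print(pre_processor.status.data())                     # counters filled in by set_files_count / update_status
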
Example #20
class Metadata:
    def __init__(self, file_hash=None):
        self.config = Config()
        self.storage = Storage()
        self.process_status = Status()
        self.metadata_utils = Metadata_Utils()
        self.path_hd1 = self.storage.hd1()
        self.data = self.default_data()
        self.file_hash = file_hash
        #self.time_field    =

    def get_from_file(self):  # todo: refactor out this method
        self.load()
        return self.data

    def load(self):
        with open(self.metadata_file_path()) as json_file:
            self.data = json.load(json_file)
        return self

    def add_file(self, file_path):
        if file_exists(file_path):
            tik = datetime.now()

            self.set_file_hash(self.metadata_utils.file_hash(file_path))

            tok = datetime.now()
            delta = tok - tik
            self.set_file_hash_calculation_time(delta.total_seconds())

            if self.exists():
                self.get_from_file()
            else:
                self.create(file_path)
            self.add_file_path(file_path)
            self.save()
            return self.file_hash

    def add_file_path(self, file_path: str):
        if self.file_hash:
            file_paths = self.data.get('original_file_paths')
            if 0 == len(file_paths):
                self.process_status.add_to_be_processed()
            if file_path.startswith(self.path_hd1):             # check if path starts with hd1
                file_path = os.path.relpath(file_path, self.path_hd1)
            if file_path not in file_paths:
                file_paths.append(file_path)
            return file_paths

    def create(self, file_path):
        if self.file_hash:
            folder_create(self.metadata_folder_path())
            file_copy(file_path, self.source_file_path())
            self.set_original_file_size(file_path)
            self.set_original_file_extension(file_path)
            self.set_original_file_name(file_path)

    def default_data(self):
        return {
            'file_name': None,
            'xml_report_status': None,
            'last_update_time': None,
            'rebuild_server': None,
            'server_version': None,
            'error': None,
            'original_file_paths': [],
            'original_hash': None,
            'original_hash_calculation_time': None,
            'original_file_extension': None,
            'original_file_size': None,
            'rebuild_file_path': None,
            'rebuild_hash': None,
            'rebuild_status': FileStatus.INITIAL,
            'rebuild_file_extension': None,
            'rebuild_file_size': None,
            'rebuild_file_duration': None,
            'f2f_plugin_version': None,
            'f2f_plugin_git_commit': None,
            'hd1_to_hd2_copy_time': None,
            'hd2_to_hd3_copy_time': None
        }

    def delete(self):
        if self.exists():
            folder_delete_all(self.metadata_folder_path())
            return self.exists() is False
        return False

    def exists(self):
        return folder_exists(self.metadata_folder_path())

    def metadata_file_exists(self):
        return file_exists(self.metadata_file_path())

    def metadata_file_path(self):
        if self.file_hash:  # todo: find a better solution that having to add this to all methods
            return path_combine(self.metadata_folder_path(),
                                DEFAULT_METADATA_FILENAME)

    def metadata_folder_path(self):
        if not self.file_hash:
            return

        path = self.storage.hd2_not_processed(self.file_hash)
        if folder_exists(path):
            return path

        path = self.storage.hd2_processed(self.file_hash)
        if folder_exists(path):
            return path

        # never processed - must be in the 'todo' folder
        path = self.storage.hd2_data(self.file_hash)
        return path

    def is_in_todo(self):
        return folder_exists(self.storage.hd2_data(self.file_hash))

    def is_in_processed(self):
        return folder_exists(self.storage.hd2_processed(self.file_hash))

    def is_in_not_processed(self):
        return folder_exists(self.storage.hd2_not_processed(self.file_hash))

    def save(self):
        if self.exists():
            json_save_file_pretty(python_object=self.data,
                                  path=self.metadata_file_path())

    def update_field(self, field, updated_value):           # todo: optimise this if we get performance hits due to multiple updates
        self.data[field] = updated_value
        self.data['last_update_time'] = datetime_now()
        self.save()

    def set_file_hash(self, file_hash):
        self.file_hash = file_hash
        self.data['original_hash'] = file_hash
        self.data['last_update_time'] = datetime_now()
        if not self.exists():
            self.save()

    def set_file_hash_calculation_time(self, seconds):
        self.data['original_hash_calculation_time'] = seconds

    def set_original_file_name(self, file_path):
        original_file_name = file_name(file_path)
        self.update_field('file_name', original_file_name)

    def set_original_file_size(self, file_path):
        file_size = os.path.getsize(file_path)
        self.update_field('original_file_size', file_size)

    def set_original_file_extension(self, file_path):
        extension = pathlib.Path(file_path).suffix
        self.update_field('original_file_extension', extension)

    def source_file_path(self):
        if self.file_hash:
            return path_combine(self.metadata_folder_path(),
                                DEFAULT_SOURCE_FILENAME)

    def get_original_hash(self):
        return self.data.get('original_hash')

    def get_file_hash(self):
        return self.file_hash

    def get_file_name(self):
        return self.data.get('file_name')

    def get_rebuild_status(self):
        return self.data.get('rebuild_status')

    def get_original_file_paths(self):
        return self.data.get('original_file_paths')

    def get_last_update_time(self):
        return self.data.get('last_update_time')

    def get_error(self):
        return self.data.get('error')

    def get_original_file_extension(self):
        return self.data.get('original_file_extension')

    def report_file_path(self):
        if self.file_hash:
            return path_combine(self.metadata_folder_path(),
                                DEFAULT_REPORT_FILENAME)

    def report_file_exists(self):
        return file_exists(self.report_file_path())
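
Metadata is keyed by the file's hash: add_file computes the hash, creates (or reloads) the hash folder under hd2/data, copies the original in as the source file and records where the original lived. A short usage sketch (the file path is illustrative):

metadata  = Metadata()
file_hash = metadata.add_file('/path/to/sample.pdf')     # illustrative path; hashes the file and creates hd2/data/<hash>/
print(file_hash == metadata.get_original_hash())         # True once the metadata has been written
print(metadata.metadata_file_path())                     # path to the metadata json inside the hash folder
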