Example #1
    def __init__(self, file_hash=None):
        self.config = Config()
        self.storage = Storage()
        self.process_status = Status()
        self.metadata_utils = Metadata_Utils()
        self.path_hd1 = self.storage.hd1()
        self.data = self.default_data()
        self.file_hash = file_hash
Example #2
    def __init__(self):
        self.config = Config()
        self.meta_service = Metadata_Service()
        self.status = Status()
        self.storage = Storage()
        self.file_name = None  # set in process() method
        self.current_path = None
        self.base_folder = None
        self.dst_folder = None
        self.dst_file_name = None

        self.status = Status()
        self.status.reset()
Example #3
    def __init__(self):
        self.use_es = False
        self.config = Config()
        self.status = Status()
        self.storage = Storage()
        self.hash_json = Hash_Json()
        self.events = Events_Log(self.config.hd2_status_location)
        self.events_elastic = Events_Log_Elastic()
        self.hash = None
        self.report_elastic = Report_Elastic()
        self.analysis_elastic = Analysis_Elastic()
        self.report_elastic.setup()
        self.analysis_elastic.setup()
        create_folder(self.storage.hd2_processed())
        create_folder(self.storage.hd2_not_processed())
Example #4
    def __init__(self, events_log, events_elastic, report_elastic,
                 analysis_elastic, meta_service):
        self.meta_service = meta_service
        self.events_log = events_log
        self.events_elastic = events_elastic
        self.storage = Storage()
        self.config = Config()
        self.status = Status()
        self.hash_json = Hash_Json()
        self.report_elastic = report_elastic
        self.sdk_api_version = "Not available"
        self.sdk_engine_version = "Not available"

        self.analysis_json = Analysis_Json()
        self.analysis_elastic = analysis_elastic
Example #5
class Loops(object):

    continue_processing = False
    processing_started = False
    lock = asyncio.Lock()

    def __init__(self):
        self.use_es = False
        self.config = Config()
        self.status = Status()
        self.storage = Storage()
        self.hash_json = Hash_Json()
        self.events = Events_Log(self.config.hd2_status_location)
        self.events_elastic = Events_Log_Elastic()
        self.hash = None
        self.report_elastic = Report_Elastic()
        self.analysis_elastic = Analysis_Elastic()
        self.report_elastic.setup()
        self.analysis_elastic.setup()
        create_folder(self.storage.hd2_processed())
        create_folder(self.storage.hd2_not_processed())

    def IsProcessing(self):
        return Loops.processing_started

    def StopProcessing(self):
        Loops.continue_processing = False

    def HasBeenStopped(self):
        return not Loops.continue_processing

    def git_commit(self):
        git_commit = 'Not available'
        try:
            git_commit = subprocess.check_output(
                ['git', 'rev-parse', 'HEAD']).decode("utf-8").rstrip()
        except Exception:
            pass                    # keep the 'Not available' default when git is not available

        return git_commit

    def ProcessDirectoryWithEndpoint(self, itempath, file_hash,
                                     endpoint_index):

        if not os.path.isdir(itempath):
            return False

        log_info(
            message=
            f"Starting ProcessDirectoryWithEndpoint on endpoint # {endpoint_index} for file {file_hash}"
        )
        meta_service = Metadata_Service()
        original_file_path = meta_service.get_original_file_paths(itempath)
        events = Events_Log(itempath)

        endpoint = "http://" + self.config.endpoints['Endpoints'][
            endpoint_index]['IP'] + ":" + self.config.endpoints['Endpoints'][
                endpoint_index]['Port']
        events.add_log("Processing with: " + endpoint)

        meta_service.set_f2f_plugin_version(itempath, API_VERSION)
        meta_service.set_f2f_plugin_git_commit(itempath, self.git_commit())

        try:
            file_processing = File_Processing(events, self.events_elastic,
                                              self.report_elastic,
                                              self.analysis_elastic,
                                              meta_service)
            if not file_processing.processDirectory(endpoint, itempath):
                events.add_log("CANNOT be processed")
                return False

            log_data = {
                'file': original_file_path,
                'status': FileStatus.COMPLETED,
                'error': 'none',
                'timestamp': datetime.now(),
            }
            log_info('ProcessDirectoryWithEndpoint', data=log_data)
            meta_service.set_error(itempath, "none")
            meta_service.set_status(itempath, FileStatus.COMPLETED)
            self.hash_json.update_status(file_hash, FileStatus.COMPLETED)
            events.add_log("Has been processed")
            return True
        except Exception as error:
            log_data = {
                'file': original_file_path,
                'status': FileStatus.FAILED,
                'error': str(error),
            }
            log_error(message='error in ProcessDirectoryWithEndpoint',
                      data=log_data)
            meta_service.set_error(itempath, str(error))
            meta_service.set_status(itempath, FileStatus.FAILED)
            self.hash_json.update_status(file_hash, FileStatus.FAILED)
            events.add_log("ERROR:" + str(error))
            return False

    def ProcessDirectory(self, thread_data):
        (itempath, file_hash, process_index) = thread_data
        endpoint_index = process_index % self.config.endpoints_count
        if not Loops.continue_processing:
            return False
        tik = datetime.now()
        process_result = self.ProcessDirectoryWithEndpoint(
            itempath, file_hash, endpoint_index)

        if process_result:
            self.status.add_completed()

            tok = datetime.now()
            delta = tok - tik

            meta_service = Metadata_Service()
            meta_service.set_hd2_to_hd3_copy_time(itempath,
                                                  delta.total_seconds())
        else:
            self.status.add_failed()

        return process_result

        # note: removed retries from this method (they should not be handled like this)
        #for idx in range(self.config.endpoints_count):
        #    if self.ProcessDirectoryWithEndpoint(itempath, file_hash, endpoint_index):
        #        return
        #    # The Endpoint failed to process the file
        #    # Retry it with the next one
        #    endpoint_index = (endpoint_index + 1) % self.config.endpoints_count

    def updateHashJson(self):
        self.hash_json.reset()
        meta_service = Metadata_Service()

        for hash_folder in os.listdir(self.storage.hd2_data()):

            metadata_folder = self.storage.hd2_data(hash_folder)

            if not os.path.isdir(metadata_folder):
                continue

            metadata = meta_service.get_from_file(metadata_folder)
            file_name = metadata.get_file_name()
            original_hash = metadata.get_original_hash()
            status = metadata.get_rebuild_status()

            if status != FileStatus.COMPLETED:
                self.hash_json.add_file(original_hash, file_name)

        self.hash_json.save()
        self.status.set_processing_counters(len(self.hash_json.data()))
        return self.hash_json.data()

    def moveProcessedFiles(self):
        json_list = self.hash_json.data()

        for key in json_list:

            source_path = self.storage.hd2_data(key)

            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                destination_path = self.storage.hd2_processed(key)

                if folder_exists(destination_path):
                    folder_delete_all(destination_path)

                shutil.move(source_path, destination_path)

            if (FileStatus.FAILED == json_list[key]["file_status"]):

                meta_service = Metadata_Service()
                meta_service.get_from_file(source_path)
                metadata = meta_service.metadata
                if ("Engine response could not be decoded" == metadata.get_error()) and \
                    metadata.get_original_file_extension() in ['.xml', '.json']:
                    destination_path = self.storage.hd2_not_processed(key)

                    if folder_exists(destination_path):
                        folder_delete_all(destination_path)

                    shutil.move(source_path, destination_path)

    def LoopHashDirectoriesInternal(self, thread_count, do_single):

        if folder_exists(self.storage.hd2_data()) is False:
            log_message = "ERROR: rootdir does not exist: " + self.storage.hd2_data(
            )
            log_error(log_message)
            return False

        if not isinstance(thread_count, int):
            raise TypeError("thread_count must be an integer")

        if not isinstance(do_single, bool):
            raise TypeError("do_single must be a boolean")

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        json_list = self.updateHashJson()

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        threads = list()

        process_index = 0

        log_info(
            message=f'before Mapping thread_data for {len(json_list)} files')
        thread_data = []
        for key in json_list:
            file_hash = key

            itempath = self.storage.hd2_data(key)
            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                self.events.add_log(
                    "The file processing has already been completed")
                continue

            if not os.path.exists(itempath):
                self.events.add_log(
                    f"ERROR: Path \"{itempath}\" does not exist")
                json_list[key]["file_status"] = FileStatus.FAILED
                continue

            process_index += 1
            thread_data.append((
                itempath,
                file_hash,
                process_index,
            ))
            # # limit the number of parallel threads
            #
            # if process_index % int(thread_count) == 0:                      # todo: refactor this workflow to use multiprocess and queues
            #     # Clean up the threads
            #     for index, thread in enumerate(threads):                    # todo: since at the moment this will block allocating new threads until
            #         thread.join()                                           #       all have finishing execution
            #
            # process_index += 1
            # log_info(message=f"in LoopHashDirectoriesInternal process_index={process_index} , thread #{process_index % int(thread_count) }")
            # x = threading.Thread(target=self.ProcessDirectory, args=(itempath, file_hash, process_index,))
            # threads.append(x)
            # x.start()
            #
            # if do_single:
            #     break
            #
            # if not Loops.continue_processing:
            #     break

        # for index, thread in enumerate(threads):
        #     thread.join()

        log_info(
            message=
            f'after mapped thread_data, there are {len(thread_data)} mapped items'
        )
        #thread_data = thread_data[:500]
        #log_info(message=f'to start with only processing {len(thread_data)} thread_data items')
        pool = ThreadPool(thread_count)
        results = pool.map(self.ProcessDirectory, thread_data)
        pool.close()
        pool.join()

        self.moveProcessedFiles()

        self.events.add_log("LoopHashDirectoriesInternal finished")
        return True

    async def LoopHashDirectoriesAsync(self, thread_count, do_single=False):
        await Loops.lock.acquire()
        try:
            Loops.continue_processing = True
            Loops.processing_started = True
            self.status.set_started()
            self.LoopHashDirectoriesInternal(thread_count, do_single)
        finally:
            Loops.processing_started = False
            Loops.lock.release()
            self.status.set_stopped()
            self.hash_json.save()

    @log_duration
    def LoopHashDirectories(self, thread_count=None):
        #Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(
                message=
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        self.status.StartStatusThread()
        thread_count = thread_count or self.config.thread_count
        log_info(message="in LoopHashDirectories, about to start main loop")
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(thread_count))
        log_info(message="in LoopHashDirectories, Loop completed")
        self.status.StopStatusThread()
        return True

    @log_duration
    def LoopHashDirectoriesSequential(self):
        #Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1))
        return True

    @log_duration
    def ProcessSingleFile(self):
        if self.IsProcessing():
            log_error(
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1, True))
        return True
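
A minimal usage sketch for the Loops class above; imports are omitted as in the surrounding examples, and the calls are an illustration under those assumptions rather than part of the original source:

# usage sketch (assumes the Loops class and its configuration shown in Example #5)
loops = Loops()
if not loops.IsProcessing():        # only one processing loop may run at a time
    loops.LoopHashDirectories()     # thread_count defaults to config.thread_count; returns True when the loop completes

# from another thread or request handler, a running loop can be asked to stop:
# loops.StopProcessing()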
Example #6
class File_Processing:
    def __init__(self, events_log, events_elastic, report_elastic,
                 analysis_elastic, meta_service):
        self.meta_service = meta_service
        self.events_log = events_log
        self.events_elastic = events_elastic
        self.storage = Storage()
        self.config = Config()
        self.status = Status()
        self.hash_json = Hash_Json()
        self.report_elastic = report_elastic
        self.sdk_api_version = "Not available"
        self.sdk_engine_version = "Not available"

        self.analysis_json = Analysis_Json()
        self.analysis_elastic = analysis_elastic

    def add_event_log(self, message, event_data=None):
        json_data = self.events_log.add_log(message, event_data or {})     # avoid a mutable default argument
        self.events_elastic.add_event_log(json_data)

    def base64request(self, endpoint, api_route, base64enc_file):
        try:
            url = endpoint + "/" + api_route

            payload = json.dumps({"Base64": base64enc_file})

            headers = {'Content-Type': 'application/json'}

            return requests.request("POST",
                                    url,
                                    headers=headers,
                                    data=payload,
                                    timeout=int(self.config.request_timeout))

        except Exception as e:
            log_error(str(e))
            raise ValueError(str(e))

    def xmlreport_request(self, endpoint, fileID):
        try:
            url = endpoint + "/api/Analyse/xmlreport?fileId=" + fileID

            payload = ""
            headers = {'Content-Type': 'application/octet-stream'}

            response = requests.request("GET",
                                        url,
                                        headers=headers,
                                        data=payload,
                                        timeout=int(
                                            self.config.request_timeout))
            return response.text

        except Exception as e:
            raise ValueError(str(e))

    def rebuild(self, endpoint, base64enc_file):
        return self.base64request(endpoint, "api/rebuild/base64",
                                  base64enc_file)

    def get_xmlreport(self, endpoint, fileId, dir):
        log_info(message=f"getting XML Report for {fileId} at {endpoint}")

        xmlreport = self.xmlreport_request(endpoint, fileId)
        if not xmlreport:
            raise ValueError('Failed to obtain the XML report')

        try:
            json_obj = xmltodict.parse(xmlreport)

            file_extension = json_obj["gw:GWallInfo"]["gw:DocumentStatistics"][
                "gw:DocumentSummary"]["gw:FileType"]
            self.meta_service.set_rebuild_file_extension(dir, file_extension)
            json_obj['original_hash'] = os.path.basename(dir)
            json_save_file_pretty(json_obj, os.path.join(dir, "report.json"))

            #self.report_elastic.add_report(json_obj)

            analysis_obj = self.analysis_json.get_file_analysis(
                os.path.basename(dir), json_obj)
            json_save_file_pretty(analysis_obj,
                                  os.path.join(dir, "analysis.json"))

            self.analysis_elastic.add_analysis(analysis_obj)

            return True
        except Exception as error:
            log_error(
                message=f"Error in parsing xmlreport for {fileId} : {error}")
            return False

    # Save to HD3
    def save_file(self, result, processed_path):
        self.add_event_log('Saving to: ' + processed_path)

        dirname = ntpath.dirname(processed_path)
        basename = ntpath.basename(processed_path)
        folder_create(dirname)

        decoded = FileService.base64decode(result)

        if decoded:
            FileService.wrtie_binary_file(dirname, basename, decoded)
            self.add_event_log('The decoded file has been saved')
            return processed_path
        else:
            FileService.wrtie_file(
                dirname, basename + ".html",
                result)  # todo: capture better this workflow
            self.add_event_log('Decoding FAILED. The HTML file has been saved')
            return processed_path + '.html'  # todo: refactor this workflow and how this is calculated

    @log_duration
    def do_rebuild(self, endpoint, hash, source_path, dir):
        log_info(
            message=f"Starting rebuild for file {hash} on endpoint {endpoint}")
        with Duration() as duration:
            event_data = {
                "endpoint": endpoint,
                "hash": hash,
                "source_path": source_path,
                "dir": dir
            }  # todo: see if we can use a variable that holds the params data
            self.add_event_log('Starting File rebuild', event_data)

            self.meta_service.set_rebuild_server(dir, endpoint)

            encodedFile = FileService.base64encode(source_path)
            if not encodedFile:
                message = f"Failed to encode the file: {hash}"
                log_error(message=message)
                self.add_event_log(message)
                self.meta_service.set_error(dir, message)
                return False

            response = self.rebuild(endpoint, encodedFile)
            result = response.text
            if not result:
                message = f"Failed to rebuild the file : {hash}"
                log_error(message=message)
                self.add_event_log(message)
                self.meta_service.set_error(dir, message)
                return False

            try:
                for path in self.meta_service.get_original_file_paths(dir):
                    #rebuild_file_path = path
                    if path.startswith(self.config.hd1_location):
                        rebuild_file_path = path.replace(
                            self.config.hd1_location, self.config.hd3_location)
                    else:
                        rebuild_file_path = os.path.join(
                            self.config.hd3_location, path)

                    folder_create(parent_folder(
                        rebuild_file_path))  # make sure parent folder exists

                    final_rebuild_file_path = self.save_file(
                        result, rebuild_file_path
                    )  # returns actual file saved (which could be .html)

                    # todo: improve the performance of these update since each will trigger a save
                    file_size = os.path.getsize(
                        final_rebuild_file_path)  # calculate rebuilt file fize
                    rebuild_hash = self.meta_service.file_hash(
                        final_rebuild_file_path
                    )  # calculate hash of final_rebuild_file_path

                    self.meta_service.set_rebuild_file_size(dir, file_size)
                    self.meta_service.set_rebuild_file_path(
                        dir, final_rebuild_file_path
                    )  # capture final_rebuild_file_path
                    self.meta_service.set_rebuild_hash(
                        dir, rebuild_hash)  # capture it
                if not FileService.base64decode(result):
                    message = f"Engine response could not be decoded"
                    log_error(message=message, data=f"{result}")
                    self.meta_service.set_error(dir, message)
                    return False
            except Exception as error:
                message = f"Error Saving file for {hash} : {error}"
                log_error(message=message)
                self.meta_service.set_xml_report_status(dir, "No Report")
                self.meta_service.set_error(dir, message)
                return False

            headers = response.headers
            fileIdKey = "X-Adaptation-File-Id"

            # get XML report
            if fileIdKey in headers:
                if self.get_xmlreport(endpoint, headers[fileIdKey], dir):
                    self.add_event_log('The XML report has been saved')
                    self.meta_service.set_xml_report_status(dir, "Obtained")
                else:
                    self.meta_service.set_xml_report_status(
                        dir, "No XML Report")
            else:
                self.meta_service.set_xml_report_status(
                    dir, "Failed to obtain")
                message = f'No X-Adaptation-File-Id header found in the response for {hash}'
                log_error(message)
                self.add_event_log(message)
                self.meta_service.set_error(dir, message)
                return False
                #raise ValueError("No X-Adaptation-File-Id header found in the response")

            # todo: add when server side supports this
            # SDKEngineVersionKey = "X-SDK-Engine-Version"
            # SDKAPIVersionKey = "X-SDK-Api-Version"
            #
            # if SDKEngineVersionKey in headers:
            #     self.sdk_engine_version = headers[SDKEngineVersionKey]
            # if SDKAPIVersionKey in headers:
            #     self.sdk_api_version = headers[SDKAPIVersionKey]
            #
            # self.meta_service.set_server_version(dir, "Engine:" + self.sdk_engine_version + " API:" + self.sdk_api_version )
        log_info(
            message=
            f"rebuild ok for file {hash} on endpoint {endpoint} took {duration.seconds()} seconds"
        )
        return True

    @log_duration
    def processDirectory(self, endpoint, dir):
        self.add_event_log("Processing Directory: " + dir)
        hash = ntpath.basename(dir)
        if len(hash) != 64:
            self.add_event_log("Unexpected hash length")
            #raise ValueError("Unexpected hash length")
            return False

        metadata_file_path = os.path.join(dir,
                                          Metadata_Service.METADATA_FILE_NAME)
        if not (FileService.file_exist(metadata_file_path)):
            self.add_event_log("The metadate.json file does not exist")
            #raise ValueError("The metadate.json file does not exist")
            return False

        if self.meta_service.is_completed_status(dir):
            self.add_event_log("Metadata is in the COMPLETED state")
            return False

        self.add_event_log("Set metadata status IN_PROGRESS")
        self.meta_service.set_status_inprogress(dir)
        self.status.add_in_progress()

        source_path = os.path.join(dir, "source")
        if not (FileService.file_exist(source_path)):
            self.add_event_log("File does not exist")
            #raise ValueError("File does not exist")
            return False

        self.add_event_log("Sending to rebuild")
        tik = datetime.now()
        status = self.do_rebuild(endpoint, hash, source_path, dir)
        #        if status:
        #            self.meta_service.set_status(dir, FileStatus.COMPLETED)
        #            self.meta_service.set_error(dir, "none")
        #        else:
        if not status:
            self.meta_service.set_status(dir, FileStatus.FAILED)
            self.hash_json.update_status(hash, FileStatus.FAILED)

        tok = datetime.now()
        delta = tok - tik
        self.meta_service.set_rebuild_file_duration(dir, delta.total_seconds())

        return status
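
A short usage sketch for File_Processing; the endpoint URL and hash-folder path are hypothetical, imports are omitted as in the examples above, and the elastic setup calls mirror what the Loops constructor does:

# usage sketch (hypothetical endpoint and hash-folder path)
hash_folder      = "/mnt/hd2/data/<sha256-hash>"    # hypothetical hash folder created during pre-processing
events           = Events_Log(hash_folder)
meta_service     = Metadata_Service()
report_elastic   = Report_Elastic()
analysis_elastic = Analysis_Elastic()
report_elastic.setup()                              # mirrors the setup performed in the Loops constructor
analysis_elastic.setup()

processor  = File_Processing(events, Events_Log_Elastic(), report_elastic,
                             analysis_elastic, meta_service)
rebuilt_ok = processor.processDirectory("http://127.0.0.1:8080", hash_folder)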
Example #7
    def setUp(self) -> None:
        self.status = Status()
        self.storage = self.status.storage
Example #8
class test_Status(Temp_Config):
    def setUp(self) -> None:
        self.status = Status()
        self.storage = self.status.storage

    def test__FileStatus(self):
        assert inspect.getmembers(FileStatus, lambda a: type(a) is str) == [
            ('COMPLETED', 'Completed Successfully'),
            ('FAILED', 'Completed with errors'), ('INITIAL', 'Initial'),
            ('IN_PROGRESS', 'In Progress'), ('NONE', 'None'),
            ('NOT_COPIED', 'Will not be copied'), ('TO_PROCESS', 'To Process'),
            ('__module__', 'cdr_plugin_folder_to_folder.pre_processing.Status')
        ]

    def test_server_status(self):
        status = self.status
        status.get_server_status()
        data = status.data()

        assert data[Status.VAR_NUMBER_OF_CPUS] > 0

        cpu_percents = data[Status.VAR_CPU_UTILIZATION]
        assert len(cpu_percents) > 0
        assert isinstance(cpu_percents[0], (int, float))
        assert cpu_percents[0] >= 0

        ram_percent = data[Status.VAR_RAM_UTILIZATION]
        assert isinstance(ram_percent, (int, float))
        assert ram_percent > 0

        processes_count = data[Status.VAR_NUM_OF_PROCESSES]
        assert isinstance(processes_count, int)
        assert processes_count > 0

        assert data[Status.VAR_NETWORK_CONNECTIONS] >= 0

        assert data[Status.VAR_DISK_PARTITIONS] > 0

    def test_load_data(self):
        status = self.status
        assert status.data() == status.default_data()
        assert status.load_data().data() == status.default_data()
        assert status.get_files_count() == 0
        for i in range(1, 100):
            assert status.add_completed()
            assert status.get_completed() == i

            assert status.add_failed()
            assert status.get_failed() == i

            assert status.add_file()
            assert status.get_files_copied() == i

            assert status.add_in_progress()
            assert status.get_in_progress() == 1

            assert status.add_to_be_processed()
            assert status.get_files_to_process() == i

            assert status.set_stopped()
            assert status.get_current_status() == Processing_Status.STOPPED

            assert status.set_started()
            assert status.get_current_status() == Processing_Status.STARTED

            assert status.set_phase_1()
            assert status.get_current_status() == Processing_Status.PHASE_1

            assert status.set_phase_2()
            assert status.get_current_status() == Processing_Status.PHASE_2

        assert json_load_file(status.status_file_path()) == status.data()

    def test_status_file_path(self):
        assert self.status.status_file_path() == path_combine(
            self.storage.hd2_status(), Status.STATUS_FILE_NAME)
Example #9
class Pre_Processor:
    def __init__(self):
        self.config = Config()
        self.meta_service = Metadata_Service()
        self.status = Status()
        self.storage = Storage()
        self.file_name = None  # set in process() method
        self.current_path = None
        self.base_folder = None
        self.dst_folder = None
        self.dst_file_name = None

        self.status = Status()
        self.status.reset()

        #self.analysis_json = Analysis_Json()

    @log_duration
    def clear_data_and_status_folders(self):
        data_target = self.storage.hd2_data()  # todo: refactor this clean up to the storage class
        status_target = self.storage.hd2_status()
        processed_target = self.storage.hd2_processed()
        folder_delete_all(data_target)
        folder_delete_all(status_target)
        folder_delete_all(processed_target)
        folder_create(data_target)
        folder_create(status_target)
        folder_create(processed_target)
        self.status.reset()

    def file_hash(self, file_path):
        return self.meta_service.file_hash(file_path)

    def prepare_folder(self, folder_to_process):
        if folder_to_process.startswith(self.storage.hd1()):
            return folder_to_process

        dirname = os.path.join(self.storage.hd1(),
                               os.path.basename(folder_to_process))
        if os.path.isdir(dirname):
            folder_delete_all(dirname)
        try:
            folder_copy(folder_to_process, dirname)
        finally:
            return dirname      # note: returning from 'finally' suppresses any exception raised by folder_copy

    def process_folder(self, folder_to_process):
        if not os.path.isdir(folder_to_process):
            # todo: add an event log
            return False

        folder_to_process = self.prepare_folder(folder_to_process)

        files_count = 0

        for folderName, subfolders, filenames in os.walk(folder_to_process):
            for filename in filenames:
                file_path = os.path.join(folderName, filename)
                if os.path.isfile(file_path):
                    files_count += 1

        self.status.set_files_count(files_count)

        for folderName, subfolders, filenames in os.walk(folder_to_process):
            for filename in filenames:
                file_path = os.path.join(folderName, filename)
                if os.path.isfile(file_path):
                    self.process(file_path)

        return True

    @log_duration
    def process_files(self):
        self.status.StartStatusThread()
        self.status.set_phase_1()
        self.process_folder(self.storage.hd1())
        self.status.set_phase_2()
        self.status.StopStatusThread()

    @log_duration
    def process(self, file_path):
        tik = datetime.now()

        metadata = self.meta_service.create_metadata(file_path=file_path)
        file_name = metadata.get_file_name()
        original_hash = metadata.get_original_hash()
        status = metadata.get_rebuild_status()
        self.update_status(file_name, original_hash, status)

        tok = datetime.now()
        delta = tok - tik

        if metadata.is_in_todo():
            hash_folder_path = self.storage.hd2_data(original_hash)
            self.meta_service.set_hd1_to_hd2_copy_time(hash_folder_path,
                                                       delta.total_seconds())
        else:
            self.status.set_not_copied()

    def update_status(self, file_name, original_hash, status):
        if status == FileStatus.INITIAL:
            self.status.add_file()
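
A minimal usage sketch for Pre_Processor, using only the methods shown in this example; running it against a populated hd1 folder is an assumption for illustration:

# usage sketch (assumes hd1 already contains the files to pre-process)
pre_processor = Pre_Processor()
pre_processor.clear_data_and_status_folders()   # wipe and recreate hd2/data, hd2/status and hd2/processed
pre_processor.process_files()                   # phase 1: hash every file under hd1 and copy it into hd2/data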
Example #10
class Metadata:
    def __init__(self, file_hash=None):
        self.config = Config()
        self.storage = Storage()
        self.process_status = Status()
        self.metadata_utils = Metadata_Utils()
        self.path_hd1 = self.storage.hd1()
        self.data = self.default_data()
        self.file_hash = file_hash
        #self.time_field    =

    def get_from_file(self):  # todo: refactor out this method
        self.load()
        return self.data

    def load(self):
        with open(self.metadata_file_path()) as json_file:
            self.data = json.load(json_file)
        return self

    def add_file(self, file_path):
        if file_exists(file_path):
            tik = datetime.now()

            self.set_file_hash(self.metadata_utils.file_hash(file_path))

            tok = datetime.now()
            delta = tok - tik
            self.set_file_hash_calculation_time(delta.total_seconds())

            if self.exists():
                self.get_from_file()
            else:
                self.create(file_path)
            self.add_file_path(file_path)
            self.save()
            return self.file_hash

    def add_file_path(self, file_path: str):
        if self.file_hash:
            file_paths = self.data.get('original_file_paths')
            if 0 == len(file_paths):
                self.process_status.add_to_be_processed()
            if file_path.startswith(
                    self.path_hd1):  # check if path starts with hd1
                file_path = os.path.relpath(file_path, self.path_hd1)
            if file_path not in file_paths:
                file_paths.append(file_path)
            return file_paths

    def create(self, file_path):
        if self.file_hash:
            folder_create(self.metadata_folder_path())
            file_copy(file_path, self.source_file_path())
            self.set_original_file_size(file_path)
            self.set_original_file_extension(file_path)
            self.set_original_file_name(file_path)

    def default_data(self):
        return {
            'file_name': None,
            'xml_report_status': None,
            'last_update_time': None,
            'rebuild_server': None,
            'server_version': None,
            'error': None,
            'original_file_paths': [],
            'original_hash': None,
            'original_hash_calculation_time': None,
            'original_file_extension': None,
            'original_file_size': None,
            'rebuild_file_path': None,
            'rebuild_hash': None,
            'rebuild_status': FileStatus.INITIAL,
            'rebuild_file_extension': None,
            'rebuild_file_size': None,
            'rebuild_file_duration': None,
            'f2f_plugin_version': None,
            'f2f_plugin_git_commit': None,
            'hd1_to_hd2_copy_time': None,
            'hd2_to_hd3_copy_time': None
        }

    def delete(self):
        if self.exists():
            folder_delete_all(self.metadata_folder_path())
            return self.exists() is False
        return False

    def exists(self):
        return folder_exists(self.metadata_folder_path())

    def metadata_file_exists(self):
        return file_exists(self.metadata_file_path())

    def metadata_file_path(self):
        if self.file_hash:  # todo: find a better solution that having to add this to all methods
            return path_combine(self.metadata_folder_path(),
                                DEFAULT_METADATA_FILENAME)

    def metadata_folder_path(self):
        if not self.file_hash:
            return

        path = self.storage.hd2_not_processed(self.file_hash)
        if folder_exists(path):
            return path

        path = self.storage.hd2_processed(self.file_hash)
        if folder_exists(path):
            return path

        # never processed - must be in the 'todo' folder
        path = self.storage.hd2_data(self.file_hash)
        return path

    def is_in_todo(self):
        return folder_exists(self.storage.hd2_data(self.file_hash))

    def is_in_processed(self):
        return folder_exists(self.storage.hd2_processed(self.file_hash))

    def is_in_not_processed(self):
        return folder_exists(self.storage.hd2_not_processed(self.file_hash))

    def save(self):
        if self.exists():
            json_save_file_pretty(python_object=self.data,
                                  path=self.metadata_file_path())

    def update_field(
        self, field, updated_value
    ):  # todo: optimise this if we get performance hits due to multiple updates
        self.data[field] = updated_value
        self.data['last_update_time'] = datetime_now()
        self.save()

    def set_file_hash(self, file_hash):
        self.file_hash = file_hash
        self.data['original_hash'] = file_hash
        self.data['last_update_time'] = datetime_now()
        if not self.exists():
            self.save()

    def set_file_hash_calculation_time(self, seconds):
        self.data['original_hash_calculation_time'] = seconds

    def set_original_file_name(self, file_path):
        original_file_name = file_name(file_path)
        self.update_field('file_name', original_file_name)

    def set_original_file_size(self, file_path):
        file_size = os.path.getsize(file_path)
        self.update_field('original_file_size', file_size)

    def set_original_file_extension(self, file_path):
        extension = pathlib.Path(file_path).suffix
        self.update_field('original_file_extension', extension)

    def source_file_path(self):
        if self.file_hash:
            return path_combine(self.metadata_folder_path(),
                                DEFAULT_SOURCE_FILENAME)

    def get_original_hash(self):
        return self.data.get('original_hash')

    def get_file_hash(self):
        return self.file_hash

    def get_file_name(self):
        return self.data.get('file_name')

    def get_rebuild_status(self):
        return self.data.get('rebuild_status')

    def get_original_file_paths(self):
        return self.data.get('original_file_paths')

    def get_last_update_time(self):
        return self.data.get('last_update_time')

    def get_error(self):
        return self.data.get('error')

    def get_original_file_extension(self):
        return self.data.get('original_file_extension')

    def report_file_path(self):
        if self.file_hash:
            return path_combine(self.metadata_folder_path(),
                                DEFAULT_REPORT_FILENAME)

    def report_file_exists(self):
        return file_exists(self.report_file_path())
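
A minimal usage sketch for the Metadata class above; the source path is hypothetical and imports are omitted as in the example:

# usage sketch (hypothetical file path under hd1)
metadata  = Metadata()
file_hash = metadata.add_file("/mnt/hd1/invoice.pdf")   # hashes the file, copies it into its hash folder and saves metadata.json
if file_hash:
    print(metadata.get_file_name(), metadata.get_rebuild_status())   # expected: 'invoice.pdf', FileStatus.INITIAL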