Example #1
class File_Processing:
    def __init__(self, events_log, events_elastic, report_elastic,
                 analysis_elastic, meta_service):
        self.meta_service = meta_service
        self.events_log = events_log
        self.events_elastic = events_elastic
        self.storage = Storage()
        self.config = Config()
        self.status = Status()
        self.hash_json = Hash_Json()
        self.report_elastic = report_elastic
        self.sdk_api_version = "Not available"
        self.sdk_engine_version = "Not available"

        self.analysis_json = Analysis_Json()
        self.analysis_elastic = analysis_elastic

    def add_event_log(self, message, event_data=None):  # avoid a mutable default argument
        json_data = self.events_log.add_log(message, event_data or {})
        self.events_elastic.add_event_log(json_data)

    def base64request(self, endpoint, api_route, base64enc_file):
        try:
            url = endpoint + "/" + api_route

            payload = json.dumps({"Base64": base64enc_file})

            headers = {'Content-Type': 'application/json'}

            return requests.request("POST",
                                    url,
                                    headers=headers,
                                    data=payload,
                                    timeout=int(self.config.request_timeout))

        except Exception as e:
            log_error(str(e))
            raise ValueError(str(e))

    def xmlreport_request(self, endpoint, fileID):
        try:
            url = endpoint + "/api/Analyse/xmlreport?fileId=" + fileID

            payload = ""
            headers = {'Content-Type': 'application/octet-stream'}

            response = requests.request("GET",
                                        url,
                                        headers=headers,
                                        data=payload,
                                        timeout=int(
                                            self.config.request_timeout))
            return response.text

        except Exception as e:
            raise ValueError(str(e))

    def rebuild(self, endpoint, base64enc_file):
        return self.base64request(endpoint, "api/rebuild/base64",
                                  base64enc_file)

    def get_xmlreport(self, endpoint, fileId, dir):
        log_info(message=f"getting XML Report for {fileId} at {endpoint}")

        xmlreport = self.xmlreport_request(endpoint, fileId)
        if not xmlreport:
            raise ValueError('Failed to obtain the XML report')

        try:
            json_obj = xmltodict.parse(xmlreport)

            file_extension = json_obj["gw:GWallInfo"]["gw:DocumentStatistics"][
                "gw:DocumentSummary"]["gw:FileType"]
            self.meta_service.set_rebuild_file_extension(dir, file_extension)
            json_obj['original_hash'] = os.path.basename(dir)
            json_save_file_pretty(json_obj, os.path.join(dir, "report.json"))

            #self.report_elastic.add_report(json_obj)

            analysis_obj = self.analysis_json.get_file_analysis(
                os.path.basename(dir), json_obj)
            json_save_file_pretty(analysis_obj,
                                  os.path.join(dir, "analysis.json"))

            self.analysis_elastic.add_analysis(analysis_obj)

            return True
        except Exception as error:
            log_error(
                message=f"Error in parsing xmlreport for {fileId} : {error}")
            return False

    # Save to HD3
    def save_file(self, result, processed_path):
        self.add_event_log('Saving to: ' + processed_path)

        dirname = ntpath.dirname(processed_path)
        basename = ntpath.basename(processed_path)
        folder_create(dirname)

        decoded = FileService.base64decode(result)

        if decoded:
            FileService.wrtie_binary_file(dirname, basename, decoded)
            self.add_event_log('The decoded file has been saved')
            return processed_path
        else:
            FileService.wrtie_file(
                dirname, basename + ".html",
                result)  # todo: capture this workflow better
            self.add_event_log('Decoding FAILED. The HTML file has been saved')
            return processed_path + '.html'  # todo: refactor this workflow and how this is calculated

    @log_duration
    def do_rebuild(self, endpoint, hash, source_path, dir):
        log_info(
            message=f"Starting rebuild for file {hash} on endpoint {endpoint}")
        with Duration() as duration:
            event_data = {
                "endpoint": endpoint,
                "hash": hash,
                "source_path": source_path,
                "dir": dir
            }  # todo: see if we can use a variable that holds the params data
            self.add_event_log('Starting File rebuild', event_data)

            self.meta_service.set_rebuild_server(dir, endpoint)

            encodedFile = FileService.base64encode(source_path)
            if not encodedFile:
                message = f"Failed to encode the file: {hash}"
                log_error(message=message)
                self.add_event_log(message)
                self.meta_service.set_error(dir, message)
                return False

            response = self.rebuild(endpoint, encodedFile)
            result = response.text
            if not result:
                message = f"Failed to rebuild the file : {hash}"
                log_error(message=message)
                self.add_event_log(message)
                self.meta_service.set_error(dir, message)
                return False

            try:
                for path in self.meta_service.get_original_file_paths(dir):
                    #rebuild_file_path = path
                    if path.startswith(self.config.hd1_location):
                        rebuild_file_path = path.replace(
                            self.config.hd1_location, self.config.hd3_location)
                    else:
                        rebuild_file_path = os.path.join(
                            self.config.hd3_location, path)

                    folder_create(parent_folder(
                        rebuild_file_path))  # make sure parent folder exists

                    final_rebuild_file_path = self.save_file(
                        result, rebuild_file_path
                    )  # returns actual file saved (which could be .html)

                    # todo: improve the performance of these updates since each one triggers a save
                    file_size = os.path.getsize(
                        final_rebuild_file_path)  # calculate rebuilt file size
                    rebuild_hash = self.meta_service.file_hash(
                        final_rebuild_file_path
                    )  # calculate hash of final_rebuild_file_path

                    self.meta_service.set_rebuild_file_size(dir, file_size)
                    self.meta_service.set_rebuild_file_path(
                        dir, final_rebuild_file_path
                    )  # capture final_rebuild_file_path
                    self.meta_service.set_rebuild_hash(
                        dir, rebuild_hash)  # capture it
                if not FileService.base64decode(result):
                    message = f"Engine response could not be decoded"
                    log_error(message=message, data=f"{result}")
                    self.meta_service.set_error(dir, message)
                    return False
            except Exception as error:
                message = f"Error Saving file for {hash} : {error}"
                log_error(message=message)
                self.meta_service.set_xml_report_status(dir, "No Report")
                self.meta_service.set_error(dir, message)
                return False

            headers = response.headers
            fileIdKey = "X-Adaptation-File-Id"

            # get XML report
            if fileIdKey in headers:
                if self.get_xmlreport(endpoint, headers[fileIdKey], dir):
                    self.add_event_log('The XML report has been saved')
                    self.meta_service.set_xml_report_status(dir, "Obtained")
                else:
                    self.meta_service.set_xml_report_status(
                        dir, "No XML Report")
            else:
                self.meta_service.set_xml_report_status(
                    dir, "Failed to obtain")
                message = f'No X-Adaptation-File-Id header found in the response for {hash}'
                log_error(message)
                self.add_event_log(message)
                self.meta_service.set_error(dir, message)
                return False
                #raise ValueError("No X-Adaptation-File-Id header found in the response")

            # todo: add when server side supports this
            # SDKEngineVersionKey = "X-SDK-Engine-Version"
            # SDKAPIVersionKey = "X-SDK-Api-Version"
            #
            # if SDKEngineVersionKey in headers:
            #     self.sdk_engine_version = headers[SDKEngineVersionKey]
            # if SDKAPIVersionKey in headers:
            #     self.sdk_api_version = headers[SDKAPIVersionKey]
            #
            # self.meta_service.set_server_version(dir, "Engine:" + self.sdk_engine_version + " API:" + self.sdk_api_version )
        log_info(
            message=
            f"rebuild ok for file {hash} on endpoint {endpoint} took {duration.seconds()} seconds"
        )
        return True

    @log_duration
    def processDirectory(self, endpoint, dir):
        self.add_event_log("Processing Directory: " + dir)
        hash = ntpath.basename(dir)
        if len(hash) != 64:
            self.add_event_log("Unexpected hash length")
            #raise ValueError("Unexpected hash length")
            return False

        metadata_file_path = os.path.join(dir,
                                          Metadata_Service.METADATA_FILE_NAME)
        if not (FileService.file_exist(metadata_file_path)):
            self.add_event_log("The metadate.json file does not exist")
            #raise ValueError("The metadate.json file does not exist")
            return False

        if self.meta_service.is_completed_status(dir):
            self.add_event_log("Metadata is in the COMPLETED state")
            return False

        self.add_event_log("Set metadata status IN_PROGRESS")
        self.meta_service.set_status_inprogress(dir)
        self.status.add_in_progress()

        source_path = os.path.join(dir, "source")
        if not (FileService.file_exist(source_path)):
            self.add_event_log("File does not exist")
            #raise ValueError("File does not exist")
            return False

        self.add_event_log("Sending to rebuild")
        tik = datetime.now()
        status = self.do_rebuild(endpoint, hash, source_path, dir)
        #        if status:
        #            self.meta_service.set_status(dir, FileStatus.COMPLETED)
        #            self.meta_service.set_error(dir, "none")
        #        else:
        if not status:
            self.meta_service.set_status(dir, FileStatus.FAILED)
            self.hash_json.update_status(hash, FileStatus.FAILED)

        tok = datetime.now()
        delta = tok - tik
        self.meta_service.set_rebuild_file_duration(dir, delta.total_seconds())

        return status
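
A minimal usage sketch for File_Processing, mirroring how Loops.ProcessDirectoryWithEndpoint (Example #2) wires it together. The endpoint URL and hash-folder path are placeholders, and imports are omitted because the source does not show the module layout:

# Usage sketch (placeholder endpoint and folder path; wiring mirrors Example #2)
hash_folder     = "/path/to/hd2/data/<hash>"      # placeholder per-file folder named by its SHA-256 hash
meta_service    = Metadata_Service()
events          = Events_Log(hash_folder)         # per-folder event log
file_processing = File_Processing(events,
                                  Events_Log_Elastic(),
                                  Report_Elastic(),
                                  Analysis_Elastic(),
                                  meta_service)

endpoint  = "http://127.0.0.1:8080"               # placeholder rebuild endpoint
processed = file_processing.processDirectory(endpoint, hash_folder)

processDirectory returns False when the folder name is not a 64-character hash, when metadata.json or the source file is missing, or when the metadata is already in the COMPLETED state.
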
Example #2
class Loops(object):

    continue_processing = False
    processing_started = False
    lock = asyncio.Lock()

    def __init__(self):
        self.use_es = False
        self.config = Config()
        self.status = Status()
        self.storage = Storage()
        self.hash_json = Hash_Json()
        self.events = Events_Log(self.config.hd2_status_location)
        self.events_elastic = Events_Log_Elastic()
        self.hash = None
        self.report_elastic = Report_Elastic()
        self.analysis_elastic = Analysis_Elastic()
        self.report_elastic.setup()
        self.analysis_elastic.setup()
        create_folder(self.storage.hd2_processed())
        create_folder(self.storage.hd2_not_processed())

    def IsProcessing(self):
        return Loops.processing_started

    def StopProcessing(self):
        Loops.continue_processing = False

    def HasBeenStopped(self):
        return not Loops.continue_processing

    def git_commit(self):
        git_commit = 'Not available'
        try:
            git_commit = subprocess.check_output(['git', 'rev-parse', 'HEAD'
                                                  ]).decode("utf-8").rstrip()
        except Exception:
            pass

        return git_commit

    def ProcessDirectoryWithEndpoint(self, itempath, file_hash,
                                     endpoint_index):

        if not os.path.isdir(itempath):
            return False

        log_info(
            message=
            f"Starting ProcessDirectoryWithEndpoint on endpoint # {endpoint_index} for file {file_hash}"
        )
        meta_service = Metadata_Service()
        original_file_path = meta_service.get_original_file_paths(itempath)
        events = Events_Log(itempath)

        endpoint = "http://" + self.config.endpoints['Endpoints'][
            endpoint_index]['IP'] + ":" + self.config.endpoints['Endpoints'][
                endpoint_index]['Port']
        events.add_log("Processing with: " + endpoint)

        meta_service.set_f2f_plugin_version(itempath, API_VERSION)
        meta_service.set_f2f_plugin_git_commit(itempath, self.git_commit())

        try:
            file_processing = File_Processing(events, self.events_elastic,
                                              self.report_elastic,
                                              self.analysis_elastic,
                                              meta_service)
            if not file_processing.processDirectory(endpoint, itempath):
                events.add_log("CANNOT be processed")
                return False

            log_data = {
                'file': original_file_path,
                'status': FileStatus.COMPLETED,
                'error': 'none',
                'timestamp': datetime.now(),
            }
            log_info('ProcessDirectoryWithEndpoint', data=log_data)
            meta_service.set_error(itempath, "none")
            meta_service.set_status(itempath, FileStatus.COMPLETED)
            self.hash_json.update_status(file_hash, FileStatus.COMPLETED)
            events.add_log("Has been processed")
            return True
        except Exception as error:
            log_data = {
                'file': original_file_path,
                'status': FileStatus.FAILED,
                'error': str(error),
            }
            log_error(message='error in ProcessDirectoryWithEndpoint',
                      data=log_data)
            meta_service.set_error(itempath, str(error))
            meta_service.set_status(itempath, FileStatus.FAILED)
            self.hash_json.update_status(file_hash, FileStatus.FAILED)
            events.add_log("ERROR:" + str(error))
            return False

    def ProcessDirectory(self, thread_data):
        (itempath, file_hash, process_index) = thread_data
        endpoint_index = process_index % self.config.endpoints_count
        if not Loops.continue_processing:
            return False
        tik = datetime.now()
        process_result = self.ProcessDirectoryWithEndpoint(
            itempath, file_hash, endpoint_index)

        if process_result:
            self.status.add_completed()

            tok = datetime.now()
            delta = tok - tik

            meta_service = Metadata_Service()
            meta_service.set_hd2_to_hd3_copy_time(itempath,
                                                  delta.total_seconds())
        else:
            self.status.add_failed()

        return process_result

        # note: removing retries from this method (it should not be handled like this)
        #for idx in range(self.config.endpoints_count):
        #    if self.ProcessDirectoryWithEndpoint(itempath, file_hash, endpoint_index):
        #        return
        #    # The Endpoint failed to process the file
        #    # Retry it with the next one
        #    endpoint_index = (endpoint_index + 1) % self.config.endpoints_count

    def updateHashJson(self):
        self.hash_json.reset()
        meta_service = Metadata_Service()

        for hash_folder in os.listdir(self.storage.hd2_data()):

            metadata_folder = self.storage.hd2_data(hash_folder)

            if not os.path.isdir(metadata_folder):
                continue

            metadata = meta_service.get_from_file(metadata_folder)
            file_name = metadata.get_file_name()
            original_hash = metadata.get_original_hash()
            status = metadata.get_rebuild_status()

            if status != FileStatus.COMPLETED:
                self.hash_json.add_file(original_hash, file_name)

        self.hash_json.save()
        self.status.set_processing_counters(len(self.hash_json.data()))
        return self.hash_json.data()

    def moveProcessedFiles(self):
        json_list = self.hash_json.data()

        for key in json_list:

            source_path = self.storage.hd2_data(key)

            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                destination_path = self.storage.hd2_processed(key)

                if folder_exists(destination_path):
                    folder_delete_all(destination_path)

                shutil.move(source_path, destination_path)

            if (FileStatus.FAILED == json_list[key]["file_status"]):

                meta_service = Metadata_Service()
                meta_service.get_from_file(source_path)
                metadata = meta_service.metadata
                if ("Engine response could not be decoded" == metadata.get_error()) and \
                    metadata.get_original_file_extension() in ['.xml', '.json']:
                    destination_path = self.storage.hd2_not_processed(key)

                    if folder_exists(destination_path):
                        folder_delete_all(destination_path)

                    shutil.move(source_path, destination_path)

    def LoopHashDirectoriesInternal(self, thread_count, do_single):

        if not folder_exists(self.storage.hd2_data()):
            log_message = "ERROR: rootdir does not exist: " + self.storage.hd2_data()
            log_error(log_message)
            return False

        if not isinstance(thread_count, int):
            raise TypeError("thread_count must be an integer")

        if not isinstance(do_single, bool):
            raise TypeError("do_single must be a boolean")

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        json_list = self.updateHashJson()

        log_message = f"LoopHashDirectoriesInternal started with {thread_count} threads"
        self.events.add_log(log_message)
        log_info(log_message)

        threads = list()

        process_index = 0

        log_info(
            message=f'before Mapping thread_data for {len(json_list)} files')
        thread_data = []
        for key in json_list:
            file_hash = key

            itempath = self.storage.hd2_data(key)
            if (FileStatus.COMPLETED == json_list[key]["file_status"]):
                self.events.add_log(
                    "The file processing has already been completed")
                continue

            if not os.path.exists(itempath):
                self.events.add_log(
                    f"ERROR: Path \"{itempath}\" does not exist")
                json_list[key]["file_status"] = FileStatus.FAILED
                continue

            process_index += 1
            thread_data.append((
                itempath,
                file_hash,
                process_index,
            ))
            # # limit the number of parallel threads
            #
            # if process_index % int(thread_count) == 0:                      # todo: refactor this workflow to use multiprocess and queues
            #     # Clean up the threads
            #     for index, thread in enumerate(threads):                    # todo: since at the moment this will block allocating new threads until
            #         thread.join()                                           #       all have finishing execution
            #
            # process_index += 1
            # log_info(message=f"in LoopHashDirectoriesInternal process_index={process_index} , thread #{process_index % int(thread_count) }")
            # x = threading.Thread(target=self.ProcessDirectory, args=(itempath, file_hash, process_index,))
            # threads.append(x)
            # x.start()
            #
            # if do_single:
            #     break
            #
            # if not Loops.continue_processing:
            #     break

        # for index, thread in enumerate(threads):
        #     thread.join()

        log_info(
            message=
            f'after mapped thread_data, there are {len(thread_data)} mapped items'
        )
        #thread_data = thread_data[:500]
        #log_info(message=f'to start with only processing {len(thread_data)} thread_data items')
        pool = ThreadPool(thread_count)
        results = pool.map(self.ProcessDirectory, thread_data)
        pool.close()
        pool.join()

        self.moveProcessedFiles()

        self.events.add_log("LoopHashDirectoriesInternal finished")
        return True

    async def LoopHashDirectoriesAsync(self, thread_count, do_single=False):
        await Loops.lock.acquire()
        try:
            Loops.continue_processing = True
            Loops.processing_started = True
            self.status.set_started()
            self.LoopHashDirectoriesInternal(thread_count, do_single)
        finally:
            Loops.processing_started = False
            Loops.lock.release()
            self.status.set_stopped()
            self.hash_json.save()

    @log_duration
    def LoopHashDirectories(self, thread_count=None):
        #Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(
                message=
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        self.status.StartStatusThread()
        thread_count = thread_count or self.config.thread_count
        log_info(message="in LoopHashDirectories, about to start main loop")
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(thread_count))
        log_info(message="in LoopHashDirectories, Loop completed")
        self.status.StopStatusThread()
        return True

    @log_duration
    def LoopHashDirectoriesSequential(self):
        #Allow only a single loop to be run at a time
        if self.IsProcessing():
            log_error(
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1))
        return True

    @log_duration
    def ProcessSingleFile(self):
        if self.IsProcessing():
            log_error(
                "ERROR: Attempt to start processing while processing is in progress"
            )
            return False

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.LoopHashDirectoriesAsync(1, True))
        return True
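
A minimal driver sketch for Example #2, assuming the endpoints and hd2 folder layout are already configured; the thread count is illustrative and falls back to Config().thread_count when omitted:

# Driver sketch (illustrative thread count)
loops = Loops()
loops.LoopHashDirectories(thread_count=4)   # blocks until every pending hash folder has been processed
loops.ProcessSingleFile()                   # alternative: process a single pending folder

StopProcessing() can be called from another thread to ask a running loop to stop picking up new folders.
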
class test_Hash_Json(TestCase):

    test_file = None

    @classmethod
    def setUpClass(cls) -> None:
        cls.test_file      = temp_file(contents='Static text so that we have a static hash')
        cls.test_file_name = file_name(cls.test_file)
        cls.test_file_hash = '500286533bf75d769e9180a19414d1c3502dd52093e7351a0a9b1385d8f8961c'

    @classmethod
    def tearDownClass(cls) -> None:
        file_delete(cls.test_file)

    def setUp(self) -> None:
        self.hash_json = Hash_Json()
        self.storage   = self.hash_json.storage

    def test___init__(self):
        assert abspath(self.hash_json.folder()) == self.storage.hd2_status()

    @patch("multiprocessing.queues.Queue.put_nowait")
    def test_add_file(self, patch_log_error):
        hash_data = self.hash_json.load()
        if hash_data.get(self.test_file_hash):
            del hash_data[self.test_file_hash]

        assert self.hash_json.add_file(self.test_file_hash, self.test_file_name) is True
        self.hash_json.save()
        assert hash_data.get(self.test_file_hash) == {'file_name': self.test_file_name, 'file_status': 'Initial'}

        assert self.hash_json.add_file('AAAA'              , self.test_file_name) is False
        assert self.hash_json.add_file(self.test_file_hash , None               ) is False
        assert self.hash_json.add_file(None                , None               ) is False

        assert patch_log_error.mock_calls == [call({'level': 'ERROR', 'message': 'in Hash_Json.add_file bad data provided', 'data': {'file_hash': 'AAAA'             , 'file_name': self.test_file_name}, 'duration': 0, 'from_method': 'add_file', 'from_class': 'Hash_Json'}),
                                              call({'level': 'ERROR', 'message': 'in Hash_Json.add_file bad data provided', 'data': {'file_hash': self.test_file_hash, 'file_name': None               }, 'duration': 0, 'from_method': 'add_file', 'from_class': 'Hash_Json'}),
                                              call({'level': 'ERROR', 'message': 'in Hash_Json.add_file bad data provided', 'data': {'file_hash': None               , 'file_name': None               }, 'duration': 0, 'from_method': 'add_file', 'from_class': 'Hash_Json'})]



    def test_get_file_path(self):
        file_path = abspath(self.hash_json.get_file_path())
        assert file_exists(file_path)
        assert file_path == path_combine(self.storage.hd2_status(), Hash_Json.HASH_FILE_NAME)

    def test_load(self):
        data = self.hash_json.load()
        assert type(data) is dict
        assert self.hash_json.data() == data

    def test_data(self):
        assert self.hash_json.data() == self.hash_json._hash_json_data

    def test_is_hash(self):
        test_file   = temp_file(contents='aaaa')
        file_hash   = Metadata_Utils().file_hash(test_file)                         # create hash from file
        text_hash   = str_sha256('asd')                                             # create hash from string

        assert self.hash_json.is_hash(file_hash         ) is True                   # confirm both are valid hashes
        assert self.hash_json.is_hash(text_hash         ) is True

        assert self.hash_json.is_hash(None              ) is False                  # testing all sorts of corner cases
        assert self.hash_json.is_hash(''                ) is False                  # empty strings
        assert self.hash_json.is_hash('aaaa'            ) is False                  # non hash string
        assert self.hash_json.is_hash(file_hash + 'aaaa') is False                  # confirm only exact matches work
        assert self.hash_json.is_hash(text_hash + 'aaaa') is False
        assert self.hash_json.is_hash('aaa' + file_hash ) is False
        assert self.hash_json.is_hash(text_hash + '\nb`') is False                  # confirm content in new lines is also not a match
        assert self.hash_json.is_hash('a\n' + file_hash ) is False

        file_delete(test_file)

    def test_save(self):
        target_file = temp_file()                                                   # temp file to save data
        assert file_not_exists(target_file)                                         # confirm it doesn't exist
        with patch.object(Hash_Json, 'get_file_path', return_value=target_file):    # patch get_file_path to return temp file path
            assert self.hash_json.get_file_path() == target_file                    # confirm patch is in place
            self.hash_json.save()                                                    # call save() to write the data out
            assert file_exists(target_file)                                         # confirm temp file now exists
            assert self.hash_json.load() == self.hash_json.data()                    # confirm reloaded data is correct
            assert json_load_file(target_file) == self.hash_json.data()              # also confirm using direct json load of temp file
        assert self.hash_json.get_file_path() != target_file                         # confirm patch is not there (after 'with' ends)
        file_delete(target_file)                                                    # delete temp file

    def test_update_status(self):
        temp_data_file = temp_file()
        with patch.object(Hash_Json, 'get_file_path', return_value=temp_data_file):
            self.hash_json.add_file(self.test_file_hash, self.test_file_name)
            assert self.hash_json.data()[self.test_file_hash]['file_status'] == 'Initial'
            self.hash_json.update_status(self.test_file_hash, 'BBBB')
            self.hash_json.save()
            assert self.hash_json.data()[self.test_file_hash]['file_status'] == 'BBBB'
            assert json_load_file(temp_data_file)[self.test_file_hash]['file_status'] == 'BBBB'
        pprint(self.hash_json.load())

    def test_data_bug(self):                                            # this test confirms the bug
        hashes = self.hash_json.data()
        for hash in self.hash_json.data():
            if len(hash) == 64:                                         # all keys in this object should be a hash
                assert len(hash) == 64
                assert type(hashes[hash]) == dict                       # with all items being a dictionary
                assert list_set(hashes[hash]) == ['file_name', 'file_status']
            else:
                assert hash == "file_list"                              # but the old schema is still present
                assert type(hashes[hash]) == list                       # with the data being a list
                assert list_set(hashes[hash][0]) == ['file_name', 'file_status', 'hash', 'id']
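
For reference, the data shape these assertions exercise looks like the sketch below; the keys come from test_add_file and test_data_bug, while the file name is a placeholder:

# Shape of Hash_Json.data() as asserted by the tests above (file name is a placeholder)
hash_json_data = {
    '500286533bf75d769e9180a19414d1c3502dd52093e7351a0a9b1385d8f8961c': {
        'file_name'  : 'example-document.pdf',       # placeholder name
        'file_status': 'Initial'                     # later updated to COMPLETED or FAILED
    },
    'file_list': [                                   # legacy schema still present in old files (see test_data_bug)
        {'file_name': '...', 'file_status': '...', 'hash': '...', 'id': '...'}
    ]
}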