Beispiel #1
0
    def replace(self, result: Result):
        """Persist an updated result and swap it into the in-memory registry.

        Args:
            result: The new state of a result whose ``id`` is already registered.

        Returns:
            The ``result`` object that was passed in.

        Raises:
            Exception: If no (truthy) entry is registered under ``result.id``.
        """
        current = self._results.get(result.id, None)
        if not current:
            raise Exception(
                "replace_result",
                f"Result with ID {result.id} is not known")

        # Persist first: if writing fails, the registry is left unchanged.
        target_file = str(result.storage_path / self._config.result_filename)
        payload = transform_dataclass_to_dict(result)
        write_json_file(target_file, payload)

        # Drop the old entry before re-adding it; with a plain dict this moves
        # the key to the end of the insertion order.
        self._results.pop(result.id)
        self._results[result.id] = result
        return result
Beispiel #2
0
    def replace(self, snap):
        """Persist an updated snapshot and swap it into the in-memory registry.

        Args:
            snap: The new state of a snapshot whose ``id`` is already registered.

        Returns:
            The ``snap`` object that was passed in.

        Raises:
            Exception: If no (truthy) entry is registered under ``snap.id``.
        """
        current = self._snapshots.get(snap.id, None)
        if not current:
            raise Exception(
                "replace_snapshot",
                f"Snapshot with ID {snap.id} is not known")

        # Persist first: if writing fails, the registry is left unchanged.
        snapshot_file = Path(snap.storage_path) / self._config.model_snapshot_filename
        payload = transform_dataclass_to_dict(snap)
        write_json_file(str(snapshot_file), payload)

        # Drop the old entry before re-adding it; with a plain dict this moves
        # the key to the end of the insertion order.
        self._snapshots.pop(snap.id)
        self._snapshots[snap.id] = snap
        return snap
Beispiel #3
0
    def create_result(self, result: Result) -> Union[Exception, Result]:
        """Register a new result and create its directory and JSON file.

        The storage directory is derived from the configured platform base
        directory, the result base directory and the result ID.

        Args:
            result: The result to register; its ``id`` must not be registered yet.

        Returns:
            A copy of ``result`` whose ``storage_path`` points at the newly
            created directory. Errors are raised, not returned.

        Raises:
            Exception: If a (truthy) entry is already registered under ``result.id``.
        """
        already_registered = self._results.get(result.id, None)
        if already_registered:
            raise Exception(
                "create_result",
                f"Result with ID {result.id} is already known")

        # Create result on filesystem. mkdir is called without exist_ok, so a
        # pre-existing directory is treated as an error (pathlib semantics).
        storage_path = (self._config.platform_base_dir
                        / self._config.result_base_dir
                        / str(result.id))
        Path(storage_path).mkdir(parents=True)
        stored = dataclasses.replace(result, storage_path=storage_path)

        result_file = str(storage_path / self._config.result_filename)
        write_json_file(result_file, transform_dataclass_to_dict(stored))

        # Register the copy that carries the storage path.
        self._results[stored.id] = stored
        return stored
Beispiel #4
0
    def create_snapshot(
            self, snap: ModelSnapshot) -> Union[Exception, ModelSnapshot]:
        """Register a new snapshot and create its directory and JSON file.

        Args:
            snap: The snapshot to register; its ``id`` must not be registered
                yet and ``storage_path`` names the directory to create.

        Returns:
            The ``snap`` object that was passed in. Errors are raised, not
            returned.

        Raises:
            Exception: If a (truthy) entry is already registered under ``snap.id``.
        """
        already_registered = self._snapshots.get(snap.id, None)
        if already_registered:
            raise Exception(
                "create_snapshot",
                f"Snapshot with ID {snap.id} is already known")

        # Create snapshot on filesystem. mkdir is called without exist_ok, so a
        # pre-existing directory is treated as an error (pathlib semantics).
        snapshot_dir = Path(snap.storage_path)
        snapshot_dir.mkdir(parents=True)

        snapshot_file = snapshot_dir / self._config.model_snapshot_filename
        write_json_file(str(snapshot_file), transform_dataclass_to_dict(snap))

        # Register the snapshot only after it has been written to disk.
        self._snapshots[snap.id] = snap
        return snap
Beispiel #5
0
    def monitor_wait_container_execution_train(self, context: ModelSnapshot) -> ModelSnapshot:
        """Wait for the training container to stop, archive its logs and notify the user.

        While the container reports one of ``CONTAINER_RUNNING_STATUSES``, one
        stats sample is collected per 10-second poll. Once it has stopped, the
        exit state is read from the container attributes, the build log, run
        log and collected statistics are written below ``context.storage_path``,
        and a result e-mail is sent.

        Args:
            context: Snapshot describing the run; ``container_id`` selects the
                container to watch.

        Returns:
            A copy of ``context`` with ``container_performance_statistics``,
            ``container_info``, ``container_run_logs`` and ``success`` set.
        """
        # Get container object
        running_container = self._d.containers.get(context.container_id)

        # Wait for termination and collect performance statistics
        container_statistics = []
        while running_container.status in CONTAINER_RUNNING_STATUSES:
            container_statistics.append(running_container.stats(stream=False))
            time.sleep(10)

            # Reload container data so the status check sees fresh state
            running_container.reload()

        context = dataclasses.replace(context, container_performance_statistics=container_statistics)

        # Container exited, check if success or failure
        state = running_container.attrs["State"]
        container_info = {
            "start_time": state["StartedAt"],
            "end_time": state["FinishedAt"],
            "exit_code": state["ExitCode"],
            "exit_message": state["Error"],
            "OOMKilled": state["OOMKilled"],
            "Dead": state["Dead"],
        }

        # Prefix the exit message with the abnormal termination reason, if any
        if container_info["OOMKilled"]:
            container_info["exit_message"] = f"OOMKilled {container_info['exit_message']}"
        elif container_info["Dead"]:
            container_info["exit_message"] = f"Dead {container_info['exit_message']}"

        context = dataclasses.replace(context, container_info=container_info)

        # Preserve the logs. A missing push log is treated as empty so the
        # concatenation cannot raise TypeError on None.
        log_container_push = f"Started: {get_timestamp()}\n" + (context.container_push_logs or "")
        log_container_prepare = (
            f"Pull Log: \n{context.container_pull_logs}\n"
            f"Build Log:\n{context.container_build_logs}\n"
            f"Push Log:\n{log_container_push}"
        )

        # If pre-processing was performed, add its logs as well
        if context.container_pre_processing_logs:
            log_container_prepare += f"\nPre-processing Log: {context.container_pre_processing_logs}"

        # Write build log
        write_to_file(Path(context.storage_path) / f"Container_Build_{get_timestamp(date_format='filename')}.log",
                      log_container_prepare)

        # Fetch the container output once. It is arbitrary bytes written by the
        # user's process, so undecodable bytes are replaced instead of letting a
        # UnicodeDecodeError abort log archiving and the notification.
        container_output = running_container.logs(timestamps=True).decode("utf-8", errors="replace")
        succeeded = container_info["exit_code"] == 0

        run_log_parts = [f"Result: {succeeded}\n"]
        if not succeeded and container_info["exit_message"]:
            run_log_parts.append(f"Errormessage: {container_info['exit_message']}\n")
        run_log_parts.append(f"Started: {container_info['start_time']}\n")
        run_log_parts.append(f"Finished: {container_info['end_time']}\n\nOutput:\n\n")
        run_log_parts.append(container_output)
        log_container_running = "".join(run_log_parts)

        context = dataclasses.replace(context, container_run_logs=container_output)

        # Write runtime log to disk (ANSI escape sequences stripped once and
        # reused for the e-mail attachment below)
        clean_run_log = remove_ansi_escape_tags(log_container_running)
        write_to_file(Path(context.storage_path) / f"Container_Output_{get_timestamp(date_format='filename')}.log",
                      clean_run_log)

        # Write performance statistics to file
        write_json_file(filename=Path(context.storage_path) / "Container_Performance_statistics.json",
                        content=container_statistics)

        context = dataclasses.replace(context, success=succeeded)

        # Successful runs of images whose name starts with 'mon_' send no
        # e-mail; failures are always reported.
        if succeeded and context.container_image_name.startswith('mon_'):
            return context

        # Build and output logs are attached to both kinds of notification
        attachments = [{
            "Filename": f"Container-Build_{context.container_name}_{get_timestamp(date_format='filename')}.log",
            "Content": log_container_prepare
        }, {
            "Filename": f"Container-Output_{context.container_name}_{get_timestamp(date_format='filename')}.log",
            "Content": clean_run_log
        }]

        if succeeded:
            # Only the success mail carries the performance statistics
            attachments.append({
                "Filename": f"Container-PerformanceStatistics_{context.container_name}_{get_timestamp(date_format='filename')}.json",
                "Content": json.dumps(container_statistics)
            })
            subject = f"Successful Model Training Pipeline: {context.id}"
            body = ("Congratulations!\n"
                    "Your model training pipeline succeeded\n"
                    "Check the attached logs for further details.\n")
        else:
            subject = f"Failed Model Training Pipeline: {context.container_name}"
            body = ("Unfortunately, your training pipeline failed.\n"
                    "Check the attached logs to solve the issues.\n"
                    "If you have questions, please ask your administrator.")

        # Send e-mail to inform the user
        send_email_config(self._config,
                          subject=subject,
                          body=body,
                          attachments=attachments)
        return context