def _get_output_disk(step_uuid: str, serialization: str) -> Any:
    """Gets data from disk.

    Args:
        step_uuid: The UUID of the step to get output data from.
        serialization: The serialization for the output. For possible
            values see :class:`Serialization`.

    Returns:
        Data from the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found.
        DeserializationError: If the data could not be deserialized.

    """
    # The data file carries the step's UUID as its name and lives inside
    # the step's data directory.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    full_path = os.path.join(step_data_dir, step_uuid)

    try:
        return _deserialize_output_disk(full_path, serialization=serialization)
    # NOTE: FileNotFoundError is a subclass of IOError (OSError), so it
    # has to be handled before the broader clause below.
    except FileNotFoundError as e:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        raise error.DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it."
        ) from e
    # IOError is to try to catch pyarrow failures on opening the file.
    except (pickle.UnpicklingError, IOError) as e:
        # Chain explicitly so the low-level failure is kept as the cause
        # of the user-facing error.
        raise error.DeserializationError(
            f'Output from incoming step "{step_uuid}" ({full_path}) '
            "could not be deserialized."
        ) from e
def get_output_disk(step_uuid: str, serialization: str = "arrow") -> Any:
    """Gets data from disk.

    Args:
        step_uuid: The UUID of the step to get output data from.
        serialization: The serialization of the output. Has to be
            specified in order to deserialize correctly.

    Returns:
        Data from the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found.

    """
    # The data file carries the step's UUID as its name and lives inside
    # the step's data directory.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    full_path = os.path.join(step_data_dir, step_uuid)

    try:
        # NOTE(review): the helper is handed the full path rather than
        # the bare UUID -- confirm this matches the signature of the
        # `_get_output_disk` that is in scope here.
        return _get_output_disk(full_path, serialization=serialization)
    except FileNotFoundError as e:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        raise DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it."
        ) from e
def _get_output_disk(step_uuid: str, serialization: str) -> Any:
    """Gets data from disk.

    Args:
        step_uuid: The UUID of the step to get output data from.
        serialization: The serialization for the output. For possible
            values see :class:`Serialization`.

    Returns:
        Data from the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found.

    """
    # The data file carries the step's UUID as its name and lives inside
    # the step's data directory.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    full_path = os.path.join(step_data_dir, step_uuid)

    try:
        return _deserialize_output_disk(full_path, serialization=serialization)
    except FileNotFoundError as e:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        raise DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it."
        ) from e
def _resolve_disk(step_uuid: str) -> Dict[str, Any]:
    """Returns information of the most recent write to disk.

    Resolves via the HEAD file the timestamp (that is used to determine
    the most recent write) and arguments to call the
    :meth:`_get_output_disk` method.

    Args:
        step_uuid: The UUID of the step to resolve its most recent write
            to disk.

    Returns:
        Dictionary containing the information of the function to be
        called to get the most recent data from the step. Additionally,
        returns fill-in arguments for the function and metadata related
        to the data that would be retrieved. The keys are
        ``"method_to_call"``, ``"method_args"``, ``"method_kwargs"`` and
        ``"metadata"``.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found.

    """
    step_data_dir = Config.get_step_data_dir(step_uuid)
    head_file = os.path.join(step_data_dir, "HEAD")

    try:
        with open(head_file, "r") as f:
            timestamp, serialization, name = _interpret_metadata(f.read())
    except FileNotFoundError as e:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        raise error.DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it."
        ) from e

    # The function is returned instead of being called, so the data is
    # only read from disk once the caller decides to invoke it.
    return {
        "method_to_call": _get_output_disk,
        "method_args": (step_uuid,),
        "method_kwargs": {"serialization": serialization},
        "metadata": {
            "timestamp": timestamp,
            "serialization": serialization,
            "name": name,
        },
    }
def resolve_disk(step_uuid: str) -> Dict[str, Any]:
    """Returns information of the most recent write to disk.

    Resolves via the HEAD file the timestamp (that is used to determine
    the most recent write) and arguments to call the
    :meth:`get_output_disk` method.

    Args:
        step_uuid: The UUID of the step to resolve its most recent write
            to disk.

    Returns:
        Dictionary containing the information of the function to be
        called to get the most recent data from the step. Additionally,
        returns fill-in arguments for the function. The keys are
        ``'timestamp'``, ``'method_to_call'``, ``'method_args'`` and
        ``'method_kwargs'``.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found.

    """
    step_data_dir = Config.get_step_data_dir(step_uuid)
    head_file = os.path.join(step_data_dir, 'HEAD')

    try:
        with open(head_file, 'r') as f:
            # The HEAD file is expected to hold exactly two fields
            # separated by a comma and a space.
            timestamp, serialization = f.read().split(', ')
    except FileNotFoundError as e:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        raise DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            'Try rerunning it.'
        ) from e

    # The function is returned instead of being called, so the data is
    # only read from disk once the caller decides to invoke it.
    return {
        'timestamp': timestamp,
        'method_to_call': get_output_disk,
        'method_args': (step_uuid,),
        'method_kwargs': {'serialization': serialization},
    }
def output_to_disk(data: Any,
                   pickle_fallback: bool = True,
                   serialization: Optional[str] = None) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization.

    Args:
        data: Data to output to disk.
        pickle_fallback: This option is passed to :meth:`serialize`. If
            ``pyarrow`` cannot serialize the data, then it will fall
            back to using ``pickle``. This is helpful for custom data
            types.
        serialization: Serialization of the `data` in case it is already
            serialized. Currently supported values are:
            ``['arrow', 'arrowpickle']``.

    Raises:
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = 'Data I would like to use in my next step'
        >>> output_to_disk(data)

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output. Generally speaking you
        therefore want to be only calling the function once.

    """
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError as e:
        # Re-raised with a message specific to outputting data; the
        # original error is kept as the explicit cause.
        raise StepUUIDResolveError(
            "Failed to determine where to output data to."
        ) from e

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = serialize(data, pickle_fallback=pickle_fallback)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # The HEAD file serves to resolve the transfer method.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        current_time = datetime.utcnow()
        f.write(
            f'{current_time.isoformat(timespec="seconds")}, {serialization}')

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)
def output_to_disk(data: Any,
                   name: Optional[str],
                   serialization: Optional[Serialization] = None) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization
      and the name of the output data.

    Args:
        data: Data to output to disk.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        serialization: Serialization of the `data` in case it is already
            serialized. For possible values see :class:`Serialization`.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g because it is a reserved name (``"unnamed"``) or because
            it contains a reserved substring.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_disk(data, name="my_data")

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    # Nameless data is stored under the reserved placeholder name.
    if name is None:
        name = Config._RESERVED_UNNAMED_OUTPUTS_STR

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}."
        ) from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        # Re-raised with a message specific to outputting data; the
        # original error is kept as the explicit cause.
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to."
        ) from e

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = _serialize(data)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # The HEAD file serves to resolve the transfer method.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        # Field order: timestamp, serialization name, output name.
        metadata_fields = [
            datetime.utcnow().isoformat(timespec="seconds"),
            serialization.name,
            name,
        ]
        f.write(Config.__METADATA_SEPARATOR__.join(metadata_fields))

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)