def get_output_disk(step_uuid: str, serialization: str = "arrow") -> Any:
    """Gets data from disk.

    Public entry point that delegates to :func:`_get_output_disk`.

    Args:
        step_uuid: The UUID of the step to get output data from.
        serialization: The serialization of the output. Has to be
            specified in order to deserialize correctly.

    Returns:
        Data from the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found.
    """
    # `_get_output_disk` expects the step UUID (not a file path): it
    # resolves the step's data directory itself and converts a missing
    # file into a `DiskOutputNotFoundError`. Passing an already joined
    # path here would make it resolve the data directory of a path and
    # report that path, instead of the UUID, in the error message.
    return _get_output_disk(step_uuid, serialization=serialization)
def _get_output_disk(step_uuid: str, serialization: str) -> Any:
    """Reads and deserializes the output a step wrote to disk.

    The output is looked up at ``<step data dir>/<step_uuid>``, where
    the step data directory is obtained from
    ``Config.get_step_data_dir``.

    Args:
        step_uuid: The UUID of the step whose output should be read.
        serialization: The serialization that was used for the output.
            For possible values see :class:`Serialization`.

    Returns:
        The deserialized data of the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If the output file of `step_uuid` does
            not exist.
    """
    output_path = os.path.join(Config.get_step_data_dir(step_uuid), step_uuid)

    try:
        data = _deserialize_output_disk(output_path, serialization=serialization)
    except FileNotFoundError:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        message = (
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it."
        )
        raise DiskOutputNotFoundError(message)

    return data
def _resolve_disk(step_uuid: str) -> Dict[str, Any]:
    """Describes how to retrieve the most recent disk output of a step.

    Reads the ``HEAD`` file in the step's data directory. Its content
    is split on ``Config.__METADATA_SEPARATOR__`` into exactly three
    fields: timestamp, serialization and name.

    Args:
        step_uuid: The UUID of the step to resolve its most recent
            write to disk.

    Returns:
        Dictionary with the keys:

        * ``"method_to_call"``: the :func:`_get_output_disk` function.
        * ``"method_args"``: positional arguments for that function,
          i.e. ``(step_uuid,)``.
        * ``"method_kwargs"``: keyword arguments for that function,
          i.e. the serialization read from ``HEAD``.
        * ``"metadata"``: the timestamp, serialization and name read
          from ``HEAD``.

    Raises:
        DiskOutputNotFoundError: If the ``HEAD`` file of `step_uuid`
            does not exist.
    """
    head_path = os.path.join(Config.get_step_data_dir(step_uuid), "HEAD")

    try:
        with open(head_path, "r") as head:
            fields = head.read().split(Config.__METADATA_SEPARATOR__)
            # Unpacking enforces that HEAD holds exactly three fields.
            timestamp, serialization, name = fields
    except FileNotFoundError:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        message = (
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it."
        )
        raise DiskOutputNotFoundError(message)

    metadata = {
        "timestamp": timestamp,
        "serialization": serialization,
        "name": name,
    }
    return {
        "method_to_call": _get_output_disk,
        "method_args": (step_uuid,),
        "method_kwargs": {"serialization": serialization},
        "metadata": metadata,
    }
def resolve_disk(step_uuid: str) -> Dict[str, Any]:
    """Describes how to retrieve the most recent disk output of a step.

    Reads the ``HEAD`` file in the step's data directory. Its content
    is split on ``', '`` into exactly two fields: the timestamp of the
    write and the serialization that was used.

    Args:
        step_uuid: The UUID of the step to resolve its most recent
            write to disk.

    Returns:
        Dictionary with the keys:

        * ``'timestamp'``: the timestamp read from ``HEAD``.
        * ``'method_to_call'``: the :func:`get_output_disk` function.
        * ``'method_args'``: positional arguments for that function,
          i.e. ``(step_uuid,)``.
        * ``'method_kwargs'``: keyword arguments for that function,
          i.e. the serialization read from ``HEAD``.

    Raises:
        DiskOutputNotFoundError: If the ``HEAD`` file of `step_uuid`
            does not exist.
    """
    head_path = os.path.join(Config.get_step_data_dir(step_uuid), 'HEAD')

    try:
        with open(head_path, 'r') as head:
            fields = head.read().split(', ')
            # Unpacking enforces that HEAD holds exactly two fields.
            timestamp, serialization = fields
    except FileNotFoundError:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        message = (
            f'Output from incoming step "{step_uuid}" cannot be found. '
            'Try rerunning it.'
        )
        raise DiskOutputNotFoundError(message)

    call_kwargs = {'serialization': serialization}
    return {
        'timestamp': timestamp,
        'method_to_call': get_output_disk,
        'method_args': (step_uuid,),
        'method_kwargs': call_kwargs,
    }