def output(
    data: Any,
    name: Optional[str],
) -> None:
    """Outputs data so that it can be retrieved by the next step.

    The ``name`` is validated first, after which the data is handed to
    :func:`output_to_disk`; this function itself never writes to the
    memory store.

    Note:
        Calling :meth:`output` multiple times within the same step
        will overwrite the output, even when using a different output
        ``name``. You therefore want to be only calling the function
        once.

    Args:
        data: Data to output.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g because it is a reserved name (``"unnamed"``) or because
            it contains a reserved substring.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found (raised by :func:`output_to_disk`).
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus data cannot be outputted.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output(data, name="my_data")

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        # Chain explicitly so the underlying validation error is kept
        # as the direct cause of the user-facing exception.
        raise error.DataInvalidNameError(e) from e

    return output_to_disk(data, name)
def output_to_memory(
    data: Any, name: Optional[str], disk_fallback: bool = True
) -> None:
    """Outputs data to memory.

    To manage outputting the data to memory for the user, this function
    uses metadata to add info to objects inside the plasma store.

    Args:
        data: Data to output.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        disk_fallback: If True, then outputting to disk is used when the
            `data` does not fit in memory or when no connection to the
            memory store can be made. If False, then the corresponding
            :exc:`MemoryError` or ``OrchestNetworkError`` is raised
            instead.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g because it is a reserved name (``"unnamed"``) or because
            it contains a reserved substring.
        MemoryError: If the `data` does not fit in memory and
            ``disk_fallback=False``.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist.
            Which might be because the specified value was wrong or the
            store died. Only raised when ``disk_fallback=False``.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot set the correct ID to identify the data in
            the memory store.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_memory(data, name="my_data")

    Note:
        Calling :meth:`output_to_memory` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}."
        ) from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to."
        ) from e

    # Serialize the object and collect the serialization metadata.
    obj, serialization = _serialize(data)

    try:
        client = _PlasmaConnector().client
    except error.OrchestNetworkError as e:
        if not disk_fallback:
            raise error.OrchestNetworkError(e) from e

        # The store is unreachable; pass the already serialized object
        # on so that it is not serialized a second time.
        return output_to_disk(obj, name, serialization=serialization)

    # Try to output to memory.
    obj_id = _convert_uuid_to_object_id(step_uuid)
    metadata = [
        str(Config.IDENTIFIER_SERIALIZATION),
        # The plasma store allows to get the creation timestamp, but
        # creating it this way makes the process more consistent with
        # the metadata we are writing when outputting to disk, moreover,
        # it makes the code less dependent on the plasma store API.
        datetime.utcnow().isoformat(timespec="seconds"),
        serialization.name,
        # Can't simply assign to name beforehand because name might be
        # passed to output_to_disk, which needs to check for name
        # validity itself since its a public function.
        name if name is not None else Config._RESERVED_UNNAMED_OUTPUTS_STR,
    ]
    encoded_metadata = bytes(
        Config.__METADATA_SEPARATOR__.join(metadata), "utf-8"
    )

    try:
        # The returned object ID is not needed here, so it is discarded.
        _output_to_memory(obj, client, obj_id=obj_id, metadata=encoded_metadata)
    except MemoryError as e:
        if not disk_fallback:
            raise MemoryError("Data does not fit in memory.") from e

        # TODO: note that metadata is lost when falling back to disk.
        # Therefore we will only support metadata added by the
        # user, once disk also supports passing metadata.
        return output_to_disk(obj, name, serialization=serialization)
def output_to_disk(
    data: Any,
    name: Optional[str],
    serialization: Optional[Serialization] = None,
) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization.

    Args:
        data: Data to output to disk.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        serialization: Serialization of the `data` in case it is already
            serialized. For possible values see :class:`Serialization`.
            When ``None``, the `data` is serialized by this function.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g because it is a reserved name (``"unnamed"``) or because
            it contains a reserved substring.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_disk(data, name="my_data")

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    # Nameless data is stored under the reserved placeholder name.
    if name is None:
        name = Config._RESERVED_UNNAMED_OUTPUTS_STR

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}."
        ) from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to."
        ) from e

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = _serialize(data)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # Build the HEAD content before opening the file, so that the
    # existing HEAD is only truncated once there is something to write.
    head_content = Config.__METADATA_SEPARATOR__.join(
        [
            datetime.utcnow().isoformat(timespec="seconds"),
            serialization.name,
            name,
        ]
    )

    # The HEAD file serves to resolve the transfer method.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        f.write(head_content)

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)