def output_to_memory(
    data: Any, name: Optional[str], disk_fallback: bool = True
) -> None:
    """Outputs data to memory.

    To manage outputting the data to memory for the user, this function
    uses metadata to add info to objects inside the plasma store.

    Args:
        data: Data to output.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        disk_fallback: If True, then outputting to disk is used when the
            `data` does not fit in memory or when the connection to the
            store cannot be established. If False, then the
            corresponding :exc:`MemoryError` or
            :exc:`OrchestNetworkError` is raised instead.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g. because it is a reserved name (``"unnamed"``) or
            because it contains a reserved substring.
        MemoryError: If the `data` does not fit in memory and
            ``disk_fallback=False``.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist.
            Which might be because the specified value was wrong or the
            store died. Only raised when ``disk_fallback=False``.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot set the correct ID to identify the data in
            the memory store.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_memory(data, name="my_data")

    Note:
        Calling :meth:`output_to_memory` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    # JSON is read as UTF-8 explicitly so that the result does not
    # depend on the locale of the environment the step runs in.
    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r", encoding="utf-8") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}."
        ) from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to."
        ) from e

    # Serialize the object and collect the serialization metadata.
    obj, serialization = _serialize(data)

    try:
        client = _PlasmaConnector().client
    except error.OrchestNetworkError as e:
        if not disk_fallback:
            raise error.OrchestNetworkError(e) from e

        # The data is already serialized, so the serialization is passed
        # along to prevent it from being serialized a second time.
        return output_to_disk(obj, name, serialization=serialization)

    # Try to output to memory.
    obj_id = _convert_uuid_to_object_id(step_uuid)
    metadata_fields = [
        str(Config.IDENTIFIER_SERIALIZATION),
        # The plasma store allows to get the creation timestamp, but
        # creating it this way makes the process more consistent with
        # the metadata we are writing when outputting to disk, moreover,
        # it makes the code less dependent on the plasma store API.
        # NOTE(review): `datetime.utcnow` is deprecated since Python
        # 3.12; it is kept because an aware datetime would change the
        # timestamp format that is stored in the metadata.
        datetime.utcnow().isoformat(timespec="seconds"),
        serialization.name,
        # Can't simply assign to name beforehand because name might be
        # passed to output_to_disk, which needs to check for name
        # validity itself since its a public function.
        name if name is not None else Config._RESERVED_UNNAMED_OUTPUTS_STR,
    ]
    metadata = bytes(Config.__METADATA_SEPARATOR__.join(metadata_fields), "utf-8")

    try:
        _output_to_memory(obj, client, obj_id=obj_id, metadata=metadata)
    except MemoryError as e:
        if not disk_fallback:
            raise MemoryError("Data does not fit in memory.") from e

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, name, serialization=serialization)
def get_inputs(ignore_failure: bool = False, verbose: bool = False) -> Dict[str, Any]:
    """Gets all data sent from incoming steps.

    Args:
        ignore_failure: If True then the returned result can have
            ``None`` values if the data of a step could not be
            retrieved. If False, then this function will fail if any of
            the incoming steps's data could not be retrieved. Example:
            ``[None, "Hello World!"]`` vs :exc:`OutputNotFoundError`.
            Note that this only covers retrieving the data itself: if
            the output of a parent step cannot be resolved in the first
            place, then :exc:`OutputNotFoundError` is raised regardless
            of this flag.
        verbose: If True print all the steps from which the current step
            has retrieved data.

    Returns:
        Dictionary with input data for this step. We differentiate
        between two cases:

        * Named data, which is data that was outputted with a `name` by
          any parent step. Named data can be retrieved through the
          dictionary by its name, e.g.
          ``data = get_inputs()["my_name"]``. Name collisions will
          raise an :exc:`InputNameCollisionError`.
        * Unnamed data, which is an ordered list containing all the
          data that was outputted without a name by the parent steps.
          Unnamed data can be retrieved by accessing the reserved
          ``"unnamed"`` key. The order of this list depends on the order
          of the parent steps of the node, which is visible through the
          GUI.

        Example::

            # It does not matter how the data was output by parent
            # steps. It is resolved automatically by the get_inputs
            # method.
            {
                "unnamed" : ["Hello World!", (3, 4)],
                "named_1" : "mystring",
                "named_2" : [1, 2, 3]
            }

    Raises:
        InputNameCollisionError: Multiple steps have outputted data with
            the same name.
        OutputNotFoundError: If no output can be found of the given
            `step_uuid`. Either no output was generated or the in-memory
            object store died (and therefore lost all its data).
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine what inputs to get.

    Warning:
        Only call :meth:`get_inputs` once! When auto eviction is
        configured data might no longer be available. Either cache the
        data or maintain a copy yourself.

    """
    # JSON is read as UTF-8 explicitly so that the result does not
    # depend on the locale of the environment the step runs in.
    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r", encoding="utf-8") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}."
        ) from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        raise error.StepUUIDResolveError(
            "Failed to determine from where to get data."
        ) from e

    # Maps a data name to the titles of all parent steps that output
    # data under that name.
    producers_by_name = defaultdict(list)
    get_output_methods = []

    # Check for collisions before retrieving any data.
    for parent in pipeline.get_step_by_uuid(step_uuid).parents:
        # For each parent get what function to use to retrieve its
        # output data and metadata related to said data.
        parent_uuid = parent.properties["uuid"]
        try:
            get_output_method, args, kwargs, metadata = _resolve(
                parent_uuid, consumer=step_uuid
            )
        except error.OutputNotFoundError as e:
            parent_title = parent.properties["title"]
            msg = (
                f'Output from incoming step "{parent_title}" '
                f'("{parent_uuid}") cannot be found. Try rerunning it.'
            )
            raise error.OutputNotFoundError(msg) from e

        # Maintain the output methods in order, but wait with calling
        # them so that we can first check for collisions.
        get_output_methods.append((parent, get_output_method, args, kwargs, metadata))

        if metadata["name"] != Config._RESERVED_UNNAMED_OUTPUTS_STR:
            producers_by_name[metadata["name"]].append(parent.properties["title"])

    # If there are collisions raise an error.
    collisions = {
        name: titles for name, titles in producers_by_name.items() if len(titles) > 1
    }
    if collisions:
        msg = "".join(
            f"\n{name}: {sorted(step_names)}"
            for name, step_names in collisions.items()
        )
        raise error.InputNameCollisionError(
            f"Name collisions between input data coming from different steps: {msg}"
        )

    # TODO: maybe instead of for loop we could first get the receive
    #       method and then do batch receive. For example memory allows
    #       to do get_buffers which operates in batch.
    # NOTE: the order in which the `parents` list is traversed is
    # indirectly set in the UI. The order is important since it
    # determines the order in which unnamed inputs are received in
    # the next step.
    data: Dict[str, Any] = {Config._RESERVED_UNNAMED_OUTPUTS_STR: []}
    for parent, get_output_method, args, kwargs, metadata in get_output_methods:
        # Either raise an error on failure of getting output or
        # continue with other steps.
        try:
            incoming_step_data = get_output_method(*args, **kwargs)
        except error.OutputNotFoundError as e:
            if not ignore_failure:
                raise error.OutputNotFoundError(e) from e

            incoming_step_data = None
            retrieved = False
        else:
            retrieved = True

        if verbose:
            parent_title = parent.properties["title"]
            # Success is tracked explicitly instead of checking the data
            # for ``None``, because ``None`` can be legitimate data that
            # was output by the parent step.
            if retrieved:
                print(f'Retrieved input from step: "{parent_title}"')
            else:
                print(f'Failed to retrieve input from step: "{parent_title}"')

        # Populate the return dictionary, where nameless data gets
        # appended to a list and named data becomes a (name, data) pair.
        name = metadata["name"]
        if name == Config._RESERVED_UNNAMED_OUTPUTS_STR:
            data[Config._RESERVED_UNNAMED_OUTPUTS_STR].append(incoming_step_data)
        else:
            data[name] = incoming_step_data

    return data
def output_to_disk(
    data: Any, name: Optional[str], serialization: Optional[Serialization] = None
) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization.
      The HEAD file is only written after writing the data itself did
      not raise, so that it never describes a write that failed.

    Args:
        data: Data to output to disk.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        serialization: Serialization of the `data` in case it is already
            serialized. For possible values see :class:`Serialization`.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g. because it is a reserved name (``"unnamed"``) or
            because it contains a reserved substring.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_disk(data, name="my_data")

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    if name is None:
        name = Config._RESERVED_UNNAMED_OUTPUTS_STR

    # JSON is read as UTF-8 explicitly so that the result does not
    # depend on the locale of the environment the step runs in.
    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r", encoding="utf-8") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}."
        ) from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to."
        ) from e

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = _serialize(data)

    # Recursively create any directories if they do not already exists.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # Write the actual data first. If this raises, then the HEAD file is
    # left untouched instead of advertising a write (with possibly a
    # different serialization or name) that never completed.
    full_path = os.path.join(step_data_dir, step_uuid)
    result = _output_to_disk(data, full_path, serialization=serialization)

    # The HEAD file serves to resolve the transfer method.
    # NOTE(review): the HEAD file is written using the platform default
    # encoding, which is kept as is because whatever reads the file has
    # to use the same encoding -- confirm against the reader before
    # changing it.
    head_file = os.path.join(step_data_dir, "HEAD")
    metadata = Config.__METADATA_SEPARATOR__.join(
        [
            # NOTE(review): `datetime.utcnow` is deprecated since Python
            # 3.12; it is kept because an aware datetime would change
            # the timestamp format that is stored in the HEAD file.
            datetime.utcnow().isoformat(timespec="seconds"),
            serialization.name,
            name,
        ]
    )
    with open(head_file, "w") as f:
        f.write(metadata)

    return result