Example #1
0
def output_to_memory(data: Any,
                     name: Optional[str],
                     disk_fallback: bool = True) -> None:
    """Outputs data to memory.

    To manage outputting the data to memory for the user, this function
    uses metadata to add info to objects inside the plasma store.

    Args:
        data: Data to output.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        disk_fallback: If True, then outputting to disk is used when
            the `data` does not fit in memory or when no connection to
            the store could be made. If False, then respectively a
            :exc:`MemoryError` or an :exc:`OrchestNetworkError` is
            raised instead.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g because it is a reserved name (``"unnamed"``) or because
            it contains a reserved substring.
        MemoryError: If the `data` does not fit in memory and
            ``disk_fallback=False``.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist.
            Which might be because the specified value was wrong or the
            store died. Only raised when ``disk_fallback=False``.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot set the correct ID to identify the data in
            the memory store.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_memory(data, name="my_data")

    Note:
        Calling :meth:`output_to_memory` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    # Explicit ``from e`` chaining marks the low-level exception as the
    # direct cause of the domain error that is surfaced to the user.
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}.") from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to.") from e

    # Serialize the object and collect the serialization metadata.
    obj, serialization = _serialize(data)

    try:
        client = _PlasmaConnector().client
    except error.OrchestNetworkError as e:
        if not disk_fallback:
            raise error.OrchestNetworkError(e) from e

        # The store is unreachable: hand the already serialized object
        # to the disk implementation instead.
        return output_to_disk(obj, name, serialization=serialization)

    # Try to output to memory.
    obj_id = _convert_uuid_to_object_id(step_uuid)
    metadata_fields = [
        str(Config.IDENTIFIER_SERIALIZATION),
        # The plasma store allows to get the creation timestamp, but
        # creating it this way makes the process more consistent with
        # the metadata we are writing when outputting to disk, moreover,
        # it makes the code less dependent on the plasma store API.
        datetime.utcnow().isoformat(timespec="seconds"),
        serialization.name,
        # Can't simply assign to name beforehand because name might be
        # passed to output_to_disk, which needs to check for name
        # validity itself since its a public function.
        name if name is not None else Config._RESERVED_UNNAMED_OUTPUTS_STR,
    ]
    metadata = bytes(
        Config.__METADATA_SEPARATOR__.join(metadata_fields), "utf-8")

    try:
        # The value returned by the helper is not needed here.
        _output_to_memory(obj,
                          client,
                          obj_id=obj_id,
                          metadata=metadata)

    except MemoryError as e:
        if not disk_fallback:
            raise MemoryError("Data does not fit in memory.") from e

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, name, serialization=serialization)

    return
Example #2
0
def get_inputs(ignore_failure: bool = False,
               verbose: bool = False) -> Dict[str, Any]:
    """Gets all data sent from incoming steps.

    Args:
        ignore_failure: If True then the returned result can have
            ``None`` values if the data of a step could not be
            retrieved. If False, then this function will fail if any of
            the incoming steps' data could not be retrieved. Example:
            an ``"unnamed"`` list of ``[None, "Hello World!"]`` vs
            :exc:`OutputNotFoundError`.
        verbose: If True print all the steps from which the current step
            has retrieved data.

    Returns:
        Dictionary with input data for this step. We differentiate
        between two cases:

        * Named data, which is data that was outputted with a `name` by
          any parent step. Named data can be retrieved through the
          dictionary by its name, e.g.
          ``data = get_inputs()["my_name"]``.  Name collisions will
          raise an :exc:`InputNameCollisionError`.
        * Unnamed data, which is an ordered list containing all the
          data that was outputted without a name by the parent steps.
          Unnamed data can be retrieved by accessing the reserved
          ``"unnamed"`` key. The order of this list depends on the order
          of the parent steps of the node, which is visible through the
          GUI.

        Example::

            # It does not matter how the data was output by parent
            # steps. It is resolved automatically by the get_inputs
            # method.
            {
                "unnamed" : ["Hello World!", (3, 4)],
                "named_1" : "mystring",
                "named_2" : [1, 2, 3]
            }

    Raises:
        InputNameCollisionError: Multiple steps have outputted data with
            the same name.
        OutputNotFoundError: If the output of an incoming step cannot be
            found. Either no output was generated or the in-memory
            object store died (and therefore lost all its data).
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine what inputs to get.

    Note:
        ``ignore_failure`` only applies to the retrieval of the data
        itself. If the output of an incoming step cannot be resolved
        in the first place, an :exc:`OutputNotFoundError` is raised
        regardless of ``ignore_failure``.

    Warning:
        Only call :meth:`get_inputs` once! When auto eviction is
        configured data might no longer be available. Either cache the
        data or maintain a copy yourself.

    """
    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}.") from e

    pipeline = Pipeline.from_json(pipeline_definition)
    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        raise error.StepUUIDResolveError(
            "Failed to determine from where to get data.") from e

    # Maps every output name to the titles of the steps that use it.
    steps_per_name = defaultdict(list)
    get_output_methods = []

    # Check for collisions before retrieving any data.
    for parent in pipeline.get_step_by_uuid(step_uuid).parents:

        # For each parent get what function to use to retrieve its
        # output data and metadata related to said data.
        parent_uuid = parent.properties["uuid"]

        try:
            get_output_method, args, kwargs, metadata = _resolve(
                parent_uuid, consumer=step_uuid)
        except error.OutputNotFoundError as e:
            parent_title = parent.properties["title"]
            msg = (f'Output from incoming step "{parent_title}" '
                   f'("{parent_uuid}") cannot be found. Try rerunning it.')
            raise error.OutputNotFoundError(msg) from e

        # Maintain the output methods in order, but wait with calling
        # them so that we can first check for collisions.
        get_output_methods.append(
            (parent, get_output_method, args, kwargs, metadata))

        if metadata["name"] != Config._RESERVED_UNNAMED_OUTPUTS_STR:
            steps_per_name[metadata["name"]].append(
                parent.properties["title"])

    # If there are collisions raise an error.
    collisions = {
        name: step_names
        for name, step_names in steps_per_name.items()
        if len(step_names) > 1
    }
    if collisions:
        msg = "".join(
            f"\n{name}: {sorted(step_names)}"
            for name, step_names in collisions.items()
        )
        raise error.InputNameCollisionError(
            f"Name collisions between input data coming from different steps: {msg}"
        )

    # TODO: maybe instead of for loop we could first get the receive
    #       method and then do batch receive. For example memory allows
    #       to do get_buffers which operates in batch.
    # NOTE: the order in which the `parents` list is traversed is
    # indirectly set in the UI. The order is important since it
    # determines the order in which unnamed inputs are received in
    # the next step.
    data: Dict[str, Any] = {Config._RESERVED_UNNAMED_OUTPUTS_STR: []}
    for parent, get_output_method, args, kwargs, metadata in get_output_methods:

        # Either raise an error on failure of getting output or
        # continue with other steps.
        try:
            incoming_step_data = get_output_method(*args, **kwargs)
        except error.OutputNotFoundError as e:
            if not ignore_failure:
                raise error.OutputNotFoundError(e) from e

            incoming_step_data = None
            retrieved = False
        else:
            retrieved = True

        if verbose:
            parent_title = parent.properties["title"]
            # Report based on whether retrieval actually failed, so
            # that a step that legitimately outputted ``None`` is not
            # reported as a failure.
            if retrieved:
                print(f'Retrieved input from step: "{parent_title}"')
            else:
                print(f'Failed to retrieve input from step: "{parent_title}"')

        # Populate the return dictionary, where nameless data gets
        # appended to a list and named data becomes a (name, data) pair.
        name = metadata["name"]
        if name == Config._RESERVED_UNNAMED_OUTPUTS_STR:
            data[Config._RESERVED_UNNAMED_OUTPUTS_STR].append(
                incoming_step_data)
        else:
            data[name] = incoming_step_data

    return data
Example #3
0
def output_to_disk(data: Any,
                   name: Optional[str],
                   serialization: Optional[Serialization] = None) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization
      and the name of the data.

    Args:
        data: Data to output to disk.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        serialization: Serialization of the `data` in case it is already
            serialized. For possible values see :class:`Serialization`.
            When ``None``, the `data` is serialized by this function.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g because it is a reserved name (``"unnamed"``) or because
            it contains a reserved substring.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_disk(data, name="my_data")

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    # Explicit ``from e`` chaining marks the low-level exception as the
    # direct cause of the domain error that is surfaced to the user.
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    # Nameless data is stored under the reserved name.
    if name is None:
        name = Config._RESERVED_UNNAMED_OUTPUTS_STR

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}.") from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to.") from e

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = _serialize(data)

    # Build the HEAD content before touching the file system, so that
    # an error here (e.g. an invalid `serialization` object) does not
    # leave behind a truncated HEAD file.
    metadata_fields = [
        datetime.utcnow().isoformat(timespec="seconds"),
        serialization.name,
        name,
    ]
    metadata = Config.__METADATA_SEPARATOR__.join(metadata_fields)

    # Recursively create any directories if they do not already exists.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # The HEAD file serves to resolve the transfer method.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        f.write(metadata)

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)