Example 1
def update_params(params: Dict[str, Any]) -> Dict[str, Any]:
    """Updates the parameters of the current step.

    Additionally, you can set new parameters by passing keys that do
    not yet exist in the current parameters of the pipeline step.

    Internally the update is done by calling ``dict.update``, which
    defines the exact merge behavior of this method.

    Args:
        params: The parameters to update. Values of existing keys are
            updated and new keys are added.

    Returns:
        The updated parameters mapping.

    """
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)
    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError(
            "Failed to determine for which step to update parameters.")

    # TODO: This is inefficient, we could just use the `step_uuid` and
    #       update the params of the `pipeline_description` and write it
    #       back to the `pipeline.json`. However, I think it is good
    #       practice to use our own defined classes to do so.
    step = pipeline.get_step_by_uuid(step_uuid)
    curr_params = step.get_params()
    curr_params.update(params)

    with open(Config.PIPELINE_DESCRIPTION_PATH, "w") as f:
        json.dump(pipeline.to_dict(), f)

    return curr_params
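
A minimal usage sketch for ``update_params``. The import path below is an assumption about how the SDK exposes the function:

# Hedged usage sketch; the import path is an assumption.
from orchest.parameters import update_params

# Values of existing keys are overwritten, new keys are added.
merged = update_params({"learning_rate": 0.01, "n_epochs": 10})
print(merged)  # e.g. {"learning_rate": 0.01, "n_epochs": 10, ...}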
Example 2
def _get_current_step(pipeline: Pipeline) -> PipelineStep:
    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Parameters could not be identified.")
    return pipeline.get_step_by_uuid(step_uuid)
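
A sketch of how this private helper could be invoked, reusing the pipeline-loading pattern from the other examples (illustrative only):

# Hedged sketch: build the Pipeline as the other examples do, then
# resolve the step this code is running in.
with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
    pipeline = Pipeline.from_json(json.load(f))

step = _get_current_step(pipeline)
print(step.get_params())  # parameters of the current step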
Example 3
def get_inputs(ignore_failure: bool = False,
               verbose: bool = False) -> List[Any]:
    """Gets all data sent from incoming steps.

    Args:
        ignore_failure: If True, then the returned result can have
            ``None`` values if the data of a step could not be
            retrieved. If False, then this function will fail if any of
            the incoming steps' data could not be retrieved. Example:
            ``[None, 'Hello World!']`` vs :exc:`OutputNotFoundError`
        verbose: If True, print all the steps from which the current step
            has retrieved data.

    Returns:
        List of all the data in the specified order from the front-end.

    Raises:
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine what inputs to get.

    Example:
        >>> # It does not matter how the data was output in steps 1 and 2.
        >>> # It is resolved automatically by the get_inputs method.
        >>> data_step_1, data_step_2 = get_inputs()

    Warning:
        Only call :meth:`get_inputs` once! When auto eviction is
        configured data might no longer be available. Either cache the
        data or maintain a copy yourself.

    """
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)
    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError(
            "Failed to determine from where to get data.")

    # TODO: maybe instead of for loop we could first get the receive
    #       method and then do batch receive. For example memory allows
    #       to do get_buffers which operates in batch.
    # NOTE: the order in which the `parents` list is traversed is
    # indirectly set in the UI. The order is important since it
    # determines the order in which the inputs are received in the next
    # step.
    data = []
    for parent in pipeline.get_step_by_uuid(step_uuid).parents:
        parent_uuid = parent.properties["uuid"]
        get_output_method, args, kwargs = resolve(parent_uuid,
                                                  consumer=step_uuid)

        # Either raise an error on failure of getting output or
        # continue with other steps.
        try:
            incoming_step_data = get_output_method(*args, **kwargs)
        except OutputNotFoundError as e:
            if not ignore_failure:
                raise OutputNotFoundError(e)

            incoming_step_data = None

        if verbose:
            parent_title = parent.properties["title"]
            if incoming_step_data is None:
                print(f'Failed to retrieve input from step: "{parent_title}"')
            else:
                print(f'Retrieved input from step: "{parent_title}"')

        data.append(incoming_step_data)

    return data
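
A usage sketch of the tolerant mode described in the docstring; with ``ignore_failure=True`` a ``None`` placeholder stands in for any parent whose output could not be retrieved:

# Hedged usage sketch; list positions follow the parent order set in
# the UI, as noted above.
inputs = get_inputs(ignore_failure=True, verbose=True)
for position, item in enumerate(inputs):
    if item is None:
        print(f"Parent at position {position} had no retrievable output.")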
Example 4
def output_to_memory(data: Any,
                     pickle_fallback: bool = True,
                     disk_fallback: bool = True) -> None:
    """Outputs data to memory.

    To manage outputting the data to memory for the user, this function
    uses metadata to add info to objects inside the plasma store.

    Args:
        data: Data to output.
        pickle_fallback: This option is passed to :meth:`serialize`. If
            ``pyarrow`` cannot serialize the data, then it will fall
            back to using ``pickle``. This is helpful for custom data
            types.
        disk_fallback: If True, then outputting to disk is used when the
            `data` does not fit in memory. If False, then a
            :exc:`MemoryError` is thrown.

    Raises:
        MemoryError: If the `data` does not fit in memory and
            ``disk_fallback=False``.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist,
            which might be because the specified value was wrong or the
            died.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot set the correct ID to identify the data in
            the memory store.

    Example:
        >>> data = 'Data I would like to use in my next step'
        >>> output_to_memory(data)

    Note:
        Calling :meth:`output_to_memory` multiple times within the same
        script will overwrite the output. You therefore generally want
        to call the function only once.

    """
    # TODO: we might want to wrap this so we can throw a custom error,
    #       if the file cannot be found, i.e. FileNotFoundError.
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError(
            "Failed to determine where to output data to.")

    # Serialize the object and collect the serialization metadata.
    obj, serialization = serialize(data, pickle_fallback=pickle_fallback)

    try:
        client = _PlasmaConnector().client
    except OrchestNetworkError as e:
        if not disk_fallback:
            raise OrchestNetworkError(e)

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, serialization=serialization)

    # Try to output to memory.
    obj_id = _convert_uuid_to_object_id(step_uuid)
    metadata = bytes(f"{Config.IDENTIFIER_SERIALIZATION};{serialization}",
                     "utf-8")

    try:
        obj_id = _output_to_memory(obj,
                                   client,
                                   obj_id=obj_id,
                                   metadata=metadata)

    except MemoryError:
        if not disk_fallback:
            raise MemoryError("Data does not fit in memory.")

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, serialization=serialization)

    return
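
A usage sketch of the two fallbacks: ``pickle_fallback`` covers objects ``pyarrow`` cannot serialize and ``disk_fallback`` covers data that does not fit in the in-memory store (both per the docstring above). The payload type is illustrative:

# Hedged usage sketch with an illustrative custom type.
class ModelArtifact:
    def __init__(self, weights):
        self.weights = weights

# Falls back to pickle for the custom type and to disk if the plasma
# store is unreachable or full.
output_to_memory(ModelArtifact([0.1, 0.2]),
                 pickle_fallback=True,
                 disk_fallback=True)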
Example 5
def output_to_disk(data: Any,
                   pickle_fallback: bool = True,
                   serialization: Optional[str] = None) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization.

    Args:
        data: Data to output to disk.
        pickle_fallback: This option is passed to :meth:`serialize`. If
            ``pyarrow`` cannot serialize the data, then it will fall
            back to using ``pickle``. This is helpful for custom data
            types.
        serialization: Serialization of the `data` in case it is already
            serialized. Currently supported values are:
            ``['arrow', 'arrowpickle']``.

    Raises:
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = 'Data I would like to use in my next step'
        >>> output_to_disk(data)

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output. You therefore generally want
        to call the function only once.

    """
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError(
            "Failed to determine where to output data to.")

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = serialize(data, pickle_fallback=pickle_fallback)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # The HEAD file serves to resolve the transfer method.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        current_time = datetime.utcnow()
        f.write(
            f'{current_time.isoformat(timespec="seconds")}, {serialization}')

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)
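
A sketch of the pre-serialized path: when ``serialization`` is given, the function skips its own ``serialize`` call and writes the data as-is (supported values per the docstring are ``'arrow'`` and ``'arrowpickle'``):

# Hedged sketch; `my_data` is a placeholder for any step output.
my_data = {"rows": [1, 2, 3]}
obj, serialization = serialize(my_data, pickle_fallback=True)
output_to_disk(obj, serialization=serialization)  # no re-serialization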
Example 6
def get_inputs(ignore_failure: bool = False,
               verbose: bool = False) -> Dict[str, Any]:
    """Gets all data sent from incoming steps.

    Args:
        ignore_failure: If True, then the returned result can have
            ``None`` values if the data of a step could not be
            retrieved. If False, then this function will fail if any of
            the incoming steps' data could not be retrieved. Example:
            ``[None, "Hello World!"]`` vs :exc:`OutputNotFoundError`
        verbose: If True, print all the steps from which the current step
            has retrieved data.

    Returns:
        Dictionary with input data for this step. We differentiate
        between two cases:

        * Named data, which is data that was outputted with a `name` by
          any parent step. Named data can be retrieved through the
          dictionary by its name, e.g.
          ``data = get_inputs()["my_name"]``.  Name collisions will
          raise an :exc:`InputNameCollisionError`.
        * Unnamed data, which is an ordered list containing all the
          data that was outputted without a name by the parent steps.
          Unnamed data can be retrieved by accessing the reserved
          ``"unnamed"`` key. The order of this list depends on the order
          of the parent steps of the node, which is visible through the
          GUI.

        Example::

            # It does not matter how the data was output by parent
            # steps. It is resolved automatically by the get_inputs
            # method.
            {
                "unnamed" : ["Hello World!", (3, 4)],
                "named_1" : "mystring",
                "named_2" : [1, 2, 3]
            }

    Raises:
        InputNameCollisionError: Multiple steps have outputted data with
            the same name.
        OutputNotFoundError: If no output can be found of the given
            `step_uuid`. Either no output was generated or the in-memory
            object store died (and therefore lost all its data).
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine what inputs to get.

    Warning:
        Only call :meth:`get_inputs` once! When auto eviction is
        configured data might no longer be available. Either cache the
        data or maintain a copy yourself.

    """
    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}.")

    pipeline = Pipeline.from_json(pipeline_definition)
    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError:
        raise error.StepUUIDResolveError(
            "Failed to determine from where to get data.")

    collisions_dict = defaultdict(list)
    get_output_methods = []

    # Check for collisions before retrieving any data.
    for parent in pipeline.get_step_by_uuid(step_uuid).parents:

        # For each parent get what function to use to retrieve its
        # output data and metadata related to said data.
        parent_uuid = parent.properties["uuid"]

        try:
            get_output_method, args, kwargs, metadata = _resolve(
                parent_uuid, consumer=step_uuid)
        except error.OutputNotFoundError:
            parent_title = parent.properties["title"]
            msg = (f'Output from incoming step "{parent_title}" '
                   f'("{parent_uuid}") cannot be found. Try rerunning it.')
            raise error.OutputNotFoundError(msg)

        # Maintain the output methods in order, but wait with calling
        # them so that we can first check for collisions.
        get_output_methods.append(
            (parent, get_output_method, args, kwargs, metadata))

        if metadata["name"] != Config._RESERVED_UNNAMED_OUTPUTS_STR:
            collisions_dict[metadata["name"]].append(
                parent.properties["title"])

    # If there are collisions raise an error.
    collisions_dict = {k: v for k, v in collisions_dict.items() if len(v) > 1}
    if collisions_dict:
        msg = "".join([
            f"\n{name}: {sorted(step_names)}"
            for name, step_names in collisions_dict.items()
        ])
        raise error.InputNameCollisionError(
            f"Name collisions between input data coming from different steps: {msg}"
        )

    # TODO: maybe instead of for loop we could first get the receive
    #       method and then do batch receive. For example memory allows
    #       to do get_buffers which operates in batch.
    # NOTE: the order in which the `parents` list is traversed is
    # indirectly set in the UI. The order is important since it
    # determines the order in which unnamed inputs are received in
    # the next step.
    data = {Config._RESERVED_UNNAMED_OUTPUTS_STR: []}  # type: Dict[str, Any]
    for parent, get_output_method, args, kwargs, metadata in get_output_methods:

        # Either raise an error on failure of getting output or
        # continue with other steps.
        try:
            incoming_step_data = get_output_method(*args, **kwargs)
        except error.OutputNotFoundError as e:
            if not ignore_failure:
                raise error.OutputNotFoundError(e)

            incoming_step_data = None

        if verbose:
            parent_title = parent.properties["title"]
            if incoming_step_data is None:
                print(f'Failed to retrieve input from step: "{parent_title}"')
            else:
                print(f'Retrieved input from step: "{parent_title}"')

        # Populate the return dictionary, where nameless data gets
        # appended to a list and named data becomes a (name, data) pair.
        name = metadata["name"]
        if name == Config._RESERVED_UNNAMED_OUTPUTS_STR:
            data[Config._RESERVED_UNNAMED_OUTPUTS_STR].append(
                incoming_step_data)
        else:
            data[name] = incoming_step_data

    return data
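
A sketch of consuming both named and unnamed inputs, matching the example dictionary in the docstring:

# Hedged usage sketch; the names are set by the parent steps' outputs.
data = get_inputs()
named = data["named_1"]    # data a parent step output under a name
unnamed = data["unnamed"]  # ordered list of nameless outputs
first = unnamed[0] if unnamed else None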
Example 7
def output_to_memory(data: Any,
                     name: Optional[str],
                     disk_fallback: bool = True) -> None:
    """Outputs data to memory.

    To manage outputting the data to memory for the user, this function
    uses metadata to add info to objects inside the plasma store.

    Args:
        data: Data to output.
        name: Name of the output data. As a string, it becomes the name
            of the data; when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        disk_fallback: If True, then outputting to disk is used when the
            `data` does not fit in memory. If False, then a
            :exc:`MemoryError` is thrown.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g. because it is a reserved name (``"unnamed"``) or because
            it contains a reserved substring.
        MemoryError: If the `data` does not fit in memory and
            ``disk_fallback=False``.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist,
            which might be because the specified value was wrong or the
            store died.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot set the correct ID to identify the data in
            the memory store.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_memory(data, name="my_data")

    Note:
        Calling :meth:`output_to_memory` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to call the function only
        once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e)

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}.")

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError:
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to.")

    # Serialize the object and collect the serialization metadata.
    obj, serialization = _serialize(data)

    try:
        client = _PlasmaConnector().client
    except error.OrchestNetworkError as e:
        if not disk_fallback:
            raise error.OrchestNetworkError(e)

        return output_to_disk(obj, name, serialization=serialization)

    # Try to output to memory.
    obj_id = _convert_uuid_to_object_id(step_uuid)
    metadata = [
        str(Config.IDENTIFIER_SERIALIZATION),
        # The plasma store allows to get the creation timestamp, but
        # creating it this way makes the process more consistent with
        # the metadata we are writing when outputting to disk, moreover,
        # it makes the code less dependent on the plasma store API.
        datetime.utcnow().isoformat(timespec="seconds"),
        serialization.name,
        # Can't simply assign to name beforehand because name might be
        # passed to output_to_disk, which needs to check for name
        # validity itself since it's a public function.
        name if name is not None else Config._RESERVED_UNNAMED_OUTPUTS_STR,
    ]
    metadata = bytes(Config.__METADATA_SEPARATOR__.join(metadata), "utf-8")

    try:
        obj_id = _output_to_memory(obj,
                                   client,
                                   obj_id=obj_id,
                                   metadata=metadata)

    except MemoryError:
        if not disk_fallback:
            raise MemoryError("Data does not fit in memory.")

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, name, serialization=serialization)

    return
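
A usage sketch pairing a named output with its retrieval in a child step (retrieval per the :func:`get_inputs` docstring above):

# Hedged usage sketch.
output_to_memory([1, 2, 3], name="my_data")  # producing step
# In a child step:
# data = get_inputs()["my_data"]             # -> [1, 2, 3]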
Example 8
def output_to_disk(data: Any,
                   name: Optional[str],
                   serialization: Optional[Serialization] = None) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization.

    Args:
        data: Data to output to disk.
        name: Name of the output data. As a string, it becomes the name
            of the data; when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        serialization: Serialization of the `data` in case it is already
            serialized. For possible values see :class:`Serialization`.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g. because it is a reserved name (``"unnamed"``) or because
            it contains a reserved substring.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_disk(data, name="my_data")

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to call the function only
        once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e)

    if name is None:
        name = Config._RESERVED_UNNAMED_OUTPUTS_STR

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}.")

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError:
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to.")

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = _serialize(data)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # The HEAD file serves to resolve the transfer method.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        metadata = [
            datetime.utcnow().isoformat(timespec="seconds"),
            serialization.name,
            name,
        ]
        metadata = Config.__METADATA_SEPARATOR__.join(metadata)
        f.write(metadata)

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)
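
A sketch of a nameless disk output; with ``name=None`` the data appears in the reserved ``"unnamed"`` list on the consuming side (per the ``get_inputs`` docstring above):

# Hedged usage sketch.
output_to_disk({"accuracy": 0.93}, name=None)  # nameless output
# In a child step: get_inputs()["unnamed"] contains this dict.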