コード例 #1
0
def output(
    data: Any,
    name: Optional[str],
) -> None:
    """Outputs data so that it can be retrieved by the next step.

    The ``name`` is validated first, after which the data is handed to
    :func:`output_to_disk`.

    Note:
        Calling :func:`output` multiple times within the same step
        will overwrite the output, even when using a different output
        ``name``. You therefore want to be only calling the function
        once.

    Args:
        data: Data to output.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g. because it is a reserved name (``"unnamed"``) or
            because it contains a reserved substring.
        PipelineDefinitionNotFoundError: Propagated from
            :func:`output_to_disk` if the pipeline definition file
            could not be found.
        StepUUIDResolveError: Propagated from :func:`output_to_disk` if
            the step's UUID cannot be resolved and thus data cannot be
            outputted.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output(data, name="my_data")
    """
    # Fail fast on an invalid name before any work is delegated.
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        # Chain explicitly so the original validation error is kept as
        # the ``__cause__`` of the domain error.
        raise error.DataInvalidNameError(e) from e

    # NOTE(review): the docstring used to list OrchestNetworkError
    # (failure to connect to the store). This function only delegates
    # to output_to_disk, which does not document that error -- confirm
    # that nothing further down can still raise it.
    return output_to_disk(
        data,
        name,
    )
コード例 #2
0
ファイル: transfer.py プロジェクト: swipswaps/orchest
def output_to_memory(data: Any,
                     name: Optional[str],
                     disk_fallback: bool = True) -> None:
    """Outputs data to memory.

    To manage outputting the data to memory for the user, this function
    uses metadata to add info to objects inside the plasma store.

    Args:
        data: Data to output.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        disk_fallback: If True, then outputting to disk (through
            :func:`output_to_disk`) is used when the store cannot be
            connected to or when the `data` does not fit in memory. If
            False, then the respective :exc:`OrchestNetworkError` or
            :exc:`MemoryError` is raised instead.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g. because it is a reserved name (``"unnamed"``) or
            because it contains a reserved substring.
        MemoryError: If the `data` does not fit in memory and
            ``disk_fallback=False``.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist,
            and ``disk_fallback=False``. Which might be because the
            specified value was wrong or the store died.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot set the correct ID to identify the data in
            the memory store.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_memory(data, name="my_data")

    Note:
        Calling :func:`output_to_memory` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}.") from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        # Re-raised with a message specific to outputting; the original
        # error stays attached as the cause.
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to.") from e

    # Serialize the object and collect the serialization metadata.
    obj, serialization = _serialize(data)

    try:
        client = _PlasmaConnector().client
    except error.OrchestNetworkError as e:
        if not disk_fallback:
            raise error.OrchestNetworkError(e) from e

        # The data is already serialized, so pass the serialization
        # along for output_to_disk to skip serializing it again.
        return output_to_disk(obj, name, serialization=serialization)

    # Try to output to memory.
    obj_id = _convert_uuid_to_object_id(step_uuid)
    metadata_fields = [
        str(Config.IDENTIFIER_SERIALIZATION),
        # The plasma store allows to get the creation timestamp, but
        # creating it this way makes the process more consistent with
        # the metadata we are writing when outputting to disk, moreover,
        # it makes the code less dependent on the plasma store API.
        datetime.utcnow().isoformat(timespec="seconds"),
        serialization.name,
        # Can't simply assign to name beforehand because name might be
        # passed to output_to_disk, which needs to check for name
        # validity itself since it's a public function.
        name if name is not None else Config._RESERVED_UNNAMED_OUTPUTS_STR,
    ]
    metadata = Config.__METADATA_SEPARATOR__.join(metadata_fields).encode(
        "utf-8")

    try:
        # The return value is not needed here, so it is not bound.
        _output_to_memory(obj, client, obj_id=obj_id, metadata=metadata)
    except MemoryError as e:
        if not disk_fallback:
            raise MemoryError("Data does not fit in memory.") from e

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, name, serialization=serialization)
コード例 #3
0
ファイル: transfer.py プロジェクト: swipswaps/orchest
def output_to_disk(data: Any,
                   name: Optional[str],
                   serialization: Optional[Serialization] = None) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization.

    Args:
        data: Data to output to disk.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        serialization: Serialization of the `data` in case it is already
            serialized. For possible values see :class:`Serialization`.
            When ``None``, the `data` is serialized by this function.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g. because it is a reserved name (``"unnamed"``) or
            because it contains a reserved substring.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_disk(data, name="my_data")

    Note:
        Calling :func:`output_to_disk` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    # Nameless data is stored under the reserved placeholder name.
    if name is None:
        name = Config._RESERVED_UNNAMED_OUTPUTS_STR

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}.") from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        # Re-raised with a message specific to outputting; the original
        # error stays attached as the cause.
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to.") from e

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = _serialize(data)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # Assemble the HEAD content before opening the file: opening with
    # "w" truncates, so a failure while building the content must not
    # leave an empty HEAD file behind.
    head_content = Config.__METADATA_SEPARATOR__.join([
        datetime.utcnow().isoformat(timespec="seconds"),
        serialization.name,
        name,
    ])

    # The HEAD file serves to resolve the transfer method. Note that it
    # is written before the data file itself.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        f.write(head_content)

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)