コード例 #1
0
ファイル: transfer.py プロジェクト: swipswaps/orchest
def _get_output_disk(step_uuid: str, serialization: str) -> Any:
    """Gets data from disk.

    Builds the path of the step's output file (a file named after the
    step, inside the step's data directory) and deserializes it.

    Args:
        step_uuid: The UUID of the step to get output data from.
        serialization: The serialization for the output. For possible
            values see :class:`Serialization`.

    Returns:
        Data from the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found.
        DeserializationError: If the data could not be deserialized.
    """
    step_data_dir = Config.get_step_data_dir(step_uuid)
    full_path = os.path.join(step_data_dir, step_uuid)

    try:
        return _deserialize_output_disk(full_path, serialization=serialization)
    except FileNotFoundError as exc:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        # Chain the original error so the low-level cause stays visible
        # in the traceback.
        raise error.DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it.") from exc
    # OSError (IOError is merely an alias of it) is to try to catch
    # pyarrow failures on opening the file. FileNotFoundError is a
    # subclass of OSError, hence it has to be handled first (above).
    except (pickle.UnpicklingError, OSError) as exc:
        raise error.DeserializationError(
            f'Output from incoming step "{step_uuid}" ({full_path}) '
            "could not be deserialized.") from exc
コード例 #2
0
ファイル: transfer.py プロジェクト: fruttasecca/orchest
def get_output_disk(step_uuid: str, serialization: str = "arrow") -> Any:
    """Gets data from disk.

    Builds the path of the step's output file (a file named after the
    step, inside the step's data directory) and delegates the actual
    reading to :meth:`_get_output_disk`.

    Args:
        step_uuid: The UUID of the step to get output data from.
        serialization: The serialization of the output. Has to be
            specified in order to deserialize correctly.

    Returns:
        Data from the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be found.
    """
    step_data_dir = Config.get_step_data_dir(step_uuid)
    full_path = os.path.join(step_data_dir, step_uuid)

    try:
        return _get_output_disk(full_path, serialization=serialization)
    except FileNotFoundError as exc:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        # Chain the original error so the low-level cause stays visible
        # in the traceback.
        raise DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it.") from exc
コード例 #3
0
ファイル: transfer.py プロジェクト: VivanVatsa/orchest
def _get_output_disk(step_uuid: str, serialization: str) -> Any:
    """Gets data from disk.

    Builds the path of the step's output file (a file named after the
    step, inside the step's data directory) and deserializes it.

    Args:
        step_uuid: The UUID of the step to get output data from.
        serialization: The serialization for the output. For possible
            values see :class:`Serialization`.

    Returns:
        Data from the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be found.
    """
    step_data_dir = Config.get_step_data_dir(step_uuid)
    full_path = os.path.join(step_data_dir, step_uuid)

    try:
        return _deserialize_output_disk(full_path, serialization=serialization)
    except FileNotFoundError as exc:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        # Chain the original error so the low-level cause stays visible
        # in the traceback.
        raise DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it.") from exc
コード例 #4
0
ファイル: transfer.py プロジェクト: swipswaps/orchest
def _resolve_disk(step_uuid: str) -> Dict[str, Any]:
    """Returns information of the most recent write to disk.

    Resolves via the HEAD file the timestamp (that is used to determine
    the most recent write) and arguments to call the
    :meth:`_get_output_disk` method.

    Args:
        step_uuid: The UUID of the step to resolve its most recent write
            to disk.

    Returns:
        Dictionary containing the information of the function to be
        called to get the most recent data from the step. Additionally,
        returns fill-in arguments for the function and metadata related
        to the data that would be retrieved. Its keys are
        ``"method_to_call"``, ``"method_args"``, ``"method_kwargs"`` and
        ``"metadata"``.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found.
    """
    step_data_dir = Config.get_step_data_dir(step_uuid)
    head_file = os.path.join(step_data_dir, "HEAD")

    try:
        with open(head_file, "r") as f:
            timestamp, serialization, name = _interpret_metadata(f.read())

    except FileNotFoundError as exc:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        # Chain the original error so the low-level cause stays visible
        # in the traceback.
        raise error.DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it.") from exc

    # The data itself is not read here; the caller gets a description of
    # the call that would read it, together with the HEAD metadata.
    return {
        "method_to_call": _get_output_disk,
        "method_args": (step_uuid, ),
        "method_kwargs": {
            "serialization": serialization
        },
        "metadata": {
            "timestamp": timestamp,
            "serialization": serialization,
            "name": name,
        },
    }
コード例 #5
0
ファイル: transfer.py プロジェクト: zkan/orchest
def resolve_disk(step_uuid: str) -> Dict[str, Any]:
    """Returns information of the most recent write to disk.

    Resolves via the HEAD file the timestamp (that is used to determine
    the most recent write) and arguments to call the
    :meth:`get_output_disk` method.

    Args:
        step_uuid: The UUID of the step to resolve its most recent write
            to disk.

    Returns:
        Dictionary containing the information of the function to be
        called to get the most recent data from the step. Additionally,
        returns fill-in arguments for the function. Its keys are
        ``'timestamp'``, ``'method_to_call'``, ``'method_args'`` and
        ``'method_kwargs'``.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be found.
        ValueError: If the content of the HEAD file does not consist of
            exactly two ``', '`` separated fields.
    """
    step_data_dir = Config.get_step_data_dir(step_uuid)
    head_file = os.path.join(step_data_dir, 'HEAD')

    try:
        with open(head_file, 'r') as f:
            # Strip surrounding whitespace so that a trailing newline
            # (e.g. from a manually edited HEAD file) does not end up
            # inside the serialization value.
            timestamp, serialization = f.read().strip().split(', ')

    except FileNotFoundError as exc:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        # Chain the original error so the low-level cause stays visible
        # in the traceback.
        raise DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            'Try rerunning it.') from exc

    # The data itself is not read here; the caller gets a description of
    # the call that would read it.
    return {
        'timestamp': timestamp,
        'method_to_call': get_output_disk,
        'method_args': (step_uuid, ),
        'method_kwargs': {
            'serialization': serialization
        }
    }
コード例 #6
0
ファイル: transfer.py プロジェクト: fruttasecca/orchest
def output_to_disk(data: Any,
                   pickle_fallback: bool = True,
                   serialization: Optional[str] = None) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization.

    Args:
        data: Data to output to disk.
        pickle_fallback: This option is passed to :meth:`serialize`. If
            ``pyarrow`` cannot serialize the data, then it will fall
            back to using ``pickle``. This is helpful for custom data
            types.
        serialization: Serialization of the `data` in case it is already
            serialized. Currently supported values are:
            ``['arrow', 'arrowpickle']``.

    Raises:
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = 'Data I would like to use in my next step'
        >>> output_to_disk(data)

    Note:
        Calling :meth:`output_to_disk` multiple times within the same script
        will overwrite the output. Generally speaking you therefore want
        to be only calling the function once.

    """
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError as exc:
        # Re-raise with a message specific to this operation, keeping
        # the original error attached as the cause.
        raise StepUUIDResolveError(
            "Failed to determine where to output data to.") from exc

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = serialize(data, pickle_fallback=pickle_fallback)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # The HEAD file serves to resolve the transfer method. Its content
    # is "<UTC timestamp>, <serialization>".
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        current_time = datetime.utcnow()
        f.write(
            f'{current_time.isoformat(timespec="seconds")}, {serialization}')

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)
コード例 #7
0
ファイル: transfer.py プロジェクト: swipswaps/orchest
def output_to_disk(data: Any,
                   name: Optional[str],
                   serialization: Optional[Serialization] = None) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization
      and the name of the output data.

    Args:
        data: Data to output to disk.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered nameless.
            This affects the way the data can be later retrieved using
            :func:`get_inputs`.
        serialization: Serialization of the `data` in case it is already
            serialized. For possible values see :class:`Serialization`.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g because it is a reserved name (``"unnamed"``) or because
            it contains a reserved substring.
        PipelineDefinitionNotFoundError: If the pipeline definition file
            could not be found.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_disk(data, name="my_data")

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to be only calling the
        function once.

    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise error.DataInvalidNameError(e) from e

    # Nameless data is stored under the reserved placeholder name.
    if name is None:
        name = Config._RESERVED_UNNAMED_OUTPUTS_STR

    try:
        with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
            pipeline_definition = json.load(f)
    except FileNotFoundError as e:
        raise error.PipelineDefinitionNotFoundError(
            f"Could not open {Config.PIPELINE_DEFINITION_PATH}.") from e

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except error.StepUUIDResolveError as e:
        # Re-raise with a message specific to this operation, keeping
        # the original error attached as the cause.
        raise error.StepUUIDResolveError(
            "Failed to determine where to output data to.") from e

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = _serialize(data)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # The HEAD file serves to resolve the transfer method. Its content
    # is the UTC timestamp, the serialization name and the data name
    # joined by the metadata separator.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        metadata_fields = [
            datetime.utcnow().isoformat(timespec="seconds"),
            serialization.name,
            name,
        ]
        f.write(Config.__METADATA_SEPARATOR__.join(metadata_fields))

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)