Example #1
    def __init__(self,
                 examples: Channel = None,
                 schema: Channel = None,
                 module_file: Optional[Text] = None,
                 preprocessing_fn: Optional[ProcessingFn] = None,
                 transform_graph: Optional[Channel] = None,
                 split_names: Optional[List[Text]] = None,
                 input_data: Optional[Channel] = None,
                 instance_name: Optional[Text] = None):
        if bool(module_file) == bool(preprocessing_fn):
            raise ValueError(
                'Exactly one of `module_file` or `preprocessing_fn`'
                ' must be supplied.')

        # Fall back to the default example splits when none are supplied.
        split_names = splits_or_example_defaults(split_names)
        # Pre-declare the output Examples artifact with the resolved splits.
        transform_artifact = standard_artifacts.Examples()
        transform_artifact.split_names = artifact_utils.encode_split_names(
            split_names)

        transform_graph = transform_graph or Channel(
            type=standard_artifacts.TransformGraph,
            artifacts=[standard_artifacts.TransformGraph()])

        transformed_examples = Channel(type=standard_artifacts.Examples,
                                       artifacts=[transform_artifact])

        spec = TransformSpec(examples=examples,
                             schema=schema,
                             module_file=module_file,
                             preprocessing_fn=preprocessing_fn,
                             transform_graph=transform_graph,
                             transformed_examples=transformed_examples)
        super(Transform, self).__init__(spec=spec, instance_name=instance_name)
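
A minimal usage sketch for this variant, assuming hypothetical upstream `example_gen` and `schema_gen` components and a placeholder module path (none of these names come from the snippet above):

# Illustrative wiring only; the upstream components and the module path
# are placeholders.
transform = Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file='path/to/preprocessing.py')
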
Example #2
    def __init__(
            self,
            examples: types.Channel = None,
            schema: types.Channel = None,
            module_file: Optional[Union[Text,
                                        data_types.RuntimeParameter]] = None,
            preprocessing_fn: Optional[Union[
                Text, data_types.RuntimeParameter]] = None,
            transform_graph: Optional[types.Channel] = None,
            transformed_examples: Optional[types.Channel] = None,
            input_data: Optional[types.Channel] = None,
            instance_name: Optional[Text] = None,
            materialize: bool = True):
        """Construct a Transform component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples` (required).
        This should contain the two splits 'train' and 'eval'.
      schema: A Channel of type `standard_artifacts.Schema`. This should
        contain a single schema artifact.
      module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded. The function must have the
        following signature.

        def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]:
          ...

        where the values of the input and returned Dict are either tf.Tensor
        or tf.SparseTensor.  Exactly one of 'module_file' or
        'preprocessing_fn' must be supplied.
      preprocessing_fn: The path to a python function that implements a
        'preprocessing_fn'. See 'module_file' for the expected signature of
        the function. Exactly one of 'module_file' or 'preprocessing_fn' must
        be supplied.
      transform_graph: Optional output 'TransformPath' channel for the output
        of 'tf.Transform', which includes an exported Tensorflow graph
        suitable for both training and serving.
      transformed_examples: Optional output 'ExamplesPath' channel for
        materialized transformed examples, which includes both 'train' and
        'eval' splits.
      input_data: Backwards compatibility alias for the 'examples' argument.
      instance_name: Optional unique instance name. Necessary iff multiple
        transform components are declared in the same pipeline.
      materialize: If True, write transformed examples as an output. If False,
        `transformed_examples` must not be provided.

    Raises:
      ValueError: When both or neither of 'module_file' and 'preprocessing_fn'
        is supplied.
    """
        if input_data:
            absl.logging.warning(
                'The "input_data" argument to the Transform component has '
                'been renamed to "examples" and is deprecated. Please update your '
                'usage as support for this argument will be removed soon.')
            examples = input_data
        if bool(module_file) == bool(preprocessing_fn):
            raise ValueError(
                "Exactly one of 'module_file' or 'preprocessing_fn' must be supplied."
            )

        transform_graph = transform_graph or types.Channel(
            type=standard_artifacts.TransformGraph,
            artifacts=[standard_artifacts.TransformGraph()])
        if materialize and transformed_examples is None:
            transformed_examples = types.Channel(
                type=standard_artifacts.Examples,
                # TODO(b/161548528): remove the hardcode artifact.
                artifacts=[standard_artifacts.Examples()],
                matching_channel_name='examples')
        elif not materialize and transformed_examples is not None:
            raise ValueError(
                'must not specify transformed_examples when materialize==False'
            )
        spec = TransformSpec(examples=examples,
                             schema=schema,
                             module_file=module_file,
                             preprocessing_fn=preprocessing_fn,
                             transform_graph=transform_graph,
                             transformed_examples=transformed_examples)
        super(Transform, self).__init__(spec=spec, instance_name=instance_name)
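
The docstring above fixes the `preprocessing_fn` contract. A minimal module file satisfying it could look like the following sketch; the feature names and the choice of tf.Transform ops are illustrative, not taken from the snippet:

# preprocessing.py -- illustrative module file; feature names are made up.
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  # Map raw feature tensors to transformed feature tensors.
  outputs = {}
  # Scale a numeric feature to zero mean and unit variance.
  outputs['dense_xf'] = tft.scale_to_z_score(inputs['dense'])
  # Replace a string feature with integer vocabulary indices.
  outputs['label_xf'] = tft.compute_and_apply_vocabulary(inputs['label'])
  return outputs
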
Example #3
  def __init__(
      self,
      examples: types.Channel = None,
      schema: types.Channel = None,
      module_file: Optional[Union[Text, data_types.RuntimeParameter]] = None,
      preprocessing_fn: Optional[Union[Text,
                                       data_types.RuntimeParameter]] = None,
      splits_config: transform_pb2.SplitsConfig = None,
      transform_graph: Optional[types.Channel] = None,
      transformed_examples: Optional[types.Channel] = None,
      input_data: Optional[types.Channel] = None,
      analyzer_cache: Optional[types.Channel] = None,
      instance_name: Optional[Text] = None,
      materialize: bool = True,
      disable_analyzer_cache: bool = False,
      force_tf_compat_v1: bool = True,
      custom_config: Optional[Dict[Text, Any]] = None):
    """Construct a Transform component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples` (required).
        This should contain custom splits specified in splits_config. If
        custom split is not provided, this should contain two splits 'train'
        and 'eval'.
      schema: A Channel of type `standard_artifacts.Schema`. This should
        contain a single schema artifact.
      module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded.
        Exactly one of 'module_file' or 'preprocessing_fn' must be supplied.

        The function needs to have the following signature:
        ```
        def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]:
          ...
        ```
        where the values of the input and returned Dict are either tf.Tensor
        or tf.SparseTensor.

        If additional inputs are needed for preprocessing_fn, they can be passed
        in custom_config:

        ```
        def preprocessing_fn(inputs: Dict[Text, Any], custom_config:
                             Dict[Text, Any]) -> Dict[Text, Any]:
          ...
        ```
      preprocessing_fn: The path to a python function that implements a
        'preprocessing_fn'. See 'module_file' for the expected signature of
        the function. Exactly one of 'module_file' or 'preprocessing_fn' must
        be supplied.
      splits_config: A transform_pb2.SplitsConfig instance, providing the
        splits that should be analyzed and the splits that should be
        transformed. Note that the analyze and transform splits can overlap.
        The default behavior (when splits_config is not set) is to analyze
        the 'train' split and transform all splits. If splits_config is set,
        analyze cannot be empty.
      transform_graph: Optional output 'TransformPath' channel for the output
        of 'tf.Transform', which includes an exported Tensorflow graph
        suitable for both training and serving.
      transformed_examples: Optional output 'ExamplesPath' channel for
        materialized transformed examples, which includes transform splits as
        specified in splits_config. If custom split is not provided, this should
        include both 'train' and 'eval' splits.
      input_data: Backwards compatibility alias for the 'examples' argument.
      analyzer_cache: Optional input 'TransformCache' channel containing
        cached information from previous Transform runs. When provided,
        Transform will try to use the cached calculation if possible.
      instance_name: Optional unique instance name. Necessary iff multiple
        transform components are declared in the same pipeline.
      materialize: If True, write transformed examples as an output. If False,
        `transformed_examples` must not be provided.
      disable_analyzer_cache: If False, Transform will use input cache if
        provided and write cache output. If True, `analyzer_cache` must not be
        provided.
      force_tf_compat_v1: (Optional) If True, Transform will use Tensorflow
        in compat.v1 mode irrespective of the installed version of
        Tensorflow. Defaults to `True`. Note: the default value will be
        switched to `False` in a future release.
      custom_config: A dict which contains additional parameters that will be
        passed to preprocessing_fn.

    Raises:
      ValueError: When both or neither of 'module_file' and 'preprocessing_fn'
        is supplied.
    """
    if input_data:
      absl.logging.warning(
          'The "input_data" argument to the Transform component has '
          'been renamed to "examples" and is deprecated. Please update your '
          'usage as support for this argument will be removed soon.')
      examples = input_data
    if bool(module_file) == bool(preprocessing_fn):
      raise ValueError(
          "Exactly one of 'module_file' or 'preprocessing_fn' must be supplied."
      )

    transform_graph = transform_graph or types.Channel(
        type=standard_artifacts.TransformGraph)

    if materialize and transformed_examples is None:
      transformed_examples = types.Channel(
          type=standard_artifacts.Examples,
          matching_channel_name='examples')
    elif not materialize and transformed_examples is not None:
      raise ValueError(
          'Must not specify transformed_examples when materialize is False.')

    if disable_analyzer_cache:
      updated_analyzer_cache = None
      if analyzer_cache:
        raise ValueError(
            '`analyzer_cache` is set when disable_analyzer_cache is True.')
    else:
      updated_analyzer_cache = types.Channel(
          type=standard_artifacts.TransformCache)

    spec = TransformSpec(
        examples=examples,
        schema=schema,
        module_file=module_file,
        preprocessing_fn=preprocessing_fn,
        force_tf_compat_v1=int(force_tf_compat_v1),
        splits_config=splits_config,
        transform_graph=transform_graph,
        transformed_examples=transformed_examples,
        analyzer_cache=analyzer_cache,
        updated_analyzer_cache=updated_analyzer_cache,
        custom_config=json_utils.dumps(custom_config))
    super(Transform, self).__init__(spec=spec, instance_name=instance_name)
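
A sketch of how the newer arguments can be combined; the upstream components, module path, and custom_config contents are placeholders:

from tfx.proto import transform_pb2

# Illustrative: compute analyzer statistics on 'train' only, transform both
# splits, and forward extra parameters to preprocessing_fn via custom_config.
transform = Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file='path/to/preprocessing.py',
    splits_config=transform_pb2.SplitsConfig(
        analyze=['train'], transform=['train', 'eval']),
    custom_config={'vocab_size': 10000},
    force_tf_compat_v1=False)
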
Example #4
    def __init__(self,
                 input_data: types.Channel = None,
                 schema: types.Channel = None,
                 module_file: Optional[Text] = None,
                 preprocessing_fn: Optional[Text] = None,
                 transform_output: Optional[types.Channel] = None,
                 transformed_examples: Optional[types.Channel] = None,
                 examples: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Construct a Transform component.

    Args:
      input_data: A Channel of 'ExamplesPath' type (required). This should
        contain the two splits 'train' and 'eval'.
      schema: A Channel of 'SchemaPath' type. This should contain a single
        schema artifact.
      module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded. The function must have the
        following signature.

        def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]:
          ...

        where the values of the input and returned Dict are either tf.Tensor
        or tf.SparseTensor.  Exactly one of 'module_file' or
        'preprocessing_fn' must be supplied.
      preprocessing_fn: The path to a python function that implements a
        'preprocessing_fn'. See 'module_file' for the expected signature of
        the function. Exactly one of 'module_file' or 'preprocessing_fn' must
        be supplied.
      transform_output: Optional output 'TransformPath' channel for the
        output of 'tf.Transform', which includes an exported Tensorflow graph
        suitable for both training and serving.
      transformed_examples: Optional output 'ExamplesPath' channel for
        materialized transformed examples, which includes both 'train' and
        'eval' splits.
      examples: Forwards compatibility alias for the 'input_data' argument.
      instance_name: Optional unique instance name. Necessary iff multiple
        transform components are declared in the same pipeline.

    Raises:
      ValueError: When both or neither of 'module_file' and 'preprocessing_fn'
        is supplied.
    """
        input_data = input_data or examples
        if bool(module_file) == bool(preprocessing_fn):
            raise ValueError(
                "Exactly one of 'module_file' or 'preprocessing_fn' must be supplied."
            )

        transform_output = transform_output or types.Channel(
            type=standard_artifacts.TransformGraph,
            artifacts=[standard_artifacts.TransformGraph()])
        transformed_examples = transformed_examples or types.Channel(
            type=standard_artifacts.Examples,
            artifacts=[
                standard_artifacts.Examples(split=split)
                for split in artifact.DEFAULT_EXAMPLE_SPLITS
            ])
        spec = TransformSpec(input_data=input_data,
                             schema=schema,
                             module_file=module_file,
                             preprocessing_fn=preprocessing_fn,
                             transform_output=transform_output,
                             transformed_examples=transformed_examples)
        super(Transform, self).__init__(spec=spec, instance_name=instance_name)
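
In this older variant the examples channel is passed as `input_data`, and per the docstring `preprocessing_fn` may be given as a path to a function; the names below are placeholders:

# Illustrative wiring for the legacy signature; the upstream components and
# the dotted function path are placeholders.
transform = Transform(
    input_data=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    preprocessing_fn='mypackage.preprocessing.preprocessing_fn')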