Exemple #1
0
    def testMakeDefaultOutputConfig(self):
        """Checks the split count of the default output config.

        Two input configs are exercised: the default one, for which two
        output splits are expected, and one that declares explicit 'train'
        and 'eval' input splits, for which no output splits are expected.
        """
        # Case 1: default input config.
        default_input = utils.make_default_input_config()
        result = utils.make_default_output_config(default_input)
        self.assertEqual(2, len(result.split_config.splits))

        # Case 2: input config that declares its own splits.
        train_split = example_gen_pb2.Input.Split(name='train',
                                                  pattern='train/*')
        eval_split = example_gen_pb2.Input.Split(name='eval',
                                                 pattern='eval/*')
        pre_split_input = example_gen_pb2.Input(
            splits=[train_split, eval_split])
        result = utils.make_default_output_config(pre_split_input)
        self.assertEqual(0, len(result.split_config.splits))
Exemple #2
0
 def __init__(self,
              executor: Any,
              input_base: Optional[channel.Channel] = None,
              input_config: Optional[example_gen_pb2.Input] = None,
              output_config: Optional[example_gen_pb2.Output] = None,
              component_name: Optional[Text] = 'ExampleGen',
              unique_name: Optional[Text] = None,
              outputs: Optional[base_component.ComponentOutputs] = None):
     """Construct an ExampleGen component.

     Args:
       executor: Executor forwarded unchanged to the base component.
       input_base: Optional channel; when given it is wrapped with
         `channel.as_channel` and registered under the 'input-base' key.
       input_config: Optional example_gen_pb2.Input instance. Falls back to
         `utils.make_default_input_config()` when not provided.
       output_config: Optional example_gen_pb2.Output instance. Falls back to
         `utils.make_default_output_config(input_config)` when not provided.
       component_name: Component name forwarded to the base component.
       unique_name: Unique name forwarded to the base component.
       outputs: Optional component outputs forwarded to the base component.

     Raises:
       RuntimeError: If both input_base and input_config are None.
     """
     if input_base is None and input_config is None:
         raise RuntimeError(
             'One of input_base and input_config must be set.')

     if input_base:
         input_dict = {'input-base': channel.as_channel(input_base)}
     else:
         input_dict = {}

     # The defaults are resolved here rather than in the executor because the
     # output artifacts depend on them.
     if input_config:
         self._input_config = input_config
     else:
         self._input_config = utils.make_default_input_config()
     if output_config:
         self._output_config = output_config
     else:
         self._output_config = utils.make_default_output_config(
             self._input_config)

     # Both configs travel to the executor as JSON strings.
     serialized_input = json_format.MessageToJson(self._input_config)
     serialized_output = json_format.MessageToJson(self._output_config)
     exec_properties = {
         'input': serialized_input,
         'output': serialized_output,
     }

     super(ExampleGen, self).__init__(
         component_name=component_name,
         unique_name=unique_name,
         # The dedicated driver is only used when there is an input_base.
         driver=driver.Driver if input_base else base_driver.BaseDriver,
         executor=executor,
         input_dict=input_dict,
         outputs=outputs,
         exec_properties=exec_properties)
Exemple #3
0
    def __init__(
        self,
        input_config: Union[example_gen_pb2.Input,
                            data_types.RuntimeParameter],
        output_config: Optional[Union[example_gen_pb2.Output,
                                      data_types.RuntimeParameter]] = None,
        custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                      data_types.RuntimeParameter]] = None,
        range_config: Optional[Union[range_config_pb2.RangeConfig,
                                     data_types.RuntimeParameter]] = None,
        output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
        output_file_format: Optional[int] = (
            example_gen_pb2.FORMAT_TFRECORDS_GZIP),
    ):
        """Construct a QueryBasedExampleGen component.

        Args:
          input_config: An
            [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. _required_
          output_config: An
            [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing output configuration. If unset, the default
            splits will be labeled as 'train' and 'eval' with a distribution
            ratio of 2:1.
          custom_config: An
            [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing custom configuration for ExampleGen.
          range_config: An optional range_config_pb2.RangeConfig instance,
            specifying the range of span values to consider.
          output_data_format: Payload format of generated data in output
            artifact, one of example_gen_pb2.PayloadFormat enum.
          output_file_format: File format of generated data in output
            artifact, one of example_gen_pb2.FileFormat enum.

        Raises:
          ValueError: If output_data_format is not a value of the
            example_gen_pb2.PayloadFormat enum, or output_file_format is not
            a value of the example_gen_pb2.FileFormat enum.
        """
        # Configure outputs.
        output_config = output_config or utils.make_default_output_config(
            input_config)
        example_artifacts = types.Channel(type=standard_artifacts.Examples)

        # Adjacent string literals are joined verbatim, so the separating
        # space must be part of the first literal; without it the message
        # reads "...defined inthe example_gen_pb2...".
        if output_data_format not in example_gen_pb2.PayloadFormat.values():
            raise ValueError(
                'The value of output_data_format must be defined in '
                'the example_gen_pb2.PayloadFormat proto.')
        if output_file_format not in example_gen_pb2.FileFormat.values():
            raise ValueError(
                'The value of output_file_format must be defined in '
                'the example_gen_pb2.FileFormat proto.')

        spec = standard_component_specs.QueryBasedExampleGenSpec(
            input_config=input_config,
            output_config=output_config,
            range_config=range_config,
            output_data_format=output_data_format,
            output_file_format=output_file_format,
            custom_config=custom_config,
            examples=example_artifacts)
        super().__init__(spec=spec)
Exemple #4
0
    def __init__(self,
                 input_config: example_gen_pb2.Input,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 component_name: Optional[Text] = 'ExampleGen',
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Construct a QueryBasedExampleGen component.

        Args:
          input_config: An example_gen_pb2.Input instance, providing input
            configuration.
          output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, default splits will be 'train' and
            'eval' with size 2:1.
          component_name: Name of the component, should be unique per
            component class. Default to 'ExampleGen', can be overwritten by
            sub-classes.
          example_artifacts: Optional channel of 'ExamplesPath' for output
            train and eval examples. If unset, one 'ExamplesPath' artifact is
            created per generated output split name.
          name: Unique name for every component class instance.
        """
        # Configure outputs.
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            # One artifact per output split derived from the two configs.
            per_split_artifacts = []
            for split_name in utils.generate_output_split_names(
                    input_config, output_config):
                per_split_artifacts.append(
                    types.TfxArtifact('ExamplesPath', split=split_name))
            example_artifacts = channel.as_channel(per_split_artifacts)

        spec = QueryBasedExampleGenSpec(
            component_name=component_name,
            input_config=input_config,
            output_config=output_config,
            examples=example_artifacts)
        super(_QueryBasedExampleGen, self).__init__(spec=spec, name=name)
Exemple #5
0
    def __init__(
            self,
            input_base: Optional[Text] = None,
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            range_config: Optional[Union[range_config_pb2.RangeConfig,
                                         Dict[Text, Any]]] = None,
            output_data_format: Optional[int] = (
                example_gen_pb2.FORMAT_TF_EXAMPLE),
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            instance_name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: An external directory containing the data files.
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If unset, input files
            will be treated as a single split.
          output_config: An example_gen_pb2.Output instance, providing the
            output configuration. If unset, default splits will be 'train'
            and 'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          range_config: An optional range_config_pb2.RangeConfig instance,
            specifying the range of span values to consider. If unset, driver
            will default to searching for latest span with no restrictions.
          output_data_format: Payload format of generated data in output
            artifact, one of example_gen_pb2.PayloadFormat enum.
          example_artifacts: Channel of 'ExamplesPath' for output train and
            eval examples. If unset, a new channel of
            `standard_artifacts.Examples` type is created.
          custom_executor_spec: Optional custom executor spec overriding the
            default executor spec specified in the component attribute.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.
        """
        # Fill in whichever configs the caller left out; the default output
        # config is derived from the (possibly defaulted) input config.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        example_artifacts = example_artifacts or types.Channel(
            type=standard_artifacts.Examples)

        spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            range_config=range_config,
            output_data_format=output_data_format,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name)
Exemple #6
0
    def __init__(self,
                 input_config: Union[example_gen_pb2.Input, Dict[Text, Any]],
                 output_config: Optional[Union[example_gen_pb2.Output,
                                               Dict[Text, Any]]] = None,
                 custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                               Dict[Text, Any]]] = None,
                 output_data_format: Optional[int] = (
                     example_gen_pb2.FORMAT_TF_EXAMPLE),
                 example_artifacts: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Construct a QueryBasedExampleGen component.

        Args:
          input_config: An
            [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If any field is provided
            as a RuntimeParameter, input_config should be constructed as a
            dict with the same field names as Input proto message. _required_
          output_config: An
            [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing output configuration. If unset, the default
            splits will be labeled as 'train' and 'eval' with a distribution
            ratio of 2:1. If any field is provided as a RuntimeParameter,
            output_config should be constructed as a dict with the same field
            names as Output proto message.
          custom_config: An
            [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing custom configuration for ExampleGen. If any
            field is provided as a RuntimeParameter, custom_config should be
            constructed as a dict.
          output_data_format: Payload format of generated data in output
            artifact, one of example_gen_pb2.PayloadFormat enum.
          example_artifacts: Channel of `standard_artifacts.Examples` for
            output train and eval examples. If unset, a new channel of that
            type is created.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.

        Raises:
          ValueError: If output_data_format is not a value of the
            example_gen_pb2.PayloadFormat enum.
        """
        # Configure outputs.
        output_config = output_config or utils.make_default_output_config(
            input_config)
        if not example_artifacts:
            example_artifacts = types.Channel(type=standard_artifacts.Examples)

        # Adjacent string literals are joined verbatim, so the separating
        # space must be part of the first literal; without it the message
        # reads "...defined inthe example_gen_pb2...".
        if output_data_format not in example_gen_pb2.PayloadFormat.values():
            raise ValueError(
                'The value of output_data_format must be defined in '
                'the example_gen_pb2.PayloadFormat proto.')

        spec = QueryBasedExampleGenSpec(input_config=input_config,
                                        output_config=output_config,
                                        output_data_format=output_data_format,
                                        custom_config=custom_config,
                                        examples=example_artifacts)
        super(QueryBasedExampleGen, self).__init__(spec=spec,
                                                   instance_name=instance_name)
Exemple #7
0
    def __init__(
            self,
            input: types.Channel = None,  # pylint: disable=redefined-builtin
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            input_base: Optional[types.Channel] = None,
            instance_name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Either `input` or `input_base` must be present in the input
        arguments; `input` takes precedence when both are given.

        Args:
          input: A Channel of type `standard_artifacts.ExternalArtifact`,
            which includes one artifact whose uri is an external directory
            containing the data files. _required_
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If unset, the files
            under input_base will be treated as a single dataset.
          output_config: An example_gen_pb2.Output instance, providing the
            output configuration. If unset, default splits will be 'train'
            and 'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          example_artifacts: Channel of 'ExamplesPath' for output train and
            eval examples. If unset, one `standard_artifacts.Examples`
            artifact is created per generated output split name.
          custom_executor_spec: Optional custom executor spec overriding the
            default executor spec specified in the component attribute.
          input_base: Backwards compatibility alias for the 'input' argument.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.
        """
        # Fall back to the legacy alias when the primary argument is unset.
        if not input:
            input = input_base

        # Configure inputs and outputs.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            # Split names are coerced to str before being attached to the
            # artifacts.
            per_split_artifacts = []
            for split_name in utils.generate_output_split_names(
                    input_config, output_config):
                per_split_artifacts.append(
                    standard_artifacts.Examples(split=str(split_name)))
            example_artifacts = channel_utils.as_channel(per_split_artifacts)

        spec = FileBasedExampleGenSpec(
            input_base=input,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name)
Exemple #8
0
    def __init__(
            self,
            input_base: Optional[str] = None,
            input_config: Optional[Union[example_gen_pb2.Input,
                                         data_types.RuntimeParameter]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          data_types.RuntimeParameter]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          data_types.RuntimeParameter]] = None,
            range_config: Optional[Union[range_config_pb2.RangeConfig,
                                         data_types.RuntimeParameter]] = None,
            output_data_format: Optional[int] = (
                example_gen_pb2.FORMAT_TF_EXAMPLE),
            output_file_format: Optional[int] = (
                example_gen_pb2.FORMAT_TFRECORDS_GZIP),
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: An external directory containing the data files.
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If unset, input files
            will be treated as a single split.
          output_config: An example_gen_pb2.Output instance, providing the
            output configuration. If unset, default splits will be 'train'
            and 'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          range_config: An optional range_config_pb2.RangeConfig instance,
            specifying the range of span values to consider. If unset, driver
            will default to searching for latest span with no restrictions.
          output_data_format: Payload format of generated data in output
            artifact, one of example_gen_pb2.PayloadFormat enum.
          output_file_format: File format of generated data in output
            artifact, one of example_gen_pb2.FileFormat enum.
          custom_executor_spec: Optional custom executor spec overriding the
            default executor spec specified in the component attribute.
        """
        # Fill in whichever configs the caller left out; the default output
        # config is derived from the (possibly defaulted) input config.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # The output channel is always created here; callers cannot inject one.
        examples_channel = types.Channel(type=standard_artifacts.Examples)

        spec = standard_component_specs.FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            range_config=range_config,
            output_data_format=output_data_format,
            output_file_format=output_file_format,
            examples=examples_channel)
        super().__init__(spec=spec, custom_executor_spec=custom_executor_spec)
Exemple #9
0
    def __init__(self,
                 input_config: Union[example_gen_pb2.Input, Dict[Text, Any]],
                 output_config: Optional[Union[example_gen_pb2.Output,
                                               Dict[Text, Any]]] = None,
                 custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                               Dict[Text, Any]]] = None,
                 example_artifacts: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None,
                 enable_cache: Optional[bool] = None):
        """Construct a QueryBasedExampleGen component.

        Args:
          input_config: An
            [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If any field is provided
            as a RuntimeParameter, input_config should be constructed as a
            dict with the same field names as Input proto message. _required_
          output_config: An
            [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing output configuration. If unset, the default
            splits will be labeled as 'train' and 'eval' with a distribution
            ratio of 2:1. If any field is provided as a RuntimeParameter,
            output_config should be constructed as a dict with the same field
            names as Output proto message.
          custom_config: An
            [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing custom configuration for ExampleGen. If any
            field is provided as a RuntimeParameter, custom_config should be
            constructed as a dict.
          example_artifacts: Channel of `standard_artifacts.Examples` for
            output train and eval examples. If unset, a channel holding a
            single Examples artifact is created, with the generated output
            split names encoded into its `split_names` property.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.
          enable_cache: Optional boolean to indicate if cache is enabled for
            the QueryBasedExampleGen component. If not specified, defaults to
            the value specified for pipeline's enable_cache parameter.
        """
        # Configure outputs.
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            # A single artifact carries every split name in encoded form.
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            examples = standard_artifacts.Examples()
            examples.split_names = artifact_utils.encode_split_names(
                split_names)
            example_artifacts = channel_utils.as_channel([examples])

        spec = QueryBasedExampleGenSpec(
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(_QueryBasedExampleGen, self).__init__(
            spec=spec,
            instance_name=instance_name,
            enable_cache=enable_cache)
Exemple #10
0
    def __init__(
            self,
            input_base: types.Channel = None,
            input_config: Optional[example_gen_pb2.Input] = None,
            output_config: Optional[example_gen_pb2.Output] = None,
            custom_config: Optional[example_gen_pb2.CustomConfig] = None,
            component_name: Optional[Text] = 'ExampleGen',
            example_artifacts: Optional[types.Channel] = None,
            executor_class: Optional[Type[base_executor.BaseExecutor]] = None,
            input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
            name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: A Channel of 'ExternalPath' type, which includes one
            artifact whose uri is an external directory with data files
            inside (required). Takes precedence over `input` when both are
            given.
          input_config: An optional example_gen_pb2.Input instance, providing
            input configuration. If unset, the files under input_base (must
            set) will be treated as a single split.
          output_config: An optional example_gen_pb2.Output instance,
            providing output configuration. If unset, default splits will be
            'train' and 'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          component_name: Name of the component, should be unique per
            component class. Default to 'ExampleGen', can be overwritten by
            sub-classes. NOTE(review): not referenced in this constructor
            body.
          example_artifacts: Optional channel of 'ExamplesPath' for output
            train and eval examples. If unset, one
            `standard_artifacts.Examples` artifact is created per generated
            output split name.
          executor_class: Optional custom executor class overriding the
            default executor specified in the component attribute.
          input: Forwards compatibility alias for the 'input_base' argument.
          name: Unique name for every component class instance.
        """
        # Fall back to the forward-compatible alias when input_base is unset.
        if not input_base:
            input_base = input

        # Configure inputs and outputs.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            per_split_artifacts = []
            for split_name in utils.generate_output_split_names(
                    input_config, output_config):
                per_split_artifacts.append(
                    standard_artifacts.Examples(split=split_name))
            example_artifacts = channel_utils.as_channel(per_split_artifacts)

        spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_class=executor_class,
            name=name)
Exemple #11
0
 def testMakeDefaultOutputConfigWithParameter(self):
   """Checks the default output config for a dict input with a RuntimeParameter.

   The input config is given as a dict with two splits, one of which uses a
   RuntimeParameter as its name; no output splits are expected.
   """
   split_name_param = data_types.RuntimeParameter(
       name='split-name', ptype=str, default=u'train')
   parameterized_split = {'name': split_name_param, 'pattern': 'train/*'}
   literal_split = {'name': 'eval', 'pattern': 'eval/*'}
   input_config_dict = {'splits': [parameterized_split, literal_split]}

   result = utils.make_default_output_config(input_config_dict)
   self.assertEqual(0, len(result.split_config.splits))
Exemple #12
0
    def __init__(self,
                 conn_config: presto_config_pb2.PrestoConnConfig,
                 query: Optional[Text] = None,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Constructs a PrestoExampleGen component.

        Args:
          conn_config: Parameters for Presto connection client.
          query: Presto sql string, query result will be treated as a single
            split, can be overwritten by input_config.
          input_config: An example_gen_pb2.Input instance with Split.pattern
            as Presto sql string. If set, it overwrites the 'query' arg, and
            allows different queries per split.
          output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, default splits will be 'train' and
            'eval' with size 2:1.
          example_artifacts: Optional channel of 'ExamplesPath' for output
            train and eval examples.
          name: Optional unique name. Necessary if multiple PrestoExampleGen
            components are declared in the same pipeline.

        Raises:
          RuntimeError: If query and input_config are both set or both unset,
            or if the host field of conn_config is not set.
        """
        # Exactly one of the two query sources may be supplied.
        has_query = bool(query)
        has_input_config = bool(input_config)
        if has_query == has_input_config:
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')
        if not conn_config.host:
            raise RuntimeError(
                'Required host field in connection config should be set.')

        if not input_config:
            input_config = utils.make_default_input_config(query)
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # The connection config is handed to the parent class packed inside a
        # generic CustomConfig message.
        packed_custom_config = example_gen_pb2.CustomConfig()
        packed_custom_config.custom_config.Pack(conn_config)

        super(PrestoExampleGen, self).__init__(
            input_config=input_config,
            output_config=output_config,
            custom_config=packed_custom_config,
            component_name='PrestoExampleGen',
            example_artifacts=example_artifacts,
            name=name)
    def __init__(
            self,
            input_example: channel.Channel,
            string_execution_parameter: Text,
            integer_execution_parameter: int,
            output_example: Optional[channel.Channel] = None,

            # don't change these three:
            input_config: Optional[example_gen_pb2.Input] = None,
            output_config: Optional[example_gen_pb2.Output] = None,
            name: Optional[Text] = None):
        """Constructs a Head Component.

        Args:
          input_example: A Channel of 'RandomTypeNameForInput' type (the type
            can be any string, as long as it is consistent in the channel,
            spec and artifacts).
          string_execution_parameter: A string execution parameter (only used
            in executor, not persistent or shared up stream).
          integer_execution_parameter: An integer execution parameter (only
            used in executor, not persistent or shared up stream).
          output_example: Optional output channel of
            'RandomTypeNameForOutput' (the type can be any string, as long as
            it is consistent in the channel, spec and artifacts); will be
            created for you if not specified.
          input_config: An optional example_gen_pb2.Input instance, providing
            input configuration. If unset,
            `utils.make_default_input_config()` is used.
          output_config: An optional example_gen_pb2.Output instance,
            providing output configuration. If unset, default splits will be
            'train' and 'eval' with size 2:1.
          name: Optional unique name. Necessary if multiple instances of this
            component are declared in the same pipeline.
        """
        # Configure inputs and outputs (don't change).
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not output_example:
            # Default output: a channel holding one artifact of the same
            # type name.
            output_example = channel.Channel(
                type_name='RandomTypeNameForOutput',
                artifacts=[types.TfxArtifact('RandomTypeNameForOutput')])

        spec = CustomHeadComponentSpec(
            input_example=input_example,
            integer_execution_parameter=integer_execution_parameter,
            string_execution_parameter=string_execution_parameter,
            input_config=input_config,
            output_config=output_config,
            output_example=output_example)

        super(CustomHeadComponent, self).__init__(spec=spec, name=name)
Exemple #14
0
 def __init__(self,
              query: Optional[Text] = None,
              input_config: Optional[example_gen_pb2.Input] = None,
              output_config: Optional[example_gen_pb2.Output] = None,
              name: Optional[Text] = None,
              outputs: Optional[base_component.ComponentOutputs] = None):
   """Constructs a BigQueryExampleGen component.

   Args:
     query: Optional query string; used to build the default input config
       when input_config is not given.
     input_config: Optional example_gen_pb2.Input instance. Must not be
       combined with query.
     output_config: Optional example_gen_pb2.Output instance. Falls back to
       `utils.make_default_output_config(input_config)` when not given.
     name: Optional name, forwarded to the parent class as `unique_name`.
     outputs: Optional component outputs forwarded to the parent class.

   Raises:
     RuntimeError: If query and input_config are both set or both unset.
   """
   has_query = bool(query)
   has_input_config = bool(input_config)
   if has_query == has_input_config:
     raise RuntimeError('Only one of query and input_config should be set.')

   if not input_config:
     input_config = utils.make_default_input_config(query)
   if not output_config:
     output_config = utils.make_default_output_config(input_config)

   # No input_base is passed to the parent class for this component.
   super(BigQueryExampleGen, self).__init__(
       executor=executor.Executor,
       input_base=None,
       input_config=input_config,
       output_config=output_config,
       component_name='BigQueryExampleGen',
       unique_name=name,
       outputs=outputs)
Exemple #15
0
    def __init__(
            self,
            input_base: channel.Channel,
            input_config: Optional[example_gen_pb2.Input] = None,
            output_config: Optional[example_gen_pb2.Output] = None,
            example_artifacts: Optional[channel.Channel] = None,
            executor_class: Optional[Type[base_executor.BaseExecutor]] = None,
            name: Optional[Text] = None):
        """Build a FileBasedExampleGen component from its inputs and configs.

        Args:
          input_base: Channel of 'ExternalPath' type holding one artifact
            whose uri is an external directory containing the data files.
          input_config: Optional example_gen_pb2.Input describing the input
            splits. When falsy, utils.make_default_input_config() is used.
          output_config: Optional example_gen_pb2.Output describing the
            output splits. When falsy, it is derived from the input config
            with utils.make_default_output_config().
          example_artifacts: Optional output channel. When falsy, one
            'ExamplesPath' artifact is created per generated split name.
          executor_class: Optional executor class, passed to the base class
            as custom_executor_class.
          name: Unique name for this component instance.
        """
        # Fill in whichever configuration the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Default output channel: one artifact per output split.
        if not example_artifacts:
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            example_artifacts = channel.as_channel([
                types.TfxArtifact('ExamplesPath', split=split)
                for split in split_names
            ])

        component_spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=component_spec,
            custom_executor_class=executor_class,
            name=name)
Exemple #16
0
  def __init__(self,
               conn_config: presto_config_pb2.PrestoConnConfig,
               query: Optional[str] = None,
               input_config: Optional[example_gen_pb2.Input] = None,
               output_config: Optional[example_gen_pb2.Output] = None):
    """Build a PrestoExampleGen component.

    Args:
      conn_config: Presto connection parameters; its host field must be set.
        The message is packed into an example_gen_pb2.CustomConfig and passed
        to the base class as custom_config.
      query: Presto sql string used to build the default input config.
        Exactly one of query and input_config must be given.
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        Presto sql string, allowing different queries per split. Exactly one
        of query and input_config must be given.
      output_config: An example_gen_pb2.Output instance. When falsy, it is
        derived from the input config with utils.make_default_output_config.

    Raises:
      RuntimeError: If query and input_config are both set or both unset, or
        if conn_config.host is empty.
    """
    has_query = bool(query)
    has_input_config = bool(input_config)
    if has_query == has_input_config:
      raise RuntimeError('Exactly one of query and input_config should be set.')
    if not conn_config.host:
      raise RuntimeError(
          'Required host field in connection config should be set.')

    # Resolve the configs, falling back to defaults where needed.
    if not input_config:
      input_config = utils.make_default_input_config(query)
    if not output_config:
      output_config = utils.make_default_output_config(input_config)

    # Wrap the connection settings so they travel as the custom config.
    wrapped_conn_config = example_gen_pb2.CustomConfig()
    wrapped_conn_config.custom_config.Pack(conn_config)

    super().__init__(
        input_config=input_config,
        output_config=output_config,
        custom_config=wrapped_conn_config)
Exemple #17
0
    def __init__(self,
                 input_base: channel.Channel,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 component_name: Optional[Text] = 'ExampleGen',
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Build a _FileBasedExampleGen component.

        Args:
          input_base: Channel of 'ExternalPath' type holding one artifact
            whose uri is an external directory containing the data files.
          input_config: Optional example_gen_pb2.Input describing the input
            splits. When falsy, utils.make_default_input_config() is used.
          output_config: Optional example_gen_pb2.Output describing the
            output splits. When falsy, it is derived from the input config
            with utils.make_default_output_config().
          component_name: Component name handed to the spec. Defaults to
            'ExampleGen'; sub-classes may override it.
          example_artifacts: Optional output channel. When falsy, one
            'ExamplesPath' artifact is created per generated split name.
          name: Unique name for this component instance.
        """
        # Fill in whichever configuration the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Default output channel: one artifact per output split.
        if not example_artifacts:
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            example_artifacts = channel.as_channel([
                types.TfxArtifact('ExamplesPath', split=split)
                for split in split_names
            ])

        component_spec = FileBasedExampleGenSpec(
            component_name=component_name,
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            examples=example_artifacts)
        super(_FileBasedExampleGen, self).__init__(
            spec=component_spec, name=name)
Exemple #18
0
  def __init__(self,
               input_config: example_gen_pb2.Input,
               output_config: Optional[example_gen_pb2.Output] = None,
               custom_config: Optional[example_gen_pb2.CustomConfig] = None,
               example_artifacts: Optional[types.Channel] = None,
               instance_name: Optional[Text] = None):
    """Build a QueryBasedExampleGen component.

    Args:
      input_config: Required example_gen_pb2.Input instance providing the
        input configuration.
      output_config: Optional example_gen_pb2.Output instance. When falsy, it
        is derived from input_config with utils.make_default_output_config.
      custom_config: Optional example_gen_pb2.CustomConfig instance, passed
        through to the spec unchanged.
      example_artifacts: Optional output channel. When falsy, one
        standard_artifacts.Examples artifact is created per generated split
        name.
      instance_name: Optional unique instance name, forwarded to the base
        class.
    """
    # Only the output side has a default; input_config is mandatory.
    if not output_config:
      output_config = utils.make_default_output_config(input_config)

    if not example_artifacts:
      split_names = utils.generate_output_split_names(
          input_config, output_config)
      example_artifacts = channel_utils.as_channel(
          [standard_artifacts.Examples(split=split) for split in split_names])

    component_spec = QueryBasedExampleGenSpec(
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        examples=example_artifacts)
    super(_QueryBasedExampleGen, self).__init__(
        spec=component_spec, instance_name=instance_name)
Exemple #19
0
    def __init__(self,
                 query: Optional[Text] = None,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Build a BigQueryExampleGen component.

        Args:
          query: BigQuery sql string used to build the default input config.
            Exactly one of query and input_config must be given.
          input_config: An example_gen_pb2.Input instance with Split.pattern
            as BigQuery sql string, allowing different queries per split.
            Exactly one of query and input_config must be given.
          output_config: An example_gen_pb2.Output instance. When falsy, it
            is derived from the input config with
            utils.make_default_output_config.
          example_artifacts: Optional output channel, forwarded to the base
            class unchanged.
          name: Optional unique name, forwarded to the base class.

        Raises:
          RuntimeError: If query and input_config are both set or both unset.
        """
        has_query = bool(query)
        has_input_config = bool(input_config)
        if has_query == has_input_config:
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')

        # Resolve the configs, falling back to defaults where needed.
        if not input_config:
            input_config = utils.make_default_input_config(query)
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        super(BigQueryExampleGen, self).__init__(
            input_config=input_config,
            output_config=output_config,
            component_name='BigQueryExampleGen',
            example_artifacts=example_artifacts,
            name=name)
Exemple #20
0
    def __init__(
            self,
            # TODO(b/159467778): deprecate this, use input_base instead.
            input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
            input_base: Optional[Text] = None,
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            output_data_format: Optional[int] = (
                example_gen_pb2.FORMAT_TF_EXAMPLE),
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            instance_name: Optional[Text] = None):
        """Build a FileBasedExampleGen component.

        Args:
          input: Deprecated in favour of input_base. A Channel of type
            `standard_artifacts.ExternalArtifact`; when truthy, the single
            uri of its artifacts replaces input_base.
          input_base: An external directory containing the data files.
          input_config: An example_gen_pb2.Input instance (or dict) providing
            input configuration. When falsy,
            utils.make_default_input_config() is used.
          output_config: An example_gen_pb2.Output instance (or dict)
            providing output configuration. When falsy, it is derived from
            the input config with utils.make_default_output_config.
          custom_config: Optional example_gen_pb2.CustomConfig instance (or
            dict), passed through to the spec unchanged.
          output_data_format: Payload format of the generated data, passed
            through to the spec. Defaults to
            example_gen_pb2.FORMAT_TF_EXAMPLE.
          example_artifacts: Optional output channel. When falsy, a
            types.Channel of type standard_artifacts.Examples is created.
          custom_executor_spec: Optional executor spec, forwarded to the base
            class.
          instance_name: Optional unique instance name, forwarded to the base
            class.
        """
        if input:
            deprecation_notice = (
                'The "input" argument to the ExampleGen component has been '
                'deprecated by "input_base". Please update your usage as support for '
                'this argument will be removed soon.')
            logging.warning(deprecation_notice)
            # The legacy channel wins over any explicitly given input_base.
            legacy_artifacts = list(input.get())
            input_base = artifact_utils.get_single_uri(legacy_artifacts)

        # Fill in whichever configuration the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        example_artifacts = example_artifacts or types.Channel(
            type=standard_artifacts.Examples)

        component_spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            output_data_format=output_data_format,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=component_spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name)
    def __init__(self,
                 query: Optional[Text] = None,
                 beam_transform: beam.PTransform = None,
                 bucket_name: Optional[Text] = None,
                 output_schema: Optional[Text] = None,
                 table_name: Optional[Text] = None,
                 use_bigquery_source: Optional[Any] = False,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Build a TCGAPreprocessing component.

        Args:
            query: BigQuery sql string, passed to the spec as `query`.
            beam_transform: beam.PTransform used to process the data ingested
                by the BigQuery query. It is uploaded with
                upload_beam_to_gcs and the returned value is given to the
                spec as `input_base`.
            bucket_name: GCS bucket name handed to upload_beam_to_gcs; used
                as temporary storage for the query and pickle file.
            output_schema: Passed to the spec as `output_schema`.
            table_name: BigQuery output table name, passed to the spec.
            use_bigquery_source: Whether to use BigQuerySource instead of the
                experimental `ReadFromBigQuery` PTransform; passed to the
                spec.
            input_config: An example_gen_pb2.Input instance. When falsy,
                utils.make_default_input_config() is used.
            output_config: An example_gen_pb2.Output instance. When falsy, it
                is derived from the input config with
                utils.make_default_output_config.
            example_artifacts: Optional output channel. When falsy, a channel
                holding a single standard_artifacts.Examples artifact is
                created.
            instance_name: Optional unique instance name, forwarded to the
                base class.

        Note:
            No check is made here that only one of query and input_config is
            set; both are forwarded to the spec independently.
        """
        # Fill in whichever configuration the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        example_artifacts = example_artifacts or channel_utils.as_channel(
            [standard_artifacts.Examples()])

        # The transform is shipped to GCS; the spec only carries its location.
        transform_location = upload_beam_to_gcs(beam_transform, bucket_name)

        component_spec = TCGAPreprocessingSpec(
            # custom parameters
            query=query,
            output_schema=output_schema,
            table_name=table_name,
            use_bigquery_source=use_bigquery_source,
            # default parameters
            input_config=input_config,
            output_config=output_config,
            input_base=transform_location,
            # outputs
            examples=example_artifacts)
        super(TCGAPreprocessing, self).__init__(
            spec=component_spec, instance_name=instance_name)
Exemple #22
0
    def __init__(
            self,
            input: types.Channel = None,  # pylint: disable=redefined-builtin
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            input_base: Optional[types.Channel] = None,
            instance_name: Optional[Text] = None,
            enable_cache: Optional[bool] = None):
        """Build a FileBasedExampleGen component.

        Args:
          input: A Channel of type `standard_artifacts.ExternalArtifact`
            whose single artifact uri is an external directory containing
            the data files.
          input_config: An example_gen_pb2.Input instance (or dict) providing
            input configuration. When falsy,
            utils.make_default_input_config() is used.
          output_config: An example_gen_pb2.Output instance (or dict)
            providing output configuration. When falsy, it is derived from
            the input config with utils.make_default_output_config.
          custom_config: Optional example_gen_pb2.CustomConfig instance (or
            dict), passed through to the spec unchanged.
          example_artifacts: Optional output channel. When falsy, a channel
            with one standard_artifacts.Examples artifact is created, whose
            split_names holds the encoded generated split names.
          custom_executor_spec: Optional executor spec, forwarded to the base
            class.
          input_base: Deprecated alias for `input`. When truthy it replaces
            `input` and a deprecation warning is logged.
          instance_name: Optional unique instance name, forwarded to the base
            class.
          enable_cache: Optional cache switch, forwarded to the base class.
        """
        if input_base:
            deprecation_notice = (
                'The "input_base" argument to the ExampleGen component has '
                'been renamed to "input" and is deprecated. Please update your '
                'usage as support for this argument will be removed soon.')
            absl.logging.warning(deprecation_notice)
            # The legacy alias wins over any explicitly given input.
            input = input_base

        # Fill in whichever configuration the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            # A single Examples artifact records every output split name.
            examples = standard_artifacts.Examples()
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            examples.split_names = artifact_utils.encode_split_names(
                split_names)
            example_artifacts = channel_utils.as_channel([examples])

        component_spec = FileBasedExampleGenSpec(
            input=input,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=component_spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name,
            enable_cache=enable_cache)