コード例 #1
0
ファイル: component.py プロジェクト: zorrock/tfx
 def __init__(self,
              executor: Any,
              input_base: Optional[channel.Channel] = None,
              input_config: Optional[example_gen_pb2.Input] = None,
              output_config: Optional[example_gen_pb2.Output] = None,
              component_name: Optional[Text] = 'ExampleGen',
              unique_name: Optional[Text] = None,
              outputs: Optional[base_component.ComponentOutputs] = None):
     """Constructs an ExampleGen component.

     Args:
       executor: Executor forwarded unchanged to the parent constructor.
       input_base: Optional input channel. When truthy it is wrapped with
         `channel.as_channel` and registered under the 'input-base' key, and
         `driver.Driver` is used; otherwise the input dict is empty and
         `base_driver.BaseDriver` is used.
       input_config: Optional input configuration. Falls back to
         `utils.make_default_input_config()` when falsy.
       output_config: Optional output configuration. Falls back to
         `utils.make_default_output_config(...)` derived from the resolved
         input configuration when falsy.
       component_name: Component name forwarded to the parent constructor.
       unique_name: Unique name forwarded to the parent constructor.
       outputs: Component outputs forwarded to the parent constructor.

     Raises:
       RuntimeError: If both input_base and input_config are None.
     """
     nothing_to_read = input_base is None and input_config is None
     if nothing_to_read:
         raise RuntimeError('One of input_base and input_config must be set.')

     input_dict = {}
     if input_base:
         input_dict['input-base'] = channel.as_channel(input_base)

     # Defaults are resolved here rather than in the executor because the
     # output artifacts depend on them.
     if input_config:
         self._input_config = input_config
     else:
         self._input_config = utils.make_default_input_config()
     if output_config:
         self._output_config = output_config
     else:
         self._output_config = utils.make_default_output_config(
             self._input_config)

     # Both configs are handed to the executor as JSON strings.
     serialized_input = json_format.MessageToJson(self._input_config)
     serialized_output = json_format.MessageToJson(self._output_config)
     exec_properties = {'input': serialized_input, 'output': serialized_output}

     driver_class = driver.Driver if input_base else base_driver.BaseDriver
     super(ExampleGen, self).__init__(
         component_name=component_name,
         unique_name=unique_name,
         driver=driver_class,
         executor=executor,
         input_dict=input_dict,
         outputs=outputs,
         exec_properties=exec_properties)
コード例 #2
0
ファイル: component.py プロジェクト: NunoEdgarGFlowHub/tfx
    def __init__(self,
                 input_config: example_gen_pb2.Input,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 component_name: Optional[Text] = 'ExampleGen',
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Construct an ExampleGen component.

        Args:
          input_config: An example_gen_pb2.Input instance, providing input
            configuration. A falsy value is replaced by
            `utils.make_default_input_config()`.
          output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, default splits will be 'train' and 'eval'
            with size 2:1.
          component_name: Name of the component, should be unique per component
            class. Default to 'ExampleGen', can be overwritten by sub-classes.
          example_artifacts: Optional channel of 'ExamplesPath' for output train
            and eval examples. When falsy, one 'ExamplesPath' artifact is
            created per generated output split name.
          name: Unique name for every component class instance.
        """
        # Resolve the configuration defaults first; the output artifacts are
        # derived from them.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            per_split_artifacts = [
                types.TfxArtifact('ExamplesPath', split=split_name)
                for split_name in split_names
            ]
            example_artifacts = channel.as_channel(per_split_artifacts)

        spec = ExampleGenSpec(
            component_name=component_name,
            input_config=input_config,
            output_config=output_config,
            examples=example_artifacts)
        super(_ExampleGen, self).__init__(spec=spec, name=name)
コード例 #3
0
    def __init__(self,
                 query: Optional[Text] = None,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Constructs a BigQueryExampleGen component.

        Args:
          query: BigQuery sql string, query result will be treated as a single
            split. Used to build the default input config when input_config
            is not given.
          input_config: An example_gen_pb2.Input instance with Split.pattern as
            BigQuery sql string, allowing different queries per split. Must
            not be combined with 'query'.
          output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, default splits will be 'train' and 'eval'
            with size 2:1.
          example_artifacts: Optional channel of 'ExamplesPath' for output train
            and eval examples.
          name: Optional unique name. Necessary if multiple BigQueryExampleGen
            components are declared in the same pipeline.

        Raises:
          RuntimeError: If query and input_config are both set or both unset.
        """
        has_query = bool(query)
        has_input_config = bool(input_config)
        if has_query == has_input_config:
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')

        if not input_config:
            input_config = utils.make_default_input_config(query)

        super(BigQueryExampleGen, self).__init__(
            input_config=input_config,
            output_config=output_config,
            example_artifacts=example_artifacts,
            name=name)
コード例 #4
0
ファイル: component.py プロジェクト: jeongukjae/tfx
    def __init__(self,
                 query: Optional[str] = None,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[types.Channel] = None):
        """Constructs a BigQueryExampleGen component.

        Args:
          query: BigQuery sql string, query result will be treated as a single
            split. Used to build the default input config when input_config
            is not given.
          input_config: An example_gen_pb2.Input instance with Split.pattern as
            BigQuery sql string, allowing different queries per split. Must
            not be combined with 'query'. If any field is provided as a
            RuntimeParameter, input_config should be constructed as a dict
            with the same field names as Input proto message.
          output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, default splits will be 'train' and 'eval'
            with size 2:1. If any field is provided as a RuntimeParameter,
            output_config should be constructed as a dict with the same field
            names as Output proto message.
          example_artifacts: Optional channel of 'ExamplesPath' for output train
            and eval examples.

        Raises:
          RuntimeError: If query and input_config are both set or both unset.
        """
        has_query = bool(query)
        has_input_config = bool(input_config)
        if has_query == has_input_config:
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')

        if not input_config:
            input_config = utils.make_default_input_config(query)

        super(BigQueryExampleGen, self).__init__(
            input_config=input_config,
            output_config=output_config,
            example_artifacts=example_artifacts)
コード例 #5
0
ファイル: component.py プロジェクト: edmontdants/tfx
    def __init__(
            self,
            input_base: Optional[Text] = None,
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            range_config: Optional[Union[range_config_pb2.RangeConfig,
                                         Dict[Text, Any]]] = None,
            output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            instance_name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: an external directory containing the data files.
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If unset, input files
            will be treated as a single split.
          output_config: An example_gen_pb2.Output instance, providing the
            output configuration. If unset, default splits will be 'train' and
            'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          range_config: An optional range_config_pb2.RangeConfig instance,
            specifying the range of span values to consider. If unset, driver
            will default to searching for latest span with no restrictions.
          output_data_format: Payload format of generated data in output
            artifact, one of example_gen_pb2.PayloadFormat enum.
          example_artifacts: Channel of 'ExamplesPath' for output train and eval
            examples. When falsy, a new `standard_artifacts.Examples` channel
            is created.
          custom_executor_spec: Optional custom executor spec overriding the
            default executor spec specified in the component attribute.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.
        """
        # Fill in whichever configs / output channel the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)
        example_artifacts = example_artifacts or types.Channel(
            type=standard_artifacts.Examples)

        spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            range_config=range_config,
            output_data_format=output_data_format,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name)
コード例 #6
0
ファイル: component.py プロジェクト: etarakci-hvl/tfx
    def __init__(
            self,
            input: types.Channel = None,  # pylint: disable=redefined-builtin
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            input_base: Optional[types.Channel] = None,
            instance_name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Either `input` or `input_base` is expected to be provided; `input`
        wins when both are truthy. This constructor does not itself validate
        that one of them is set.

        Args:
          input: A Channel of type `standard_artifacts.ExternalArtifact`, which
            includes one artifact whose uri is an external directory
            containing the data files.
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If unset, the files under
            input_base will be treated as a single dataset.
          output_config: An example_gen_pb2.Output instance, providing the
            output configuration. If unset, default splits will be 'train' and
            'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          example_artifacts: Channel of 'ExamplesPath' for output train and eval
            examples. When falsy, one `standard_artifacts.Examples` artifact
            is created per generated output split name.
          custom_executor_spec: Optional custom executor spec overriding the
            default executor spec specified in the component attribute.
          input_base: Backwards compatibility alias for the 'input' argument.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.
        """
        # Collapse the two aliases into a single source channel.
        source_channel = input or input_base

        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            per_split_artifacts = [
                standard_artifacts.Examples(split=str(split_name))
                for split_name in split_names
            ]
            example_artifacts = channel_utils.as_channel(per_split_artifacts)

        spec = FileBasedExampleGenSpec(
            input_base=source_channel,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name)
コード例 #7
0
    def testMakeDefaultOutputConfig(self):
        """Checks the split count produced by make_default_output_config.

        A default input config is expected to yield two output splits, while
        an input that already declares 'train' and 'eval' splits is expected
        to yield none.
        """
        # Case 1: default (single-split) input.
        default_input = utils.make_default_input_config()
        output_config = utils.make_default_output_config(default_input)
        self.assertEqual(2, len(output_config.split_config.splits))

        # Case 2: input that is already split into train / eval.
        train_split = example_gen_pb2.Input.Split(
            name='train', pattern='train/*')
        eval_split = example_gen_pb2.Input.Split(name='eval', pattern='eval/*')
        presplit_input = example_gen_pb2.Input(splits=[train_split, eval_split])
        output_config = utils.make_default_output_config(presplit_input)
        self.assertEqual(0, len(output_config.split_config.splits))
コード例 #8
0
    def __init__(
            self,
            input_base: Optional[str] = None,
            input_config: Optional[Union[example_gen_pb2.Input,
                                         data_types.RuntimeParameter]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          data_types.RuntimeParameter]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          data_types.RuntimeParameter]] = None,
            range_config: Optional[Union[range_config_pb2.RangeConfig,
                                         data_types.RuntimeParameter]] = None,
            output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
            output_file_format: Optional[
                int] = example_gen_pb2.FORMAT_TFRECORDS_GZIP,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: an external directory containing the data files.
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If unset, input files
            will be treated as a single split.
          output_config: An example_gen_pb2.Output instance, providing the
            output configuration. If unset, default splits will be 'train' and
            'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          range_config: An optional range_config_pb2.RangeConfig instance,
            specifying the range of span values to consider. If unset, driver
            will default to searching for latest span with no restrictions.
          output_data_format: Payload format of generated data in output
            artifact, one of example_gen_pb2.PayloadFormat enum.
          output_file_format: File format of generated data in output artifact,
            one of example_gen_pb2.FileFormat enum.
          custom_executor_spec: Optional custom executor spec overriding the
            default executor spec specified in the component attribute.
        """
        # Fill in whichever configs the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # The output channel is always created here; callers cannot supply one.
        examples_channel = types.Channel(type=standard_artifacts.Examples)

        spec = standard_component_specs.FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            range_config=range_config,
            output_data_format=output_data_format,
            output_file_format=output_file_format,
            examples=examples_channel)
        super().__init__(spec=spec, custom_executor_spec=custom_executor_spec)
コード例 #9
0
    def __init__(self,
                 query: Optional[Text] = None,
                 elwc_config: Optional[elwc_config_pb2.ElwcConfig] = None,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Constructs a BigQueryToElwcExampleGen component.

        Args:
          query: BigQuery sql string, query result will be treated as a single
            split. Used to build the default input config when input_config
            is not given.
          elwc_config: The elwc config contains a list of context feature
            fields. The fields are used to build context feature. Examples
            with the same context feature will be converted to an
            ELWC(ExampleListWithContext) instance. For example, when there are
            two examples with the same context field, the two examples will be
            integrated into one ELWC instance. Required.
          input_config: An example_gen_pb2.Input instance with Split.pattern as
            BigQuery sql string, allowing different queries per split. Must
            not be combined with 'query'. If any field is provided as a
            RuntimeParameter, input_config should be constructed as a dict
            with the same field names as Input proto message.
          output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, default splits will be 'train' and 'eval'
            with size 2:1. If any field is provided as a RuntimeParameter,
            output_config should be constructed as a dict with the same field
            names as Output proto message.
          example_artifacts: Optional channel of 'ExamplesPath' for output train
            and eval examples.
          instance_name: Optional unique instance name. Necessary if multiple
            BigQueryToElwcExampleGen components are declared in the same
            pipeline.

        Raises:
          RuntimeError: If query and input_config are both set or both unset,
            or if elwc_config is missing.
        """
        has_query = bool(query)
        has_input_config = bool(input_config)
        if has_query == has_input_config:
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')
        if not elwc_config:
            raise RuntimeError(
                'elwc_config is required for BigQueryToElwcExampleGen.')

        if not input_config:
            input_config = utils.make_default_input_config(query)

        # The ELWC config is handed to the parent packed inside a CustomConfig.
        packed_custom_config = example_gen_pb2.CustomConfig()
        packed_custom_config.custom_config.Pack(elwc_config)

        super(BigQueryToElwcExampleGen, self).__init__(
            input_config=input_config,
            output_config=output_config,
            output_data_format=example_gen_pb2.FORMAT_PROTO,
            custom_config=packed_custom_config,
            example_artifacts=example_artifacts,
            instance_name=instance_name)
コード例 #10
0
    def __init__(
            self,
            input_base: types.Channel = None,
            input_config: Optional[example_gen_pb2.Input] = None,
            output_config: Optional[example_gen_pb2.Output] = None,
            custom_config: Optional[example_gen_pb2.CustomConfig] = None,
            component_name: Optional[Text] = 'ExampleGen',
            example_artifacts: Optional[types.Channel] = None,
            executor_class: Optional[Type[base_executor.BaseExecutor]] = None,
            input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
            name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: A Channel of 'ExternalPath' type, which includes one
            artifact whose uri is an external directory with data files inside
            (required). Takes precedence over `input` when both are truthy.
          input_config: An optional example_gen_pb2.Input instance, providing
            input configuration. If unset, the files under input_base (must
            set) will be treated as a single split.
          output_config: An optional example_gen_pb2.Output instance, providing
            output configuration. If unset, default splits will be 'train' and
            'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          component_name: Name of the component, should be unique per component
            class. Default to 'ExampleGen', can be overwritten by sub-classes.
            NOTE(review): this argument is not used anywhere in this
            constructor body.
          example_artifacts: Optional channel of 'ExamplesPath' for output train
            and eval examples. When falsy, one `standard_artifacts.Examples`
            artifact is created per generated output split name.
          executor_class: Optional custom executor class overriding the default
            executor specified in the component attribute.
          input: Forwards compatibility alias for the 'input_base' argument.
          name: Unique name for every component class instance.
        """
        # Collapse the two aliases into a single source channel.
        source_channel = input_base or input

        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            per_split_artifacts = [
                standard_artifacts.Examples(split=split_name)
                for split_name in split_names
            ]
            example_artifacts = channel_utils.as_channel(per_split_artifacts)

        spec = FileBasedExampleGenSpec(
            input_base=source_channel,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_class=executor_class,
            name=name)
コード例 #11
0
    def __init__(self,
                 conn_config: presto_config_pb2.PrestoConnConfig,
                 query: Optional[Text] = None,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Constructs a PrestoExampleGen component.

        Args:
          conn_config: Parameters for Presto connection client. Its `host`
            field must be set.
          query: Presto sql string, query result will be treated as a single
            split. Used to build the default input config when input_config
            is not given.
          input_config: An example_gen_pb2.Input instance with Split.pattern as
            Presto sql string, allowing different queries per split. Must not
            be combined with 'query'.
          output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, default splits will be 'train' and 'eval'
            with size 2:1.
          example_artifacts: Optional channel of 'ExamplesPath' for output train
            and eval examples.
          name: Optional unique name. Necessary if multiple PrestoExampleGen
            components are declared in the same pipeline.

        Raises:
          RuntimeError: If query and input_config are both set or both unset,
            or if the host field of conn_config is not set.
        """
        has_query = bool(query)
        has_input_config = bool(input_config)
        if has_query == has_input_config:
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')
        if not conn_config.host:
            raise RuntimeError(
                'Required host field in connection config should be set.')

        if not input_config:
            input_config = utils.make_default_input_config(query)

        # The connection config is handed to the parent packed inside a
        # CustomConfig.
        packed_custom_config = example_gen_pb2.CustomConfig()
        packed_custom_config.custom_config.Pack(conn_config)

        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        super(PrestoExampleGen, self).__init__(
            input_config=input_config,
            output_config=output_config,
            custom_config=packed_custom_config,
            component_name='PrestoExampleGen',
            example_artifacts=example_artifacts,
            name=name)
コード例 #12
0
    def __init__(
            self,
            input_example: channel.Channel,
            string_execution_parameter: Text,
            integer_execution_parameter: int,
            output_example: Optional[channel.Channel] = None,

            # don't change these three:
            input_config: Optional[example_gen_pb2.Input] = None,
            output_config: Optional[example_gen_pb2.Output] = None,
            name: Optional[Text] = None):
        """Constructs a Head Component.

        Args:
          input_example: A Channel of 'RandomTypeNameForInput' type (the type
            can be any string, as long as it is consistent in the channel,
            spec and artifacts).
          string_execution_parameter: A string execution parameter (only used
            in executor, not persistent or shared up stream).
          integer_execution_parameter: An integer execution parameter (only
            used in executor, not persistent or shared up stream).
          output_example: Optional output channel of 'RandomTypeNameForOutput'
            (the type can be any string, as long as it is consistent in the
            channel, spec and artifacts); will be created for you if not
            specified.
          input_config: An optional example_gen_pb2.Input instance, providing
            input configuration. Falls back to
            `utils.make_default_input_config()` when unset.
          output_config: An optional example_gen_pb2.Output instance, providing
            output configuration. If unset, default splits will be 'train' and
            'eval' with size 2:1.
          name: Optional unique name. Necessary if multiple instances of this
            component are declared in the same pipeline.
        """
        # Configure inputs and outputs (don't change).
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not output_example:
            default_artifact = types.TfxArtifact('RandomTypeNameForOutput')
            output_example = channel.Channel(
                type_name='RandomTypeNameForOutput',
                artifacts=[default_artifact])

        spec = CustomHeadComponentSpec(
            input_example=input_example,
            integer_execution_parameter=integer_execution_parameter,
            string_execution_parameter=string_execution_parameter,
            input_config=input_config,
            output_config=output_config,
            output_example=output_example)

        super(CustomHeadComponent, self).__init__(spec=spec, name=name)
コード例 #13
0
ファイル: component.py プロジェクト: zorrock/tfx
 def __init__(self,
              query: Optional[Text] = None,
              input_config: Optional[example_gen_pb2.Input] = None,
              output_config: Optional[example_gen_pb2.Output] = None,
              name: Optional[Text] = None,
              outputs: Optional[base_component.ComponentOutputs] = None):
   """Constructs a BigQueryExampleGen component.

   Args:
     query: Passed to `utils.make_default_input_config` to build the input
       configuration when input_config is not given.
     input_config: Input configuration, used as-is when given. Must not be
       combined with 'query'.
     output_config: Output configuration. Falls back to
       `utils.make_default_output_config(input_config)` when falsy.
     name: Forwarded to the parent constructor as `unique_name`.
     outputs: Component outputs forwarded to the parent constructor.

   Raises:
     RuntimeError: If query and input_config are both set or both unset.
   """
   has_query = bool(query)
   has_input_config = bool(input_config)
   if has_query == has_input_config:
     raise RuntimeError('Only one of query and input_config should be set.')

   if not input_config:
     input_config = utils.make_default_input_config(query)
   if not output_config:
     output_config = utils.make_default_output_config(input_config)

   # There is no file input for this component, so input_base is always None.
   super(BigQueryExampleGen, self).__init__(
       executor=executor.Executor,
       input_base=None,
       input_config=input_config,
       output_config=output_config,
       component_name='BigQueryExampleGen',
       unique_name=name,
       outputs=outputs)
コード例 #14
0
ファイル: component.py プロジェクト: sushantjha8/tfx
    def __init__(
            self,
            input_base: channel.Channel,
            input_config: Optional[example_gen_pb2.Input] = None,
            output_config: Optional[example_gen_pb2.Output] = None,
            example_artifacts: Optional[channel.Channel] = None,
            executor_class: Optional[Type[base_executor.BaseExecutor]] = None,
            name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: A Channel of 'ExternalPath' type, which includes one
            artifact whose uri is an external directory with data files
            inside.
          input_config: An optional example_gen_pb2.Input instance, providing
            input configuration. If unset, the files under input_base (must
            set) will be treated as a single split.
          output_config: An optional example_gen_pb2.Output instance, providing
            output configuration. If unset, default splits will be 'train' and
            'eval' with size 2:1.
          example_artifacts: Optional channel of 'ExamplesPath' for output train
            and eval examples. When falsy, one 'ExamplesPath' artifact is
            created per generated output split name.
          executor_class: Optional custom executor class overriding the default
            executor specified in the component attribute.
          name: Unique name for every component class instance.
        """
        # Resolve the configuration defaults first; the output artifacts are
        # derived from them.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            per_split_artifacts = [
                types.TfxArtifact('ExamplesPath', split=split_name)
                for split_name in split_names
            ]
            example_artifacts = channel.as_channel(per_split_artifacts)

        spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_class=executor_class,
            name=name)
コード例 #15
0
  def __init__(self,
               conn_config: presto_config_pb2.PrestoConnConfig,
               query: Optional[str] = None,
               input_config: Optional[example_gen_pb2.Input] = None,
               output_config: Optional[example_gen_pb2.Output] = None):
    """Constructs a PrestoExampleGen component.

    Args:
      conn_config: Parameters for Presto connection client. Its `host` field
        must be set.
      query: Presto sql string, query result will be treated as a single split.
        Used to build the default input config when input_config is not given.
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        Presto sql string, allowing different queries per split. Must not be
        combined with 'query'.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval' with
        size 2:1.

    Raises:
      RuntimeError: If query and input_config are both set or both unset, or
        if the host field of conn_config is not set.
    """
    has_query = bool(query)
    has_input_config = bool(input_config)
    if has_query == has_input_config:
      raise RuntimeError('Exactly one of query and input_config should be set.')
    if not conn_config.host:
      raise RuntimeError(
          'Required host field in connection config should be set.')

    if not input_config:
      input_config = utils.make_default_input_config(query)

    # The connection config is handed to the parent packed inside a
    # CustomConfig.
    packed_custom_config = example_gen_pb2.CustomConfig()
    packed_custom_config.custom_config.Pack(conn_config)

    if not output_config:
      output_config = utils.make_default_output_config(input_config)

    super().__init__(
        input_config=input_config,
        output_config=output_config,
        custom_config=packed_custom_config)
コード例 #16
0
ファイル: component.py プロジェクト: NunoEdgarGFlowHub/tfx
    def __init__(self,
                 input_base: channel.Channel,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 component_name: Optional[Text] = 'ExampleGen',
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Builds a FileBasedExampleGen component.

        Args:
          input_base: A Channel of 'ExternalPath' type, which includes one
            artifact whose uri is an external directory with data files
            inside.
          input_config: An optional example_gen_pb2.Input instance describing
            the input configuration. If unset, the files under input_base are
            treated as a single split.
          output_config: An optional example_gen_pb2.Output instance
            describing the output configuration. If unset, default splits
            will be 'train' and 'eval' with size 2:1.
          component_name: Name of the component, should be unique per
            component class. Defaults to 'ExampleGen'; sub-classes may
            override it.
          example_artifacts: Optional channel of 'ExamplesPath' for the output
            examples. If unset, one artifact is created per output split
            name.
          name: Unique name for every component class instance.
        """
        # Fill in defaults for anything the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            # One 'ExamplesPath' artifact per split derived from the configs.
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            example_artifacts = channel.as_channel([
                types.TfxArtifact('ExamplesPath', split=split)
                for split in split_names
            ])

        spec = FileBasedExampleGenSpec(
            component_name=component_name,
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            examples=example_artifacts)
        super(_FileBasedExampleGen, self).__init__(spec=spec, name=name)
コード例 #17
0
ファイル: component.py プロジェクト: jay90099/tfx
    def __init__(
        self,
        query: Optional[str] = None,
        input_config: Optional[Union[example_gen_pb2.Input,
                                     data_types.RuntimeParameter]] = None,
        output_config: Optional[Union[example_gen_pb2.Output,
                                      data_types.RuntimeParameter]] = None,
        range_config: Optional[Union[range_config_pb2.RangeConfig,
                                     data_types.RuntimeParameter]] = None):
        """Builds a BigQueryExampleGen component.

        Args:
          query: BigQuery sql string whose result is treated as a single
            split. Used to build the default input config when input_config
            is not given.
          input_config: An example_gen_pb2.Input instance with Split.pattern
            as BigQuery sql string, allowing a different query per split.
            Must not be combined with 'query'. If any field is provided as a
            RuntimeParameter, input_config should be constructed as a dict
            with the same field names as the Input proto message.
          output_config: An example_gen_pb2.Output instance describing the
            output configuration. If unset, default splits will be 'train'
            and 'eval' with size 2:1. If any field is provided as a
            RuntimeParameter, output_config should be constructed as a dict
            with the same field names as the Output proto message.
          range_config: An optional range_config_pb2.RangeConfig instance,
            specifying the range of span values to consider.

        Raises:
          RuntimeError: If 'query' and 'input_config' are both set or both
            unset.
        """
        # Exactly one of the two input sources may be supplied.
        has_query = bool(query)
        has_input_config = bool(input_config)
        if has_query == has_input_config:
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')

        if not input_config:
            input_config = utils.make_default_input_config(query)

        # output_config and range_config are forwarded untouched; any
        # defaulting is left to the base class.
        super().__init__(
            input_config=input_config,
            output_config=output_config,
            range_config=range_config)
コード例 #18
0
    def __init__(
            self,
            # TODO(b/159467778): deprecate this, use input_base instead.
            input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
            input_base: Optional[Text] = None,
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            instance_name: Optional[Text] = None):
        """Builds a FileBasedExampleGen component.

        Args:
          input: A Channel of type `standard_artifacts.ExternalArtifact`,
            which includes one artifact whose uri is an external directory
            containing the data files. Deprecated in favour of input_base;
            when given, it overrides input_base with the single uri taken
            from the channel.
          input_base: An external directory containing the data files.
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance describing the input configuration. If unset, input
            files will be treated as a single split.
          output_config: An example_gen_pb2.Output instance describing the
            output configuration. If unset, default splits will be 'train'
            and 'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for the executor.
          output_data_format: Payload format of generated data in the output
            artifact, one of the example_gen_pb2.PayloadFormat enum values.
          example_artifacts: Channel of 'ExamplesPath' for output train and
            eval examples. If unset, a new Channel of
            standard_artifacts.Examples type is created.
          custom_executor_spec: Optional custom executor spec overriding the
            default executor spec specified in the component attribute.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.
        """
        if input:
            deprecation_notice = (
                'The "input" argument to the ExampleGen component has been '
                'deprecated by "input_base". Please update your usage as support for '
                'this argument will be removed soon.')
            logging.warning(deprecation_notice)
            # The legacy channel wins over any input_base passed alongside it.
            legacy_artifacts = list(input.get())
            input_base = artifact_utils.get_single_uri(legacy_artifacts)

        # Fill in defaults for anything the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)
        if not example_artifacts:
            example_artifacts = types.Channel(type=standard_artifacts.Examples)

        spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            output_data_format=output_data_format,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name)
コード例 #19
0
    def __init__(self,
                 query: Optional[Text] = None,
                 beam_transform: beam.PTransform = None,
                 bucket_name: Optional[Text] = None,
                 output_schema: Optional[Text] = None,
                 table_name: Optional[Text] = None,
                 use_bigquery_source: Optional[Any] = False,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Builds a TCGAPreprocessing component.

        No argument validation is performed here: 'query' is forwarded to the
        spec unchanged and is not used to build the default input config.

        Args:
            query: BigQuery sql string, forwarded to the component spec.
            beam_transform: beam.PTransform used to process the data ingested
                by the BigQuery query. It is uploaded through
                upload_beam_to_gcs and the value returned by that call is
                handed to the spec as input_base.
            bucket_name: Name of the GCS bucket passed to upload_beam_to_gcs
                together with beam_transform.
            output_schema: Optional text forwarded to the component spec.
                NOTE(review): expected format is not visible here - confirm
                against the executor.
            table_name: String containing the BigQuery output table name.
            use_bigquery_source: Whether to use BigQuerySource instead of the
                experimental `ReadFromBigQuery` PTransform.
            input_config: An example_gen_pb2.Input instance. If unset,
                utils.make_default_input_config() is used.
            output_config: An example_gen_pb2.Output instance describing the
                output configuration. If unset, default splits will be
                'train' and 'eval' with size 2:1.
            example_artifacts: Optional channel of 'ExamplesPath' for output
                train and eval examples. If unset, a channel holding a single
                standard_artifacts.Examples artifact is created.
            instance_name: Optional unique instance name. Necessary if
                multiple components of this class are declared in the same
                pipeline.
        """
        # Fill in defaults for anything the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)
        if not example_artifacts:
            default_examples = [standard_artifacts.Examples()]
            example_artifacts = channel_utils.as_channel(default_examples)

        # Upload the Beam transform to the GCS bucket; the returned value is
        # what the spec receives as input_base.
        beam_transform_uri = upload_beam_to_gcs(beam_transform, bucket_name)

        spec_kwargs = dict(
            # custom parameters
            query=query,
            output_schema=output_schema,
            table_name=table_name,
            use_bigquery_source=use_bigquery_source,
            # default parameters
            input_config=input_config,
            output_config=output_config,
            input_base=beam_transform_uri,
            # outputs
            examples=example_artifacts)
        spec = TCGAPreprocessingSpec(**spec_kwargs)
        super(TCGAPreprocessing, self).__init__(
            spec=spec, instance_name=instance_name)
コード例 #20
0
ファイル: component.py プロジェクト: wendy2003888/tfx
    def __init__(
            self,
            input: types.Channel = None,  # pylint: disable=redefined-builtin
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            input_base: Optional[types.Channel] = None,
            instance_name: Optional[Text] = None,
            enable_cache: Optional[bool] = None):
        """Builds a FileBasedExampleGen component.

        Args:
          input: A Channel of type `standard_artifacts.ExternalArtifact`,
            which includes one artifact whose uri is an external directory
            containing the data files. Callers are expected to supply either
            this or `input_base`; that is not checked here.
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance describing the input configuration. If unset, the files
            under the input directory will be treated as a single dataset.
          output_config: An example_gen_pb2.Output instance describing the
            output configuration. If unset, default splits will be 'train'
            and 'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for the executor.
          example_artifacts: Channel of 'ExamplesPath' for output train and
            eval examples. If unset, a channel with a single Examples
            artifact carrying the encoded output split names is created.
          custom_executor_spec: Optional custom executor spec overriding the
            default executor spec specified in the component attribute.
          input_base: Backwards compatibility alias for the 'input' argument.
            When given, it replaces 'input' and a deprecation warning is
            logged.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.
          enable_cache: Optional boolean to indicate if cache is enabled for
            the FileBasedExampleGen component. If not specified, defaults to
            the value specified for pipeline's enable_cache parameter.
        """
        if input_base:
            deprecation_notice = (
                'The "input_base" argument to the ExampleGen component has '
                'been renamed to "input" and is deprecated. Please update your '
                'usage as support for this argument will be removed soon.')
            absl.logging.warning(deprecation_notice)
            # The legacy alias wins over any 'input' passed alongside it.
            input = input_base

        # Fill in defaults for anything the caller left out.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        if not example_artifacts:
            # A single Examples artifact records every output split name.
            artifact = standard_artifacts.Examples()
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            artifact.split_names = artifact_utils.encode_split_names(
                split_names)
            example_artifacts = channel_utils.as_channel([artifact])

        spec = FileBasedExampleGenSpec(
            input=input,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name,
            enable_cache=enable_cache)