Beispiel #1
0
    def __init__(self,
                 input_config: example_gen_pb2.Input,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 component_name: Optional[Text] = 'ExampleGen',
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Construct a QueryBasedExampleGen component.

        Args:
          input_config: An example_gen_pb2.Input instance, providing input
            configuration.
          output_config: An example_gen_pb2.Output instance, providing output
            configuration. If unset, a default is derived from `input_config`
            via `utils.make_default_output_config` (documented as 'train' and
            'eval' splits with size 2:1).
          component_name: Name of the component, should be unique per
            component class. Defaults to 'ExampleGen'; can be overwritten by
            sub-classes.
          example_artifacts: Optional channel of 'ExamplesPath' for output
            train and eval examples. If unset, one 'ExamplesPath' artifact is
            created per generated output split name.
          name: Unique name for every component class instance.
        """
        # Fall back to the default output configuration when none was given.
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Build one artifact per output split unless the caller supplied a
        # channel of their own.
        if not example_artifacts:
            split_artifacts = []
            for split_name in utils.generate_output_split_names(
                    input_config, output_config):
                split_artifacts.append(
                    types.TfxArtifact('ExamplesPath', split=split_name))
            example_artifacts = channel.as_channel(split_artifacts)

        component_spec = QueryBasedExampleGenSpec(
            component_name=component_name,
            input_config=input_config,
            output_config=output_config,
            examples=example_artifacts)
        super(_QueryBasedExampleGen, self).__init__(spec=component_spec,
                                                    name=name)
Beispiel #2
0
    def __init__(
            self,
            input: types.Channel = None,  # pylint: disable=redefined-builtin
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            input_base: Optional[types.Channel] = None,
            instance_name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Either `input` or `input_base` must be present in the input
        arguments. `input` is used when it is set; otherwise `input_base` is
        used in its place.

        Args:
          input: A Channel of type `standard_artifacts.ExternalArtifact`,
            which includes one artifact whose uri is an external directory
            containing the data files. _required_
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If unset, the files
            under the input directory will be treated as a single dataset.
          output_config: An example_gen_pb2.Output instance, providing the
            output configuration. If unset, default splits will be 'train'
            and 'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          example_artifacts: Channel of 'ExamplesPath' for output train and
            eval examples. If unset, one `standard_artifacts.Examples`
            artifact is created per generated output split name.
          custom_executor_spec: Optional custom executor spec overriding the
            default executor spec specified in the component attribute.
          input_base: Backwards compatibility alias for the 'input' argument.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.
        """
        # Honour the legacy alias only when the primary argument is missing.
        if not input:
            input = input_base

        # Fill in default input/output configurations where none were given.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Create one Examples artifact per output split unless a channel was
        # supplied by the caller.
        if not example_artifacts:
            split_artifacts = []
            for split_name in utils.generate_output_split_names(
                    input_config, output_config):
                split_artifacts.append(
                    standard_artifacts.Examples(split=str(split_name)))
            example_artifacts = channel_utils.as_channel(split_artifacts)

        component_spec = FileBasedExampleGenSpec(
            input_base=input,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=component_spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name)
Beispiel #3
0
  def testMakeOutputSplitNames(self):
    """Checks generated split names for input-split and output-split configs."""
    # Case 1: two input splits and an empty output config; the expected names
    # are those of the input splits.
    two_split_input = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train/*'),
        example_gen_pb2.Input.Split(name='eval', pattern='eval/*')
    ])
    names_from_input = utils.generate_output_split_names(
        input_config=two_split_input,
        output_config=example_gen_pb2.Output())
    self.assertListEqual(['train', 'eval'], names_from_input)

    # Case 2: a single input split that is divided by the output split config;
    # the expected names are those of the output splits.
    single_split_input = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='single', pattern='single/*')
    ])
    hashed_output = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=2),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ]))
    names_from_output = utils.generate_output_split_names(
        input_config=single_split_input, output_config=hashed_output)
    self.assertListEqual(['train', 'eval'], names_from_output)
Beispiel #4
0
  def testMakeOutputSplitNamesWithParameter(self):
    """Checks split-name generation when a split name is a RuntimeParameter."""
    split_name_param = data_types.RuntimeParameter(
        name='split-name', ptype=str, default=u'train')

    # Case 1: the parameterized name sits in a dict-style input config.
    parameterized_input = {
        'splits': [
            {'name': split_name_param, 'pattern': 'train/*'},
            {'name': 'eval', 'pattern': 'eval/*'},
        ]
    }
    names_from_input = utils.generate_output_split_names(
        input_config=parameterized_input,
        output_config=example_gen_pb2.Output())
    # Compare the JSON-serialized forms because RuntimeParameters only get
    # serialized at that point.
    expected_json = json_utils.dumps([split_name_param, 'eval'])
    actual_json = json_utils.dumps(names_from_input)
    self.assertEqual(expected_json, actual_json)

    # Case 2: the parameterized name sits in a dict-style output config.
    single_split_input = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='single', pattern='single/*')
    ])
    parameterized_output = {
        'split_config': {
            'splits': [
                {'name': split_name_param, 'hash_buckets': 2},
                {'name': 'eval', 'hash_buckets': 1},
            ]
        }
    }
    names_from_output = utils.generate_output_split_names(
        input_config=single_split_input,
        output_config=parameterized_output)
    # Same reasoning as above: compare the JSON-serialized forms.
    expected_json = json_utils.dumps([split_name_param, 'eval'])
    actual_json = json_utils.dumps(names_from_output)
    self.assertEqual(expected_json, actual_json)
Beispiel #5
0
    def __init__(self,
                 input_config: Union[example_gen_pb2.Input, Dict[Text, Any]],
                 output_config: Optional[Union[example_gen_pb2.Output,
                                               Dict[Text, Any]]] = None,
                 custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                               Dict[Text, Any]]] = None,
                 example_artifacts: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None,
                 enable_cache: Optional[bool] = None):
        """Construct a QueryBasedExampleGen component.

        Args:
          input_config: An
            [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing input configuration. If any field is provided
            as a RuntimeParameter, input_config should be constructed as a
            dict with the same field names as Input proto message. _required_
          output_config: An
            [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing output configuration. If unset, the default
            splits will be labeled as 'train' and 'eval' with a distribution
            ratio of 2:1. If any field is provided as a RuntimeParameter,
            output_config should be constructed as a dict with the same field
            names as Output proto message.
          custom_config: An
            [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance, providing custom configuration for ExampleGen. If any
            field is provided as a RuntimeParameter, custom_config should be
            constructed as a dict.
          example_artifacts: Channel of `standard_artifacts.Examples` for
            output train and eval examples. If unset, a single Examples
            artifact carrying the encoded output split names is created.
          instance_name: Optional unique instance name. Required only if
            multiple ExampleGen components are declared in the same pipeline.
          enable_cache: Optional boolean to indicate if cache is enabled for
            the QueryBasedExampleGen component. If not specified, defaults to
            the value specified for pipeline's enable_cache parameter.
        """
        # Fall back to the default output configuration when none was given.
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Without a caller-supplied channel, wrap one Examples artifact whose
        # split_names property lists every output split.
        if not example_artifacts:
            examples_artifact = standard_artifacts.Examples()
            output_split_names = utils.generate_output_split_names(
                input_config, output_config)
            examples_artifact.split_names = artifact_utils.encode_split_names(
                output_split_names)
            example_artifacts = channel_utils.as_channel([examples_artifact])

        component_spec = QueryBasedExampleGenSpec(
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(_QueryBasedExampleGen, self).__init__(
            spec=component_spec,
            instance_name=instance_name,
            enable_cache=enable_cache)
Beispiel #6
0
    def __init__(
            self,
            input_base: types.Channel = None,
            input_config: Optional[example_gen_pb2.Input] = None,
            output_config: Optional[example_gen_pb2.Output] = None,
            custom_config: Optional[example_gen_pb2.CustomConfig] = None,
            component_name: Optional[Text] = 'ExampleGen',
            example_artifacts: Optional[types.Channel] = None,
            executor_class: Optional[Type[base_executor.BaseExecutor]] = None,
            input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
            name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: A Channel of 'ExternalPath' type, which includes one
            artifact whose uri is an external directory with data files
            inside (required). When unset, `input` is used in its place.
          input_config: An optional example_gen_pb2.Input instance, providing
            input configuration. If unset, the files under input_base (must
            set) will be treated as a single split.
          output_config: An optional example_gen_pb2.Output instance,
            providing output configuration. If unset, default splits will be
            'train' and 'eval' with size 2:1.
          custom_config: An optional example_gen_pb2.CustomConfig instance,
            providing custom configuration for executor.
          component_name: Name of the component, should be unique per
            component class. Defaults to 'ExampleGen'; can be overwritten by
            sub-classes. This constructor does not forward it to the spec or
            to the base class.
          example_artifacts: Optional channel of 'ExamplesPath' for output
            train and eval examples. If unset, one
            `standard_artifacts.Examples` artifact is created per generated
            output split name.
          executor_class: Optional custom executor class overriding the
            default executor specified in the component attribute.
          input: Forwards compatibility alias for the 'input_base' argument.
          name: Unique name for every component class instance.
        """
        # Use the forward-compatible alias only when input_base is missing.
        if not input_base:
            input_base = input

        # Fill in default input/output configurations where none were given.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Create one Examples artifact per output split unless a channel was
        # supplied by the caller.
        if not example_artifacts:
            split_artifacts = []
            for split_name in utils.generate_output_split_names(
                    input_config, output_config):
                split_artifacts.append(
                    standard_artifacts.Examples(split=split_name))
            example_artifacts = channel_utils.as_channel(split_artifacts)

        component_spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=component_spec,
            custom_executor_class=executor_class,
            name=name)
Beispiel #7
0
    def _create_outputs(self) -> base_component.ComponentOutputs:
        """Creates outputs for ExampleGen.

        One 'ExamplesPath' artifact is created for every split name generated
        from `self._input_config` and `self._output_config`; the artifacts
        are exposed through a single 'examples' channel.

        Returns:
          ComponentOutputs object containing the dict of [Text -> Channel].
        """
        split_artifacts = []
        for split_name in utils.generate_output_split_names(
                self._input_config, self._output_config):
            split_artifacts.append(
                types.TfxArtifact('ExamplesPath', split=split_name))

        examples_channel = channel.Channel(
            type_name='ExamplesPath',
            static_artifact_collection=split_artifacts)
        return base_component.ComponentOutputs({'examples': examples_channel})
Beispiel #8
0
    def __init__(
            self,
            input_base: channel.Channel,
            input_config: Optional[example_gen_pb2.Input] = None,
            output_config: Optional[example_gen_pb2.Output] = None,
            example_artifacts: Optional[channel.Channel] = None,
            executor_class: Optional[Type[base_executor.BaseExecutor]] = None,
            name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: A Channel of 'ExternalPath' type, which includes one
            artifact whose uri is an external directory with data files
            inside.
          input_config: An optional example_gen_pb2.Input instance, providing
            input configuration. If unset, the files under input_base (must
            set) will be treated as a single split.
          output_config: An optional example_gen_pb2.Output instance,
            providing output configuration. If unset, default splits will be
            'train' and 'eval' with size 2:1.
          example_artifacts: Optional channel of 'ExamplesPath' for output
            train and eval examples. If unset, one 'ExamplesPath' artifact is
            created per generated output split name.
          executor_class: Optional custom executor class overriding the
            default executor specified in the component attribute.
          name: Unique name for every component class instance.
        """
        # Fill in default input/output configurations where none were given.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Create one artifact per output split unless a channel was supplied.
        if not example_artifacts:
            split_artifacts = []
            for split_name in utils.generate_output_split_names(
                    input_config, output_config):
                split_artifacts.append(
                    types.TfxArtifact('ExamplesPath', split=split_name))
            example_artifacts = channel.as_channel(split_artifacts)

        component_spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=component_spec,
            custom_executor_class=executor_class,
            name=name)
Beispiel #9
0
    def __init__(self,
                 input_base: channel.Channel,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 component_name: Optional[Text] = 'ExampleGen',
                 example_artifacts: Optional[channel.Channel] = None,
                 name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

        Args:
          input_base: A Channel of 'ExternalPath' type, which includes one
            artifact whose uri is an external directory with data files
            inside.
          input_config: An optional example_gen_pb2.Input instance, providing
            input configuration. If unset, the files under input_base (must
            set) will be treated as a single split.
          output_config: An optional example_gen_pb2.Output instance,
            providing output configuration. If unset, default splits will be
            'train' and 'eval' with size 2:1.
          component_name: Name of the component, should be unique per
            component class. Defaults to 'ExampleGen'; can be overwritten by
            sub-classes.
          example_artifacts: Optional channel of 'ExamplesPath' for output
            train and eval examples. If unset, one 'ExamplesPath' artifact is
            created per generated output split name.
          name: Unique name for every component class instance.
        """
        # Fill in default input/output configurations where none were given.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Create one artifact per output split unless a channel was supplied.
        if not example_artifacts:
            split_artifacts = []
            for split_name in utils.generate_output_split_names(
                    input_config, output_config):
                split_artifacts.append(
                    types.TfxArtifact('ExamplesPath', split=split_name))
            example_artifacts = channel.as_channel(split_artifacts)

        component_spec = FileBasedExampleGenSpec(
            component_name=component_name,
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            examples=example_artifacts)
        super(_FileBasedExampleGen, self).__init__(spec=component_spec,
                                                   name=name)
Beispiel #10
0
  def __init__(self,
               input_config: example_gen_pb2.Input,
               output_config: Optional[example_gen_pb2.Output] = None,
               custom_config: Optional[example_gen_pb2.CustomConfig] = None,
               example_artifacts: Optional[types.Channel] = None,
               instance_name: Optional[Text] = None):
    """Construct a QueryBasedExampleGen component.

    Args:
      input_config: An
        [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. _required_
      output_config: An
        [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing output configuration. If unset, the default splits
        will be labeled as 'train' and 'eval' with a distribution ratio of 2:1.
      custom_config: An
        [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing custom configuration for ExampleGen.
      example_artifacts: Channel of 'ExamplesPath' for output train and eval
        examples. If unset, one `standard_artifacts.Examples` artifact is
        created per generated output split name.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.
    """
    # Fall back to the default output configuration when none was given.
    if not output_config:
      output_config = utils.make_default_output_config(input_config)

    # Create one Examples artifact per output split unless a channel was
    # supplied by the caller.
    if not example_artifacts:
      split_artifacts = []
      for split_name in utils.generate_output_split_names(
          input_config, output_config):
        split_artifacts.append(standard_artifacts.Examples(split=split_name))
      example_artifacts = channel_utils.as_channel(split_artifacts)

    component_spec = QueryBasedExampleGenSpec(
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        examples=example_artifacts)
    super(_QueryBasedExampleGen, self).__init__(
        spec=component_spec, instance_name=instance_name)
Beispiel #11
0
  def GenerateExamplesByBeam(
      self,
      pipeline: beam.Pipeline,
      exec_properties: Dict[Text, Any],
  ) -> Dict[Text, beam.pvalue.PCollection]:
    """Converts input source to serialized record splits based on configs.

    Custom ExampleGen executors should provide
    GetInputSourceToExamplePTransform for converting an input split to
    serialized records. Override this 'GenerateExamplesByBeam' method instead
    if complex logic is needed, e.g., custom splitting logic.

    Args:
      pipeline: Beam pipeline.
      exec_properties: A dict of execution properties. Depends on detailed
        example gen implementation. This method adds a '_beam_pipeline_args'
        entry to it and forwards the whole dict to the input PTransform.
        - input_base: an external directory containing the data files.
        - input_config: JSON string of example_gen_pb2.Input instance, providing
          input configuration.
        - output_config: JSON string of example_gen_pb2.Output instance,
          providing output configuration.
        - output_data_format: Payload format of generated data in output
          artifact, one of example_gen_pb2.PayloadFormat enum.

    Returns:
      Dict of beam PCollection with split name as key, each PCollection is a
      single output split that contains serialized records.
    """
    # Parse the JSON-serialized input and output split configurations.
    input_config = example_gen_pb2.Input()
    json_format.Parse(exec_properties[utils.INPUT_CONFIG_KEY], input_config)
    output_config = example_gen_pb2.Output()
    json_format.Parse(exec_properties[utils.OUTPUT_CONFIG_KEY], output_config)
    split_names = utils.generate_output_split_names(input_config, output_config)
    # Make beam_pipeline_args available in exec_properties since certain
    # example_gen executors need this information.
    # TODO(b/155441037): Revisit necessity of this when BigQueryExampleGen
    # does not branch on runner anymore.
    exec_properties['_beam_pipeline_args'] = self._beam_pipeline_args or []

    input_to_record = self.GetInputSourceToExamplePTransform()
    output_splits = output_config.split_config.splits
    if not output_splits:
      # No output splits configured: emit one PCollection per input split.
      example_splits = [
          (pipeline
           | 'InputToRecord[{}]'.format(input_split.name) >>
           # pylint: disable=no-value-for-parameter
           input_to_record(exec_properties, input_split.pattern))
          for input_split in input_config.splits
      ]
    else:
      # Output splits configured: the single input split is partitioned.
      assert len(
          input_config.splits
      ) == 1, 'input must have only one split when output split is specified.'
      # Cumulative hash-bucket boundaries, one entry per output split.
      bucket_bounds = []
      running_total = 0
      for output_split in output_splits:
        running_total += output_split.hash_buckets
        bucket_bounds.append(running_total)
      records = (
          pipeline
          | 'InputToRecord' >>
          # pylint: disable=no-value-for-parameter
          input_to_record(exec_properties, input_config.splits[0].pattern))
      example_splits = (
          records
          | 'SplitData' >> beam.Partition(_PartitionFn, len(bucket_bounds),
                                          bucket_bounds,
                                          output_config.split_config))

    # Pair each produced split with its name by position.
    return {
        split_names[position]: example_split
        for position, example_split in enumerate(example_splits)
    }
Beispiel #12
0
  def Do(
      self,
      input_dict: Dict[Text, List[types.Artifact]],
      output_dict: Dict[Text, List[types.Artifact]],
      exec_properties: Dict[Text, Any],
  ) -> None:
    """Takes an input data source and generates serialized data splits.

    The output is intended to be serialized tf.train.Examples or
    tf.train.SequenceExamples protocol buffer in gzipped TFRecord format,
    but subclasses can choose to override to write to any serialized records
    payload into gzipped TFRecord as specified, so long as downstream
    component can consume it. The format of payload is added to
    `payload_format` custom property of the output Example artifact.

    Args:
      input_dict: Input dict from input key to a list of Artifacts. Depends on
        detailed example gen implementation.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: splits of serialized records.
      exec_properties: A dict of execution properties. Depends on detailed
        example gen implementation.
        - input_base: an external directory containing the data files.
        - input_config: JSON string of example_gen_pb2.Input instance,
          providing input configuration.
        - output_config: JSON string of example_gen_pb2.Output instance,
          providing output configuration.
        - output_data_format: Payload format of generated data in output
          artifact, one of example_gen_pb2.PayloadFormat enum.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # Parse the JSON-serialized input and output configurations.
    input_config = example_gen_pb2.Input()
    proto_utils.json_to_proto(exec_properties[utils.INPUT_CONFIG_KEY],
                              input_config)
    output_config = example_gen_pb2.Output()
    proto_utils.json_to_proto(exec_properties[utils.OUTPUT_CONFIG_KEY],
                              output_config)

    # Record the output split names on the single Examples artifact.
    output_examples = output_dict[utils.EXAMPLES_KEY]
    examples_artifact = artifact_utils.get_single_instance(output_examples)
    output_split_names = utils.generate_output_split_names(
        input_config, output_config)
    examples_artifact.split_names = artifact_utils.encode_split_names(
        output_split_names)

    logging.info('Generating examples.')
    with self._make_beam_pipeline() as pipeline:
      example_splits = self.GenerateExamplesByBeam(pipeline, exec_properties)

      # Attach one write step per generated split, targeting that split's uri.
      # pylint: disable=no-value-for-parameter
      for split_name, example_split in example_splits.items():
        split_uri = artifact_utils.get_split_uri(output_examples, split_name)
        _ = (
            example_split
            | 'WriteSplit[{}]'.format(split_name) >> _WriteSplit(split_uri))
      # pylint: enable=no-value-for-parameter

    # Tag every output artifact with the payload format when one is given.
    output_payload_format = exec_properties.get(utils.OUTPUT_DATA_FORMAT_KEY)
    if output_payload_format:
      for output_examples_artifact in output_examples:
        examples_utils.set_payload_format(
            output_examples_artifact, output_payload_format)
    logging.info('Examples generated.')
Beispiel #13
0
    def GenerateExamplesByBeam(
        self, pipeline: beam.Pipeline, input_dict: Dict[Text,
                                                        List[types.Artifact]],
        exec_properties: Dict[Text,
                              Any]) -> Dict[Text, beam.pvalue.PCollection]:
        """Converts input source to TF example splits based on configs.

        Custom ExampleGen executors should provide
        GetInputSourceToExamplePTransform for converting an input split to TF
        Examples. Override this 'GenerateExamplesByBeam' method instead if
        complex logic is needed, e.g., custom splitting logic.

        Args:
          pipeline: beam pipeline.
          input_dict: Input dict from input key to a list of Artifacts.
            Depends on detailed example gen implementation.
          exec_properties: A dict of execution properties. Depends on detailed
            example gen implementation.
            - input_config: JSON string of example_gen_pb2.Input instance,
              providing input configuration.
            - output_config: JSON string of example_gen_pb2.Output instance,
              providing output configuration.

        Returns:
          Dict of beam PCollection with split name as key, each PCollection is
          a single output split that contains serialized TF Examples.
        """
        # Parse the JSON-serialized input and output split configurations.
        input_config = example_gen_pb2.Input()
        json_format.Parse(exec_properties['input_config'], input_config)
        output_config = example_gen_pb2.Output()
        json_format.Parse(exec_properties['output_config'], output_config)
        split_names = utils.generate_output_split_names(
            input_config, output_config)

        input_to_example = self.GetInputSourceToExamplePTransform()
        output_splits = output_config.split_config.splits
        if not output_splits:
            # No output splits configured: emit one PCollection per input
            # split.
            example_splits = [
                (pipeline
                 | 'InputToSerializedExample' + input_split.name >>
                 _InputToSerializedExample(  # pylint: disable=no-value-for-parameter
                     input_to_example, input_dict, exec_properties,
                     input_split.pattern))
                for input_split in input_config.splits
            ]
        else:
            # Output splits configured: the single input split is partitioned.
            assert len(
                input_config.splits
            ) == 1, 'input must have only one split when output split is specified.'
            # Cumulative hash-bucket boundaries, one entry per output split.
            bucket_bounds = []
            running_total = 0
            for output_split in output_splits:
                running_total += output_split.hash_buckets
                bucket_bounds.append(running_total)
            serialized_examples = (
                pipeline
                | 'InputToSerializedExample' >> _InputToSerializedExample(  # pylint: disable=no-value-for-parameter
                    input_to_example, input_dict, exec_properties,
                    input_config.splits[0].pattern))
            example_splits = (
                serialized_examples
                | 'SplitData' >> beam.Partition(
                    _PartitionFn, len(bucket_bounds), bucket_bounds))

        # Pair each produced split with its name by position.
        return {
            split_names[position]: example_split
            for position, example_split in enumerate(example_splits)
        }
Beispiel #14
0
    def __init__(
            self,
            input: types.Channel = None,  # pylint: disable=redefined-builtin
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            input_base: Optional[types.Channel] = None,
            instance_name: Optional[Text] = None,
            enable_cache: Optional[bool] = None):
        """Build a FileBasedExampleGen component.

        Callers are expected to supply either `input` or its deprecated alias
        `input_base`; this constructor itself does not check that one of them
        is present.

        Args:
          input: A Channel of type `standard_artifacts.ExternalArtifact`
            holding one artifact whose uri is an external directory with the
            data files. Required unless `input_base` is given.
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance (or dict) with the input configuration. When falsy, the
            result of `utils.make_default_input_config()` is used, i.e. the
            files under the input directory are treated as a single dataset.
          output_config: An `example_gen_pb2.Output` instance (or dict) with
            the output configuration. When falsy, it is derived from
            `input_config` via `utils.make_default_output_config`; the default
            splits are 'train' and 'eval' with size 2:1.
          custom_config: Optional `example_gen_pb2.CustomConfig` instance (or
            dict) carrying custom configuration for the executor.
          example_artifacts: Optional output Channel for the generated
            examples. When falsy, a channel wrapping a single
            `standard_artifacts.Examples` artifact is created, with its
            `split_names` set from the input/output configuration.
          custom_executor_spec: Optional executor spec that overrides the
            default one declared on the component class.
          input_base: Deprecated, backwards-compatible alias for `input`. When
            truthy it replaces `input` and a deprecation warning is logged.
          instance_name: Optional unique instance name. Only needed when
            several ExampleGen components live in the same pipeline.
          enable_cache: Optional flag controlling caching for this component.
            If not specified, the pipeline-level enable_cache value applies.
        """
        if input_base:
            # Legacy argument name: still honoured, but nudge callers to move.
            absl.logging.warning(
                'The "input_base" argument to the ExampleGen component has '
                'been renamed to "input" and is deprecated. Please update your '
                'usage as support for this argument will be removed soon.')
            input = input_base

        # Fill in whichever configurations the caller left out. The output
        # default depends on the (possibly defaulted) input configuration, so
        # the input side has to be resolved first.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Without a caller-supplied output channel, create a single Examples
        # artifact that advertises the split names implied by the configs.
        if not example_artifacts:
            examples = standard_artifacts.Examples()
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            examples.split_names = artifact_utils.encode_split_names(
                split_names)
            example_artifacts = channel_utils.as_channel([examples])

        component_spec = FileBasedExampleGenSpec(
            input=input,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=component_spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name,
            enable_cache=enable_cache)
Beispiel #15
0
    def __init__(
            self,
            # TODO(b/159467778): deprecate this, use input_base instead.
            input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
            input_base: Optional[Text] = None,
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            instance_name: Optional[Text] = None):
        """Build a FileBasedExampleGen component.

        Args:
          input: Deprecated in favour of `input_base`. A Channel of type
            `standard_artifacts.ExternalArtifact` holding one artifact whose
            uri is an external directory with the data files. When truthy, the
            single uri extracted from it overrides `input_base` and a
            deprecation warning is logged.
          input_base: An external directory containing the data files.
          input_config: An
            [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
            instance (or dict) with the input configuration. When falsy, the
            result of `utils.make_default_input_config()` is used, i.e. input
            files are treated as a single split.
          output_config: An `example_gen_pb2.Output` instance (or dict) with
            the output configuration. When falsy, it is derived from
            `input_config` via `utils.make_default_output_config`; the default
            splits are 'train' and 'eval' with size 2:1.
          custom_config: Optional `example_gen_pb2.CustomConfig` instance (or
            dict) carrying custom configuration for the executor.
          output_data_format: Payload format of the generated data in the
            output artifact, one of the `example_gen_pb2.PayloadFormat` enum
            values. Defaults to `example_gen_pb2.FORMAT_TF_EXAMPLE`.
          example_artifacts: Optional output Channel for the generated
            examples. When falsy, a channel wrapping a single
            `standard_artifacts.Examples` artifact is created, with its
            `split_names` set from the input/output configuration.
          custom_executor_spec: Optional executor spec that overrides the
            default one declared on the component class.
          instance_name: Optional unique instance name. Only needed when
            several ExampleGen components live in the same pipeline.
        """
        if input:
            # Legacy channel-based argument: warn, then reduce it to the plain
            # directory uri that the newer `input_base` parameter carries.
            logging.warning(
                'The "input" argument to the ExampleGen component has been '
                'deprecated by "input_base". Please update your usage as support for '
                'this argument will be removed soon.')
            external_artifacts = list(input.get())
            input_base = artifact_utils.get_single_uri(external_artifacts)

        # Fill in whichever configurations the caller left out. The output
        # default depends on the (possibly defaulted) input configuration, so
        # the input side has to be resolved first.
        if not input_config:
            input_config = utils.make_default_input_config()
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Without a caller-supplied output channel, create a single Examples
        # artifact that advertises the split names implied by the configs.
        if not example_artifacts:
            examples = standard_artifacts.Examples()
            split_names = utils.generate_output_split_names(
                input_config, output_config)
            examples.split_names = artifact_utils.encode_split_names(
                split_names)
            example_artifacts = channel_utils.as_channel([examples])

        component_spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            output_data_format=output_data_format,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=component_spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name)