Esempio n. 1
0
    def __init__(self,
                 examples: types.Channel = None,
                 output: Optional[types.Channel] = None,
                 input_data: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Build a StatisticsGen component from an examples channel.

        Args:
          examples: A Channel of `ExamplesPath` type, likely generated by the
            [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen).
            This needs to contain two splits labeled `train` and `eval`.
            _required_ (either directly or through `input_data`).
          output: `ExampleStatisticsPath` channel for statistics of each split
            provided in the input examples. When falsy, a channel wrapping a
            single `ExampleStatistics` artifact is created, with its
            `split_names` set to the encoded `artifact.DEFAULT_EXAMPLE_SPLITS`.
          input_data: Backwards compatibility alias for `examples`; it is only
            used when `examples` is falsy.
          instance_name: Optional name assigned to this specific instance of
            StatisticsGen.  Required only if multiple StatisticsGen components
            are declared in the same pipeline.
        """
        # Prefer the new argument name and fall back to the legacy alias.
        source_channel = examples if examples else input_data

        if output:
            statistics_channel = output
        else:
            # No channel supplied: create one artifact covering the default
            # example splits.
            default_statistics = standard_artifacts.ExampleStatistics()
            encoded_splits = artifact_utils.encode_split_names(
                artifact.DEFAULT_EXAMPLE_SPLITS)
            default_statistics.split_names = encoded_splits
            statistics_channel = types.Channel(
                type=standard_artifacts.ExampleStatistics,
                artifacts=[default_statistics])

        component_spec = StatisticsGenSpec(input_data=source_channel,
                                           output=statistics_channel)
        super(StatisticsGen, self).__init__(
            spec=component_spec,
            instance_name=instance_name)
Esempio n. 2
0
  def __init__(self,
               examples: types.Channel,
               schema: Optional[types.Channel] = None,
               stats_options: Optional[tfdv.StatsOptions] = None,
               exclude_splits: Optional[List[Text]] = None):
    """Build a StatisticsGen component.

    Args:
      examples: A Channel of `ExamplesPath` type, likely generated by the
        [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen).
        This needs to contain two splits labeled `train` and `eval`. _required_
      schema: A `Schema` channel to use for automatically configuring the value
        of stats options passed to TFDV.
      stats_options: The StatsOptions instance to configure optional TFDV
        behavior. It is forwarded to the spec in its `to_json()` form (or as
        None when falsy). When stats_options.schema is set, it will be used
        instead of the `schema` channel input. Due to the requirement that
        stats_options be serialized, the slicer functions and custom stats
        generators are dropped and are therefore not usable.
      exclude_splits: Names of splits where statistics and sample should not
        be generated. None (the default) is treated as an empty list, i.e. no
        split is excluded; the list is passed to the spec JSON-encoded.
    """
    if exclude_splits is None:
      logging.info('Excluding no splits because exclude_splits is not set.')
      exclude_splits = []

    # The output channel is always created here; callers cannot supply one.
    statistics_channel = types.Channel(
        type=standard_artifacts.ExampleStatistics)

    # TODO(b/150802589): Move jsonable interface to tfx_bsl and use json_utils.
    if stats_options:
      serialized_options = stats_options.to_json()
    else:
      serialized_options = None

    encoded_excludes = json_utils.dumps(exclude_splits)
    component_spec = StatisticsGenSpec(
        examples=examples,
        schema=schema,
        stats_options_json=serialized_options,
        exclude_splits=encoded_excludes,
        statistics=statistics_channel)
    super(StatisticsGen, self).__init__(spec=component_spec)
Esempio n. 3
0
  def __init__(self,
               input_data: types.Channel = None,
               output: Optional[types.Channel] = None,
               examples: Optional[types.Channel] = None,
               name: Optional[Text] = None):
    """Build a StatisticsGen component.

    Args:
      input_data: A Channel of 'ExamplesPath' type. This should contain two
        splits 'train' and 'eval' (required, either directly or through
        `examples`).
      output: Optional 'ExampleStatisticsPath' channel for statistics of each
        split provided in input examples. When falsy, a channel is created
        holding one `ExampleStatistics` artifact per entry of
        `artifact.DEFAULT_EXAMPLE_SPLITS`.
      examples: Forwards compatibility alias for the 'input_data' argument;
        only used when `input_data` is falsy.
      name: Optional unique name. Necessary iff multiple StatisticsGen
        components are declared in the same pipeline.
    """
    # Fall back to the forward-compatible alias when the primary is missing.
    if not input_data:
      input_data = examples

    if not output:
      # One statistics artifact per default split.
      per_split_statistics = [
          standard_artifacts.ExampleStatistics(split=split_name)
          for split_name in artifact.DEFAULT_EXAMPLE_SPLITS
      ]
      output = types.Channel(
          type=standard_artifacts.ExampleStatistics,
          artifacts=per_split_statistics)

    component_spec = StatisticsGenSpec(input_data=input_data, output=output)
    super(StatisticsGen, self).__init__(spec=component_spec, name=name)
Esempio n. 4
0
    def __init__(self,
                 examples: types.Channel = None,
                 schema: Optional[types.Channel] = None,
                 stats_options: Optional[tfdv.StatsOptions] = None,
                 exclude_splits: Optional[List[Text]] = None,
                 output: Optional[types.Channel] = None,
                 input_data: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Build a StatisticsGen component.

        Args:
          examples: A Channel of `ExamplesPath` type, likely generated by the
            [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen).
            This needs to contain two splits labeled `train` and `eval`.
            _required_
          schema: A `Schema` channel to use for automatically configuring the
            value of stats options passed to TFDV.
          stats_options: The StatsOptions instance to configure optional TFDV
            behavior. It is forwarded to the spec in its `to_json()` form (or
            as None when falsy). When stats_options.schema is set, it will be
            used instead of the `schema` channel input. Due to the requirement
            that stats_options be serialized, the slicer functions and custom
            stats generators are dropped and are therefore not usable.
          exclude_splits: Names of splits where statistics and sample should
            not be generated. None (the default) is treated as an empty list,
            i.e. no split is excluded; the list is passed to the spec
            JSON-encoded.
          output: `ExampleStatisticsPath` channel for statistics of each split
            provided in the input examples. When falsy, a channel is built
            with `channel_utils.as_channel` from one new `ExampleStatistics`
            artifact.
          input_data: Deprecated alias for `examples`. When truthy it replaces
            `examples` and a deprecation warning is logged.
          instance_name: Optional name assigned to this specific instance of
            StatisticsGen.  Required only if multiple StatisticsGen components
            are declared in the same pipeline.
        """
        if input_data:
            # The legacy alias wins over `examples` whenever it is supplied.
            examples = input_data
            logging.warning(
                'The "input_data" argument to the StatisticsGen component has '
                'been renamed to "examples" and is deprecated. Please update your '
                'usage as support for this argument will be removed soon.')

        if exclude_splits is None:
            logging.info(
                'Excluding no splits because exclude_splits is not set.')
            exclude_splits = []

        if not output:
            default_statistics = standard_artifacts.ExampleStatistics()
            output = channel_utils.as_channel([default_statistics])

        # TODO(b/150802589): Move jsonable interface to tfx_bsl and use json_utils.
        if stats_options:
            serialized_options = stats_options.to_json()
        else:
            serialized_options = None

        encoded_excludes = json_utils.dumps(exclude_splits)
        component_spec = StatisticsGenSpec(
            examples=examples,
            schema=schema,
            stats_options_json=serialized_options,
            exclude_splits=encoded_excludes,
            statistics=output)
        super(StatisticsGen, self).__init__(
            spec=component_spec,
            instance_name=instance_name)
Esempio n. 5
0
    def __init__(self,
                 examples: Optional[types.Channel] = None,
                 schema: Optional[types.Channel] = None,
                 stats_options: Optional[tfdv.StatsOptions] = None,
                 output: Optional[types.Channel] = None,
                 input_data: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None,
                 enable_cache: Optional[bool] = None):
        """Construct a StatisticsGen component.

        Args:
          examples: A Channel of `ExamplesPath` type, likely generated by the
            [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen).
            This needs to contain two splits labeled `train` and `eval`.
            _required_
          schema: A `Schema` channel to use for automatically configuring the
            value of stats options passed to TFDV.
          stats_options: The StatsOptions instance to configure optional TFDV
            behavior. It is forwarded to the spec in its `to_json()` form (or
            as None when falsy). When stats_options.schema is set, it will be
            used instead of the `schema` channel input. Due to the requirement
            that stats_options be serialized, the slicer functions and custom
            stats generators are dropped and are therefore not usable.
          output: `ExampleStatisticsPath` channel for statistics of each split
            provided in the input examples. When falsy, a channel is created
            around one new `ExampleStatistics` artifact whose `split_names`
            are copied from the single artifact in `examples`.
          input_data: Deprecated alias for `examples`. When truthy it replaces
            `examples` and a deprecation warning is logged.
          instance_name: Optional name assigned to this specific instance of
            StatisticsGen.  Required only if multiple StatisticsGen components
            are declared in the same pipeline.
          enable_cache: Optional boolean to indicate if cache is enabled for
            the StatisticsGen component. If not specified, defaults to the
            value specified for pipeline's enable_cache parameter.

        Raises:
          ValueError: If `output` is not supplied and neither `examples` nor
            `input_data` is given, so there is no examples channel to read
            the split names from.
        """
        if input_data:
            absl.logging.warning(
                'The "input_data" argument to the StatisticsGen component has '
                'been renamed to "examples" and is deprecated. Please update your '
                'usage as support for this argument will be removed soon.')
            examples = input_data
        if not output:
            # The default output mirrors the split names of the input
            # examples, so the examples channel is dereferenced here. Fail
            # with a clear message instead of an AttributeError on None.
            if examples is None:
                raise ValueError(
                    'StatisticsGen requires an "examples" channel (or the '
                    'deprecated "input_data" alias) to derive the split names '
                    'of its default output.')
            statistics_artifact = standard_artifacts.ExampleStatistics()
            statistics_artifact.split_names = artifact_utils.get_single_instance(
                list(examples.get())).split_names
            output = types.Channel(type=standard_artifacts.ExampleStatistics,
                                   artifacts=[statistics_artifact])
        # TODO(b/150802589): Move jsonable interface to tfx_bsl and use json_utils.
        stats_options_json = stats_options.to_json() if stats_options else None
        spec = StatisticsGenSpec(examples=examples,
                                 schema=schema,
                                 stats_options_json=stats_options_json,
                                 statistics=output)
        super(StatisticsGen, self).__init__(spec=spec,
                                            instance_name=instance_name,
                                            enable_cache=enable_cache)