Ejemplo n.º 1
0
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Runs hyperparameter search and writes out the best hyperparameters.

    Builds a tuner via the user-supplied tuner_fn, runs the KerasTuner search
    over the train/eval splits, and serializes the best trial's
    hyperparameters as JSON into the output artifact.
    """
    # KerasTuner generates tuning state (e.g., oracle, trials) to working dir.
    working_dir = self._get_tmp_dir()

    examples = input_dict['examples']
    train_path = artifact_utils.get_split_uri(examples, 'train')
    eval_path = artifact_utils.get_split_uri(examples, 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict['schema']))
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

    tuner_spec = self._GetTunerFn(exec_properties)(
        working_dir,
        io_utils.all_files_pattern(train_path),
        io_utils.all_files_pattern(eval_path),
        schema)
    tuner = tuner_spec.tuner

    tuner.search_space_summary()
    # TODO(jyzhao): assert v2 behavior as KerasTuner doesn't work in v1.
    # TODO(jyzhao): make epochs configurable.
    tuner.search(
        tuner_spec.train_dataset,
        epochs=5,
        validation_data=tuner_spec.eval_dataset)
    tuner.results_summary()

    # Persist the best trial's hyperparameters as a JSON file.
    best_trial = tuner.oracle.get_best_trials(1)[0]
    best_hparams = best_trial.hyperparameters.get_config()
    best_hparams_path = os.path.join(
        artifact_utils.get_single_uri(output_dict['study_best_hparams_path']),
        _DEFAULT_FILE_NAME)
    io_utils.write_string_file(best_hparams_path, json.dumps(best_hparams))
    absl.logging.info('Best HParams is written to %s.' % best_hparams_path)
Ejemplo n.º 2
0
def get_common_fn_args(input_dict: Dict[Text, List[types.Artifact]],
                       exec_properties: Dict[Text, Any],
                       working_dir: Text = None) -> FnArgs:
  """Get common args of training and tuning."""

  def _split_file_patterns(split_name):
    # File pattern covering all files of one split of the examples artifact.
    return [
        io_utils.all_files_pattern(
            artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                         split_name))
    ]

  train_files = _split_file_patterns('train')
  eval_files = _split_file_patterns('eval')

  transform_graph_path = None
  if input_dict.get(constants.TRANSFORM_GRAPH_KEY):
    transform_graph_path = artifact_utils.get_single_uri(
        input_dict[constants.TRANSFORM_GRAPH_KEY])

  schema_path = None
  if input_dict.get(constants.SCHEMA_KEY):
    schema_path = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[constants.SCHEMA_KEY]))

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties[constants.TRAIN_ARGS_KEY], train_args)
  json_format.Parse(exec_properties[constants.EVAL_ARGS_KEY], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None.  Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here.  Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  # TODO(b/156929910): Refactor Trainer to be consistent with empty or None
  #                    custom_config handling.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null'))

  return FnArgs(
      working_dir=working_dir,
      train_files=train_files,
      eval_files=eval_files,
      train_steps=train_steps,
      eval_steps=eval_steps,
      schema_path=schema_path,
      transform_graph_path=transform_graph_path,
      custom_config=custom_config,
  )
Ejemplo n.º 3
0
 def testGetFromSingleList(self):
   """Test various retrieval utilities on a single list of Artifact."""
   example = standard_artifacts.Examples()
   example.uri = '/tmp/evaluri'
   example.split_names = '["eval"]'
   artifacts = [example]
   self.assertEqual(example, artifact_utils.get_single_instance(artifacts))
   self.assertEqual('/tmp/evaluri', artifact_utils.get_single_uri(artifacts))
   self.assertEqual('/tmp/evaluri/eval',
                    artifact_utils.get_split_uri(artifacts, 'eval'))
   # Requesting a split that is not present must raise.
   with self.assertRaises(ValueError):
     artifact_utils.get_split_uri(artifacts, 'train')
Ejemplo n.º 4
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Copy the input_data to the output_data.

        For this example that is all that the Executor does.  For a different
        custom component, this is where the real functionality of the
        component would be included.

        This component both reads and writes Examples, but a different
        component might read and write artifacts of other types.

        Args:
          input_dict: Input dict from input key to a list of artifacts,
            including:
            - input_data: A list of type `standard_artifacts.Examples` which
              will often contain two splits, 'train' and 'eval'.
          output_dict: Output dict from key to a list of artifacts, including:
            - output_data: A list of type `standard_artifacts.Examples` which
              will usually contain the same splits as input_data.
          exec_properties: A dict of execution properties, including:
            - name: Optional unique name. Necessary iff multiple Hello
              components are declared in the same pipeline.

        Returns:
          None

        Raises:
          OSError and its subclasses
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        input_artifact = artifact_utils.get_single_instance(
            input_dict['input_data'])
        output_artifact = artifact_utils.get_single_instance(
            output_dict['output_data'])
        # The output mirrors the input's split layout exactly.
        output_artifact.split_names = input_artifact.split_names

        # Copy every file of every split from the input artifact into the
        # corresponding split directory of the output artifact.
        for split in json.loads(input_artifact.split_names):
            input_dir = artifact_utils.get_split_uri([input_artifact], split)
            output_dir = artifact_utils.get_split_uri([output_artifact], split)
            for filename in tf.io.gfile.listdir(input_dir):
                io_utils.copy_file(
                    src=os.path.join(input_dir, filename),
                    dst=os.path.join(output_dir, filename),
                    overwrite=True)
Ejemplo n.º 5
0
  def testGetFromSplits(self):
    """Test various retrieval utilities on a list of split Artifact."""
    example = standard_artifacts.Examples()
    example.uri = '/tmp'
    example.split_names = artifact_utils.encode_split_names(
        ['train', 'eval'])
    artifacts = [example]

    self.assertEqual(example.split_names, '["train", "eval"]')

    self.assertIs(artifact_utils.get_single_instance(artifacts), example)
    self.assertEqual('/tmp', artifact_utils.get_single_uri(artifacts))
    # Each split resolves to its own sub-directory of the artifact URI.
    for split in ('train', 'eval'):
      self.assertEqual('/tmp/' + split,
                       artifact_utils.get_split_uri(artifacts, split))
Ejemplo n.º 6
0
 def test_get_from_single_list(self):
   """Test various retrieval utilities on a single list of Artifact."""
   instance = artifact.Artifact('MyTypeName', split='eval')
   instance.uri = '/tmp/evaluri'
   single_list = [instance]
   self.assertEqual(instance,
                    artifact_utils.get_single_instance(single_list))
   self.assertEqual('/tmp/evaluri', artifact_utils.get_single_uri(single_list))
   self.assertEqual(instance,
                    artifact_utils._get_split_instance(single_list, 'eval'))
   self.assertEqual('/tmp/evaluri',
                    artifact_utils.get_split_uri(single_list, 'eval'))
   # Lookups for a split that does not exist must raise.
   with self.assertRaises(ValueError):
     artifact_utils._get_split_instance(single_list, 'train')
   with self.assertRaises(ValueError):
     artifact_utils.get_split_uri(single_list, 'train')
Ejemplo n.º 7
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Reads data from a user-defined source and writes a single split.

        Args:
            input_dict: Input artifact dict; not read by this executor.
            output_dict: Output artifact dict; `DATA_SPLIT_NAME` keys the
                Examples artifact that receives the written records.
            exec_properties: Execution properties; `StepKeys.SOURCE` is the
                importable path of a `BaseDataStep` subclass and
                `StepKeys.ARGS` its constructor kwargs.
        """
        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]

        # Instantiate the user-provided data step from its source path.
        c = source_utils.load_source_path_class(source)
        data_step: BaseDataStep = c(**args)

        # Get output split path
        examples_artifact = artifact_utils.get_single_instance(
            output_dict[DATA_SPLIT_NAME])
        # The output artifact carries exactly one split, named after
        # DATA_SPLIT_NAME.
        split_names = [DATA_SPLIT_NAME]
        examples_artifact.split_names = artifact_utils.encode_split_names(
            split_names)
        output_split_path = artifact_utils.get_split_uri([examples_artifact],
                                                         DATA_SPLIT_NAME)

        # Read from the source and write TFRecords into the single split.
        with self._make_beam_pipeline() as p:
            (p
             | data_step.read_from_source()
             # | data_step.convert_to_dict()
             | WriteToTFRecord(data_step.schema, output_split_path))
Ejemplo n.º 8
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Validates the provided schema against computed statistics."""
        self._log_startup(input_dict, output_dict, exec_properties)
        logging.info('Validating schema against the computed statistics.')

        # Collect every split name present across the statistics artifacts.
        split_uris: List[Text] = [
            split
            for stats_artifact in input_dict[executor.STATISTICS_KEY]
            for split in artifact_utils.decode_split_names(
                stats_artifact.split_names)
        ]

        # Only the first split's statistics are loaded for validation.
        stats = tfdv.load_statistics(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_split_uri(
                    input_dict[executor.STATISTICS_KEY], split_uris[0])))
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(
                    input_dict[executor.SCHEMA_KEY])))
        label_inputs = {labels.STATS: stats, labels.SCHEMA: schema}

        output_uri = artifact_utils.get_single_uri(
            output_dict[executor.ANOMALIES_KEY])
        self._Validate(label_inputs, {labels.SCHEMA_DIFF_PATH: output_uri})
        logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
Ejemplo n.º 9
0
    def _provide_schema(self, input_dict,
                        exec_properties) -> schema_pb2.Schema:
        """Generates schema from either schema or statistics.

        Exactly one of the 'schema' input or the 'stats'/'statistics' input
        must be provided. When a schema is given it is read from the artifact
        URI; otherwise a schema is inferred from the 'train' split statistics.

        Args:
          input_dict: Input artifact dict with either 'schema' or
            'stats'/'statistics'.
          exec_properties: Execution properties; 'infer_feature_shape'
            controls schema inference.

        Returns:
          The provided or inferred schema_pb2.Schema.

        Raises:
          ValueError: If neither or both inputs are provided, or if the
            provided schema file cannot be read.
        """
        # TODO(zhitaoli): Move constants between this file and component.py to a
        # constants.py.
        stats = input_dict.get('stats') or input_dict.get('statistics')
        schema = input_dict.get('schema')

        # XOR check: exactly one of the two inputs must be present.
        if bool(stats) == bool(schema):
            raise ValueError(
                'Exactly only one of schema or stats must be provided')

        if schema:
            schema_uri = artifact_utils.get_single_uri(schema)
            absl.logging.info('Schema is provided. Reading from %s.' %
                              schema_uri)
            schema_reader = io_utils.SchemaReader()
            try:
                return schema_reader.read(
                    os.path.join(schema_uri, _DEFAULT_FILE_NAME))
            except tf.errors.NotFoundError as e:
                # Chain the original error so the root cause is preserved
                # in the traceback.
                raise ValueError(
                    'Schema is provided, but failed to read from %s.' %
                    schema_uri) from e

        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(stats, 'train'))
        infer_feature_shape = exec_properties['infer_feature_shape']
        return tfdv.infer_schema(tfdv.load_statistics(train_stats_uri),
                                 infer_feature_shape)
Ejemplo n.º 10
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

        This infers the schema using tensorflow_data_validation on the
        precomputed stats of 'train' split.

        Args:
          input_dict: Input dict from input key to a list of artifacts,
            including:
            - 'stats': A list of 'ExampleStatistics' type which must contain
              split 'train'. Stats on other splits are ignored.
            - 'statistics': Synonym for 'stats'.
          output_dict: Output dict from key to a list of artifacts, including:
            - output: A list of 'Schema' artifact of size one.
          exec_properties: A dict of execution properties, includes:
            - infer_feature_shape: Whether or not to infer the shape of the
              feature.

        Returns:
          None
        """
        # TODO(zhitaoli): Move constants between this file and component.py to
        # a constants.py.
        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(input_dict['stats'], 'train'))
        output_uri = os.path.join(
            artifact_utils.get_single_uri(output_dict['output']),
            _DEFAULT_FILE_NAME)

        absl.logging.info('Infering schema from statistics.')
        schema = tfdv.infer_schema(
            tfdv.load_statistics(train_stats_uri),
            exec_properties['infer_feature_shape'])
        # The schema is written as a single pbtxt file into the output
        # artifact directory.
        io_utils.write_pbtxt_file(output_uri, schema)
        absl.logging.info('Schema written to %s.' % output_uri)
Ejemplo n.º 11
0
    def _run_model_inference(
        self,
        output_example_spec: bulk_inferrer_pb2.OutputExampleSpec,
        examples: List[types.Artifact],
        output_examples: Optional[types.Artifact],
        inference_endpoint: model_spec_pb2.InferenceSpecType,
        inferrer_step: BaseInferrer,
    ) -> None:
        """Runs model inference on given examples data.

        Args:
          output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance.
          examples: List of `standard_artifacts.Examples` artifacts.
          output_examples: Output `standard_artifacts.Examples` artifact.
            Although annotated Optional, this method requires it.
          inference_endpoint: Model inference endpoint.
          inferrer_step: Inferrer step supplied in the infer pipeline config.

        Raises:
          ValueError: If `output_examples` is None.
        """
        # The body unconditionally writes to output_examples even though it
        # is annotated Optional; fail fast with a clear error rather than an
        # AttributeError deep in the pipeline setup.
        if output_examples is None:
            raise ValueError('output_examples is required for inference.')

        # TODO[LOW]: Rewrite this since there is only one split in the
        #  DataGen output
        example_uris = {}
        for example_artifact in examples:
            for split in artifact_utils.decode_split_names(
                    example_artifact.split_names):
                example_uris[split] = artifact_utils.get_split_uri(
                    [example_artifact], split)

        # Output splits mirror the union of input splits, in sorted order.
        output_examples.split_names = artifact_utils.encode_split_names(
            sorted(example_uris.keys()))

        with self._make_beam_pipeline() as pipeline:
            for split, example_uri in example_uris.items():
                output_examples_split_uri = artifact_utils.get_split_uri(
                    [output_examples], split)
                inferrer_step.set_output_uri(output_examples_split_uri)
                logging.info('Path of output examples split `%s` is %s.',
                             split, output_examples_split_uri)
                _ = (pipeline
                     | 'RunInference[{}]'.format(split) >> _RunInference(
                         example_uri, inference_endpoint)
                     | 'ConvertToDict[{}]'.format(split) >> beam.Map(
                         convert_to_dict, output_example_spec)
                     | 'WriteOutput[{}]'.format(split) >>
                     inferrer_step.write_inference_results())

            logging.info('Output examples written to %s.', output_examples.uri)
Ejemplo n.º 12
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs a batch job to evaluate the eval_model against the given input.

        Args:
          input_dict: Input dict from input key to a list of Artifacts.
            - model_exports: exported model.
            - examples: examples for eval the model.
          output_dict: Output dict from output key to a list of Artifacts.
            - output: model evaluation results.
          exec_properties: A dict of execution properties.
            - feature_slicing_spec: JSON string of
              evaluator_pb2.FeatureSlicingSpec instance, providing the way to
              slice the data.

        Returns:
          None

        Raises:
          ValueError: If a required input or output key is missing.
        """
        # Validate all required keys before doing any work.
        if 'model_exports' not in input_dict:
            raise ValueError('\'model_exports\' is missing in input dict.')
        if 'examples' not in input_dict:
            raise ValueError('\'examples\' is missing in input dict.')
        if 'output' not in output_dict:
            raise ValueError('\'output\' is missing in output dict.')

        self._log_startup(input_dict, output_dict, exec_properties)

        # Extract input artifacts
        model_exports_uri = artifact_utils.get_single_uri(
            input_dict['model_exports'])

        # Parse the slicing configuration from its JSON-serialized proto form.
        feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
        json_format.Parse(exec_properties['feature_slicing_spec'],
                          feature_slicing_spec)
        slice_spec = self._get_slice_spec_from_feature_slicing_spec(
            feature_slicing_spec)

        output_uri = artifact_utils.get_single_uri(output_dict['output'])

        eval_model_path = path_utils.eval_model_path(model_exports_uri)

        tf.logging.info('Using {} for model eval.'.format(eval_model_path))
        eval_shared_model = tfma.default_eval_shared_model(
            eval_saved_model_path=eval_model_path)

        tf.logging.info('Evaluating model.')
        # Read the 'eval' split and run TFMA, writing results to output_uri.
        with self._make_beam_pipeline() as pipeline:
            # pylint: disable=expression-not-assigned
            (pipeline
             | 'ReadData' >>
             beam.io.ReadFromTFRecord(file_pattern=io_utils.all_files_pattern(
                 artifact_utils.get_split_uri(input_dict['examples'], 'eval')))
             | 'ExtractEvaluateAndWriteResults' >>
             tfma.ExtractEvaluateAndWriteResults(
                 eval_shared_model=eval_shared_model,
                 slice_spec=slice_spec,
                 output_path=output_uri))
        tf.logging.info(
            'Evaluation complete. Results written to {}.'.format(output_uri))
Ejemplo n.º 13
0
def parse_statistics(split_name: Text,
                     statistics: List[Artifact]) -> Dict[Text, int]:
    """Loads the statistics for one split of a statistics artifact.

    Args:
      split_name: Name of the split whose statistics file to read.
      statistics: Statistics artifacts to resolve the split URI from.

    Returns:
      The statistics loaded by `tfdv.load_statistics`.

    NOTE(review): the declared return type `Dict[Text, int]` does not match
    what `tfdv.load_statistics` returns — confirm and fix the annotation.
    """
    stats_uri = io_utils.get_only_uri_in_dir(
        artifact_utils.get_split_uri(statistics, split_name))

    stats = tfdv.load_statistics(stats_uri)

    return stats
Ejemplo n.º 14
0
    def ReadExamplesArtifact(self,
                             examples: types.Artifact,
                             num_examples: int,
                             split_name: Optional[Text] = None):
        """Read records from Examples artifact.

        Currently it assumes Examples artifact contains serialized tf.Example
        in gzipped TFRecord files.

        Args:
          examples: `Examples` artifact.
          num_examples: Number of examples to read. If the specified value is
            larger than the actual number of examples, all examples would be
            read.
          split_name: Name of the split to read from the Examples artifact.

        Raises:
          RuntimeError: If read twice.
        """
        if self._records:
            raise RuntimeError('Cannot read records twice.')

        if num_examples < 1:
            raise ValueError('num_examples < 1 (got {})'.format(num_examples))

        available_splits = artifact_utils.decode_split_names(
            examples.split_names)
        if not available_splits:
            raise ValueError(
                'No split_name is available in given Examples artifact.')
        # Default to the first available split, then validate the choice.
        if split_name is None:
            split_name = available_splits[0]
        if split_name not in available_splits:
            raise ValueError(
                'No split_name {}; available split names: {}'.format(
                    split_name, ', '.join(available_splits)))

        # ExampleGen generates artifacts under each split_name directory.
        split_dir = artifact_utils.get_split_uri([examples], split_name)
        glob_pattern = os.path.join(split_dir, '*')
        tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
            examples=[examples],
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
            schema=None,
            read_as_raw_records=True,
            raw_record_column_name=_RAW_RECORDS_COLUMN)
        filenames = fileio.glob(glob_pattern)
        if not filenames:
            raise ValueError(
                'Unable to find examples matching {}.'.format(glob_pattern))

        self._payload_format = examples_utils.get_payload_format(examples)

        # Batch size equals num_examples so a single batch covers the request.
        dataset = tfxio_factory(filenames).TensorFlowDataset(
            dataset_options.TensorFlowDatasetOptions(batch_size=num_examples))
        self._ReadFromDataset(dataset)
Ejemplo n.º 15
0
    def Do(
        self,
        input_dict: Dict[Text, List[types.Artifact]],
        output_dict: Dict[Text, List[types.Artifact]],
        exec_properties: Dict[Text, Any],
    ) -> None:
        """Take input data source and generates serialized data splits.

        The output is intended to be serialized tf.train.Examples or
        tf.train.SequenceExamples protocol buffer in gzipped TFRecord format,
        but subclasses can choose to override to write to any serialized
        records payload into gzipped TFRecord as specified, so long as
        downstream component can consume it. The format of payload is added
        to `payload_format` custom property of the output Example artifact.

        Args:
          input_dict: Input dict from input key to a list of Artifacts.
            Depends on detailed example gen implementation.
          output_dict: Output dict from output key to a list of Artifacts.
            - examples: splits of serialized records.
          exec_properties: A dict of execution properties. Depends on
            detailed example gen implementation.
            - input_base: an external directory containing the data files.
            - input_config: JSON string of example_gen_pb2.Input instance,
              providing input configuration.
            - output_config: JSON string of example_gen_pb2.Output instance,
              providing output configuration.
            - output_data_format: Payload format of generated data in output
              artifact, one of example_gen_pb2.PayloadFormat enum.

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        logging.info('Generating examples.')
        examples_artifacts = output_dict[utils.EXAMPLES_KEY]
        with self._make_beam_pipeline() as pipeline:
            example_splits = self.GenerateExamplesByBeam(
                pipeline, exec_properties)

            # pylint: disable=expression-not-assigned, no-value-for-parameter
            for split_name, example_split in example_splits.items():
                split_uri = artifact_utils.get_split_uri(
                    examples_artifacts, split_name)
                (example_split
                 | 'WriteSplit[{}]'.format(split_name) >> _WriteSplit(
                     split_uri))
            # pylint: enable=expression-not-assigned, no-value-for-parameter

        # Record the payload format on every output artifact, if configured.
        output_payload_format = exec_properties.get(
            utils.OUTPUT_DATA_FORMAT_KEY)
        if output_payload_format:
            payload_format_name = example_gen_pb2.PayloadFormat.Name(
                output_payload_format)
            for output_examples_artifact in examples_artifacts:
                output_examples_artifact.set_string_custom_property(
                    utils.PAYLOAD_FORMAT_PROPERTY_NAME, payload_format_name)
        logging.info('Examples generated.')
Ejemplo n.º 16
0
 def _resolve_artifact_uri_operator(
         self, op: placeholder_pb2.ArtifactUriOperator) -> str:
     """Evaluates the artifact URI operator."""
     resolved = self.resolve(op.expression)
     # The expression must yield an Artifact before a URI can be taken.
     if not isinstance(resolved, artifact.Artifact):
         raise ValueError("ArtifactUriOperator expects the expression "
                          "to evaluate to an artifact. "
                          f"Got {type(resolved)}")
     # A split selects the per-split directory; otherwise the root URI.
     return (artifact_utils.get_split_uri([resolved], op.split)
             if op.split else resolved.uri)
Ejemplo n.º 17
0
  def test_get_from_split_list(self):
    """Test various retrieval utilities on a list of split Artifact."""
    split_list = []
    for split in ['train', 'eval']:
      instance = artifact.Artifact('MyTypeName', split=split)
      instance.uri = '/tmp/' + split
      split_list.append(instance)

    # Single-instance helpers must reject a multi-artifact list.
    with self.assertRaises(ValueError):
      artifact_utils.get_single_instance(split_list)

    with self.assertRaises(ValueError):
      artifact_utils.get_single_uri(split_list)

    # Split-aware helpers resolve each split to its own instance and URI.
    for index, split in enumerate(['train', 'eval']):
      self.assertEqual(split_list[index],
                       artifact_utils._get_split_instance(split_list, split))
      self.assertEqual('/tmp/' + split,
                       artifact_utils.get_split_uri(split_list, split))
Ejemplo n.º 18
0
 def display(self, artifact: types.Artifact):
   """Renders TFDV statistics visualizations for every split."""
   from IPython.core.display import display  # pylint: disable=g-import-not-at-top
   from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top
   for split in artifact_utils.decode_split_names(artifact.split_names):
     display(HTML('<div><b>%r split:</b></div><br/>' % split))
     stats_path = io_utils.get_only_uri_in_dir(
         artifact_utils.get_split_uri([artifact], split))
     # Choose the loader that matches the artifact's version.
     is_older = artifact_utils.is_artifact_version_older_than(
         artifact, artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE)  # pylint: disable=protected-access
     if is_older:
       stats = tfdv.load_statistics(stats_path)
     else:
       stats = tfdv.load_stats_binary(stats_path)
     tfdv.visualize_statistics(stats)
Ejemplo n.º 19
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Trains/saves a tokenizer and appends tokens to every example.

        Args:
            input_dict: Input artifact dict; "examples" holds the Examples
                artifacts whose splits are processed.
            output_dict: Output artifact dict; "tokenizer" receives the saved
                tokenizer and "output_examples" the tokenized examples.
            exec_properties: Execution properties; `StepKeys.SOURCE` is the
                importable path of a `BaseTokenizer` subclass and
                `StepKeys.ARGS` its constructor kwargs.
        """
        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]

        # Instantiate the user-provided tokenizer step from its source path.
        c = source_utils.load_source_path_class(source)
        tokenizer_step: BaseTokenizer = c(**args)

        tokenizer_location = artifact_utils.get_single_uri(
            output_dict["tokenizer"])

        # Collect every split of every input Examples artifact, plus all of
        # the files in those splits (used for tokenizer training below).
        split_uris, split_names, all_files = [], [], []
        for artifact in input_dict["examples"]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                split_names.append(split)
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
                all_files += path_utils.list_dir(uri)

        # Get output split path
        output_examples = artifact_utils.get_single_instance(
            output_dict["output_examples"])
        output_examples.split_names = artifact_utils.encode_split_names(
            split_names)

        if not tokenizer_step.skip_training:
            tokenizer_step.train(files=all_files)

            tokenizer_step.save(output_dir=tokenizer_location)

        # Append tokens to each serialized example and write each split back
        # out as TFRecords.
        # NOTE(review): `get_split_uri` and `WriteSplit` are referenced
        # without the module prefixes used elsewhere (e.g.
        # `artifact_utils.get_split_uri`) — confirm they are imported
        # directly in this module.
        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                input_uri = io_utils.all_files_pattern(uri)

                _ = (p
                     | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
                            file_pattern=input_uri)
                     | "ParseTFExFromString." + split >> beam.Map(
                            tf.train.Example.FromString)
                     | "AddTokens." + split >> beam.Map(
                            append_tf_example,
                            tokenizer_step=tokenizer_step)
                     | 'Serialize.' + split >> beam.Map(
                            lambda x: x.SerializeToString())
                     | 'WriteSplit.' + split >> WriteSplit(
                            get_split_uri(
                                output_dict["output_examples"],
                                split)))
Ejemplo n.º 20
0
    def _run_sampling(self, example_uris: Mapping[Text, Text], to_key_fn: Text,
                      output_artifact: Artifact, samples_per_key: int) -> None:
        """Runs stratified sampling on given example data.

        Args:
          example_uris: Mapping of example split name to example uri.
          to_key_fn: Python source defining a `to_key(example)` function used
            to bucket examples. It is exec'd below, so it must only ever come
            from trusted pipeline authors.
          output_artifact: Output artifact.
          samples_per_key: number of examples to keep per value of the key.

        Returns:
          None
        """

        # SECURITY: exec() runs arbitrary code — to_key_fn must never be
        # sourced from untrusted input.
        d = {}
        exec(to_key_fn, globals(), d)  # how ugly is that?
        to_key = d['to_key']

        def to_keyed_value(m):
            # Pair each example with its stratification key.
            return to_key(m), m

        with self._make_beam_pipeline() as pipeline:
            for split_name, example_uri in example_uris.items():
                data_list = [
                    (pipeline | 'ReadData[{}]'.format(split_name) >>
                     beam.io.ReadFromTFRecord(
                         file_pattern=io_utils.all_files_pattern(example_uri)))
                ]

                dest_path = os.path.join(
                    artifact_utils.get_split_uri([output_artifact],
                                                 split_name),
                    _STRATIFIED_EXAMPLES_FILE_PREFIX)

                # beam.Flatten accepts the list of PCollections directly; the
                # previous `[data for data in data_list]` copy was redundant.
                _ = (data_list
                     | 'FlattenExamples ({})'.format(split_name) >>
                     beam.Flatten(pipeline=pipeline)
                     | 'ParseExamples ({})'.format(split_name) >> beam.Map(
                         tf.train.Example.FromString)
                     |
                     'Key ({})'.format(split_name) >> beam.Map(to_keyed_value)
                     | 'Sample per key ({})'.format(split_name) >>
                     beam.combiners.Sample.FixedSizePerKey(samples_per_key)
                     | 'Values ({})'.format(split_name) >> beam.Values()
                     | 'Flatten lists ({})'.format(split_name) >>
                     beam.FlatMap(lambda elements: elements)
                     | 'WriteStratifiedSamples ({})'.format(split_name) >>
                     beam.io.WriteToTFRecord(dest_path,
                                             file_name_suffix='.gz',
                                             coder=beam.coders.ProtoCoder(
                                                 tf.train.Example)))
                logging.info('Sampling result written to %s.', dest_path)
Ejemplo n.º 21
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both 'train' and 'eval' split.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Enumerate (split, uri) pairs across every input Examples artifact.
        split_uris = []
        for artifact in input_dict['input_data']:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        with self._make_beam_pipeline() as p:
            # TODO(b/126263006): Support more stats_options through config.
            stats_options = options.StatsOptions()
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                output_uri = artifact_utils.get_split_uri(
                    output_dict['output'], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     # shard_name_template='' yields one unsharded output file.
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
Ejemplo n.º 22
0
def copy_over(input_artifact, output_artifact, splits_to_copy):
    """Copies every file of the requested splits from input to output.

  Args:
    input_artifact: artifact(s) holding the source splits.
      NOTE(review): passed to get_split_uri without list-wrapping, unlike
      output_artifact below — confirm callers pass a list here.
    output_artifact: artifact that receives the copied splits.
    splits_to_copy: list of split names to copy.

  Returns:
    None
  """
    # Resolve all source URIs up front so a missing split fails before any
    # file has been copied.
    source_uris = {
        split: artifact_utils.get_split_uri(input_artifact, split)
        for split in splits_to_copy
    }

    for split, source_dir in source_uris.items():
        target_dir = artifact_utils.get_split_uri([output_artifact], split)
        for filename in tf.io.gfile.listdir(source_dir):
            io_utils.copy_file(
                src=os.path.join(source_dir, filename),
                dst=os.path.join(target_dir, filename),
                overwrite=True)
Ejemplo n.º 23
0
 def display(self, artifact: types.Artifact):
   """Renders TFDV anomalies for every split of `artifact` in a notebook."""
   from IPython.core.display import display  # pylint: disable=g-import-not-at-top
   from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top
   for split_name in artifact_utils.decode_split_names(artifact.split_names):
     display(HTML('<div><b>%r split:</b></div><br/>' % split_name))
     anomalies_path = io_utils.get_only_uri_in_dir(
         artifact_utils.get_split_uri([artifact], split_name))
     # Newer artifact versions store binary-serialized Anomalies; older ones
     # store the text format that tfdv can load directly.
     if not artifact_utils.is_artifact_version_older_than(
         artifact, artifact_utils._ARTIFACT_VERSION_FOR_ANOMALIES_UPDATE):  # pylint: disable=protected-access
       anomalies = anomalies_pb2.Anomalies()
       anomalies.ParseFromString(io_utils.read_bytes_file(anomalies_path))
     else:
       anomalies = tfdv.load_anomalies_text(anomalies_path)
     tfdv.display_anomalies(anomalies)
Ejemplo n.º 24
0
    def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
                   output_dict: Dict[Text, List[types.Artifact]],
                   exec_properties: Dict[Text, Any]) -> fn_args_utils.FnArgs:
        """Builds FnArgs and attaches the URI of the 'eval' test results.

        Extends the parent executor's FnArgs with a `test_results` attribute
        pointing at the 'eval' split of the TEST_RESULTS output artifact.
        """
        fn_args = super(ZenMLTrainerExecutor, self)._GetFnArgs(
            input_dict, output_dict, exec_properties)

        # TODO[LOW]: fix the fixed eval split
        test_artifact = artifact_utils.get_single_instance(
            output_dict[constants.TEST_RESULTS])
        test_artifact.split_names = artifact_utils.encode_split_names(
            ['eval'])
        fn_args.test_results = artifact_utils.get_split_uri(
            [test_artifact], 'eval')

        return fn_args
Ejemplo n.º 25
0
File: executor.py Project: zvrr/tfx
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow ExampleValidator executor entrypoint.

        Validates the statistics of the 'eval' split against the schema.

        Args:
          input_dict: Input dict from input key to a list of artifacts:
            - stats: a list of `standard_artifacts.ExampleStatistics` that
              must contain the 'eval' split; stats on other splits are
              ignored.
            - schema: a list of `standard_artifacts.Schema` holding a single
              schema artifact.
          output_dict: Output dict from key to a list of artifacts:
            - output: a single 'ExampleValidationPath' artifact under which a
              pbtxt file with all discovered anomalies is written.
          exec_properties: A dict of execution properties. Not used yet.

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        absl.logging.info('Validating schema against the computed statistics.')
        eval_stats = tfdv.load_statistics(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_split_uri(input_dict[STATISTICS_KEY],
                                             'eval')))
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
        anomalies_uri = artifact_utils.get_single_uri(
            output_dict[ANOMALIES_KEY])
        self._Validate({labels.STATS: eval_stats, labels.SCHEMA: schema},
                       {labels.SCHEMA_DIFF_PATH: anomalies_uri})
        absl.logging.info(
            'Validation complete. Anomalies written to {}.'.format(
                anomalies_uri))
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Takes the input data source and generates TF Example splits.

        Args:
          input_dict: Input dict from input key to a list of Artifacts;
            contents depend on the concrete example gen implementation.
          output_dict: Output dict from output key to a list of Artifacts.
            - examples: splits of tf examples.
          exec_properties: A dict of execution properties; contents depend on
            the concrete example gen implementation.
            - input: JSON string of example_gen_pb2.Input instance, providing
              input configuration.
            - output: JSON string of example_gen_pb2.Output instance,
              providing output configuration.

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        absl.logging.info('Generating examples.')
        with self._make_beam_pipeline() as pipeline:
            split_to_pcoll = self.GenerateExamplesByBeam(
                pipeline, input_dict, exec_properties)

            # pylint: disable=expression-not-assigned, no-value-for-parameter
            for split_name, examples in split_to_pcoll.items():
                split_uri = artifact_utils.get_split_uri(
                    output_dict['examples'], split_name)
                examples | 'WriteSplit[{}]'.format(split_name) >> _WriteSplit(
                    split_uri)
            # pylint: enable=expression-not-assigned, no-value-for-parameter

        absl.logging.info('Examples generated.')
Ejemplo n.º 27
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow ExampleValidator executor entrypoint.

        This validates the statistics on the 'eval' split against the schema.

        Args:
          input_dict: Input dict from input key to a list of artifacts:
            - stats: A list of 'ExampleStatisticsPath' type which should
              contain split 'eval'. Stats on other splits are ignored.
            - schema: A list of 'SchemaPath' type which should contain a
              single schema artifact.
          output_dict: Output dict from key to a list of artifacts:
            - output: A list of 'ExampleValidationPath' artifact of size one.
              It will include a single pbtxt file which contains all anomalies
              found.
          exec_properties: A dict of execution properties. Not used yet.

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Fix: tf.logging was deprecated and removed in TF 2.x; use
        # absl.logging for consistency with the other executors in this file.
        absl.logging.info('Validating schema against the computed statistics.')
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(input_dict['schema'])))
        stats = tfdv.load_statistics(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_split_uri(input_dict['stats'], 'eval')))
        output_uri = artifact_utils.get_single_uri(output_dict['output'])
        anomalies = tfdv.validate_statistics(stats, schema)
        io_utils.write_pbtxt_file(os.path.join(output_uri, DEFAULT_FILE_NAME),
                                  anomalies)
        absl.logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
Ejemplo n.º 28
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each input split using tensorflow_data_validation.

        Args:
          input_dict: Input dict from input key to a list of Artifacts.
            - input_data: a list of `standard_artifacts.Examples` containing
              both the 'train' and 'eval' splits.
            - schema: optionally, a list of `standard_artifacts.Schema`. Must
              not be provided when the stats_options exec_property already
              carries a schema.
          output_dict: Output dict from output key to a list of Artifacts.
            - output: a list of `standard_artifacts.ExampleStatistics`
              covering both the 'train' and 'eval' splits.
          exec_properties: A dict of execution properties.
            - stats_options_json: optionally, a JSON representation of
              StatsOptions. When a schema is provided as an input, this value
              must not also contain a schema.

        Raises:
          ValueError: when a schema is provided both as an input and as part
            of the StatsOptions exec_property.

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        stats_options = options.StatsOptions()
        # A missing key and an empty JSON string are treated the same way.
        stats_options_json = exec_properties.get(STATS_OPTIONS_JSON_KEY)
        if stats_options_json:
            # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
            # json_utils
            stats_options = options.StatsOptions.from_json(stats_options_json)
        if input_dict.get(SCHEMA_KEY):
            if stats_options.schema:
                raise ValueError(
                    'A schema was provided as an input and the '
                    'stats_options exec_property also contains a schema '
                    'value. At most one of these may be set.')
            stats_options.schema = io_utils.SchemaReader().read(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))

        # (split name, split data directory) pairs across all input artifacts.
        split_and_uri = [
            (split, os.path.join(artifact.uri, split))
            for artifact in input_dict[EXAMPLES_KEY]
            for split in artifact_utils.decode_split_names(
                artifact.split_names)
        ]
        with self._make_beam_pipeline() as p:
            for split, uri in split_and_uri:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                examples_pattern = io_utils.all_files_pattern(uri)
                stats_dir = artifact_utils.get_split_uri(
                    output_dict[STATISTICS_KEY], split)
                stats_path = os.path.join(stats_dir, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=examples_pattern)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         stats_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, stats_dir))
Ejemplo n.º 29
0
    def Do(self, input_dict: Dict[Text, List[Artifact]],
           output_dict: Dict[Text, List[Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs TF Transform over the input example splits.

        Builds the label dicts consumed by `self.Transform`, analyzing the
        first input split and transforming every split, then writes the
        transform graph and the materialized transformed examples.

        Args:
          input_dict: Input dict from input key to a list of artifacts,
            including examples, a schema and, optionally, an analyzer cache
            ('cache_input_path').
          output_dict: Output dict from key to a list of artifacts, including
            the transform graph, transformed examples and, optionally, an
            output cache ('cache_output_path').
          exec_properties: A dict of execution properties; may carry
            'module_file' and/or 'preprocessing_fn'.
        """
        # NOTE(review): despite the name, this list holds split *names*, not
        # URIs.
        split_uris: List[Text] = []
        for artifact in input_dict[executor.EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                split_uris.append(split)

        self._log_startup(input_dict, output_dict, exec_properties)
        # Input data directories, one per split, in the same order as
        # split_uris.
        data_uris = []
        for split in split_uris:
            data_uris.append(
                artifact_utils.get_split_uri(input_dict[executor.EXAMPLES_KEY],
                                             split))

        schema_file = io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[executor.SCHEMA_KEY]))
        transform_output = artifact_utils.get_single_uri(
            output_dict[executor.TRANSFORM_GRAPH_KEY])
        # Output directories for the materialized transformed examples, one
        # per split, in the same order as split_uris.
        transformed_data_uris = []
        for split in split_uris:
            transformed_data_uris.append(
                artifact_utils.get_split_uri(
                    output_dict[executor.TRANSFORMED_EXAMPLES_KEY], split))
        # tft.beam scratch space lives inside the transform output artifact.
        temp_path = os.path.join(transform_output,
                                 executor._TEMP_DIR_IN_TRANSFORM_OUTPUT)
        logging.debug('Using temp path %s for tft.beam', temp_path)

        def _GetCachePath(label, params_dict):
            """Returns the single URI stored under `label`, or None if absent."""
            if label not in params_dict:
                return None
            else:
                return artifact_utils.get_single_uri(params_dict[label])

        label_inputs = {
            labels.COMPUTE_STATISTICS_LABEL:
            False,
            labels.SCHEMA_PATH_LABEL:
            schema_file,
            labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
            # Only the first split is analyzed — presumably 'train'; TODO
            # confirm the split ordering guarantees this.
            labels.ANALYZE_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(data_uris[0]),
            labels.ANALYZE_PATHS_FILE_FORMATS_LABEL:
            labels.FORMAT_TFRECORD,
            labels.TRANSFORM_DATA_PATHS_LABEL:
            [io_utils.all_files_pattern(uri) for uri in data_uris],
            labels.TRANSFORM_PATHS_FILE_FORMATS_LABEL:
            [labels.FORMAT_TFRECORD for uri in data_uris],
            labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
            labels.MODULE_FILE:
            exec_properties.get('module_file', None),
            labels.PREPROCESSING_FN:
            exec_properties.get('preprocessing_fn', None),
            # TODO(b/149754658): switch to True once the TFXIO integration is
            # complete.
            labels.USE_TFXIO_LABEL:
            False,
        }
        # Analyzer cache input is optional; only pass it when provided.
        cache_input = _GetCachePath('cache_input_path', input_dict)
        if cache_input is not None:
            label_inputs[labels.CACHE_INPUT_PATH_LABEL] = cache_input

        label_outputs = {
            labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL:
            transform_output,
            labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
                os.path.join(uri,
                             executor._DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
                for uri in transformed_data_uris
            ],
            labels.TEMP_OUTPUT_LABEL:
            str(temp_path),
        }
        # Analyzer cache output is optional; only pass it when provided.
        cache_output = _GetCachePath('cache_output_path', output_dict)
        if cache_output is not None:
            label_outputs[labels.CACHE_OUTPUT_PATH_LABEL] = cache_output
        status_file = 'status_file'  # Unused
        self.Transform(label_inputs, label_outputs, status_file)
        # Scratch data is removed only on success, which aids debugging a
        # failed run.
        logging.debug('Cleaning up temp path %s on executor success',
                      temp_path)
        io_utils.delete_dir(temp_path)
Ejemplo n.º 30
0
    def Do(self, input_dict: Dict[Text, List[Artifact]],
           output_dict: Dict[Text, List[Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Generates MetaFeatures for the meta-training datasets.

        Args:
          input_dict: Input dict from input key to a list of artifacts:
            - statistics: output from the StatisticsGen component.
          output_dict: Output dict from key to a list of artifacts.
          exec_properties: A dict of execution properties.

        Raises:
          ValueError: if the statistics proto holds more than one dataset.
        """

        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(input_dict[STATISTICS_KEY], 'train'))
        stats_list = tfdv.load_statistics(train_stats_uri)

        if len(stats_list.datasets) != 1:
            raise ValueError(
                'DatasetFeatureStatisticsList proto contains multiple datasets. Only '
                'one dataset is currently supported.')
        dataset_stats = stats_list.datasets[0]

        num_float_features = 0
        num_int_features = 0
        num_categorical_features = 0
        for feature in dataset_stats.features:
            # For structured fields, the name lives in the path rather than in
            # the `name` attribute.
            feature_name = feature.name or feature.path.step[0]
            logging.info('Feature Name: %s', feature_name)

            if feature.type == statistics_pb2.FeatureNameStatistics.FLOAT:
                num_float_features += 1
            elif feature.type == statistics_pb2.FeatureNameStatistics.INT:
                num_int_features += 1
            else:
                num_categorical_features += 1

        metafeature_dict = {
            'num_examples': dataset_stats.num_examples,
            'num_int_features': num_int_features,
            'num_float_features': num_float_features,
            'num_categorical_features': num_categorical_features,
            'metafeature': [
                dataset_stats.num_examples, num_int_features,
                num_float_features, num_categorical_features
            ],
        }

        metafeature_path = os.path.join(
            artifact_utils.get_single_uri(output_dict[METAFEATURES_KEY]),
            artifacts.MetaFeatures.DEFAULT_FILE_NAME)
        io_utils.write_string_file(metafeature_path,
                                   json.dumps(metafeature_dict))
        logging.info('MetaFeature saved at %s', metafeature_path)