Example #1
    def testKeepalive(self):
        count = Count()
        shared_handle = shared.Shared()
        other_shared_handle = shared.Shared()

        def dummy_acquire_fn():
            return None

        def acquire_fn():
            return Marker(count)

        p1 = shared_handle.acquire(acquire_fn)
        self.assertEquals(1, count.get_total())
        self.assertEquals(1, count.get_active())
        del p1
        gc.collect()
        # Won't be garbage collected, because of the keep-alive
        self.assertEquals(1, count.get_active())

        # Reacquire.
        p2 = shared_handle.acquire(acquire_fn)
        self.assertEquals(1, count.get_total())  # No reinitialisation.
        self.assertEquals(1, count.get_active())

        # Get rid of the keepalive
        other_shared_handle.acquire(dummy_acquire_fn)
        del p2
        gc.collect()
        self.assertEquals(0, count.get_active())
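The test above relies on two small helpers, Count and Marker, that the excerpt does not show. The sketch below is a hedged reconstruction inferred purely from how the tests use them (get_total(), get_active(), and garbage collection of Marker); it is not the original helper code.

import threading


class Count(object):
    """Tracks how many Marker objects were ever created and how many are live."""

    def __init__(self):
        self._lock = threading.Lock()
        self._total = 0
        self._active = 0

    def add_ref(self):
        with self._lock:
            self._total += 1
            self._active += 1

    def release_ref(self):
        with self._lock:
            self._active -= 1

    def get_total(self):
        return self._total

    def get_active(self):
        return self._active


class Marker(object):
    """Registers itself with Count on creation and deregisters when collected."""

    def __init__(self, count):
        self._count = count
        self._count.add_ref()

    def __del__(self):
        self._count.release_ref()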
Example #2
def BuildDiagnosticTable(
        # pylint: disable=invalid-name
        examples,
        eval_saved_model_path,
        extractors=None,
        desired_batch_size=None):
    """Build diagnostics for the spacified EvalSavedModel and example collection.

  Args:
    examples: PCollection of input examples. Can be any format the model accepts
      (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_saved_model_path: Path to EvalSavedModel. This directory should contain
      the saved_model.pb file.
    extractors: Optional list of Extractors to execute prior to slicing and
      aggregating the metrics. If not provided, a default set will be run.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.

  Returns:
    PCollection of ExampleAndExtracts
  """

    if not extractors:
        extractors = [
            PredictExtractor(eval_saved_model_path, None, shared.Shared(),
                             desired_batch_size),
            types.Extractor(stage_name='ExtractFeatures',
                            ptransform=feature_extractor.ExtractFeatures()),
        ]
    return (
        examples
        | 'ToExampleAndExtracts' >>
        beam.Map(lambda x: types.ExampleAndExtracts(example=x, extracts={}))
        | Extract(extractors=extractors))
Example #3
 def __init__(
         self, model_agnostic_config: agnostic_predict.ModelAgnosticConfig
 ) -> None:
     self._model_agnostic_config = model_agnostic_config
     self._shared_handle = shared.Shared()
     self._model_load_seconds = beam.metrics.Metrics.distribution(
         _METRICS_NAMESPACE, 'model_load_seconds')
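The __init__ above only stores the handle; the acquire call happens later, when the DoFn processes bundles. The following is a minimal, hedged sketch of that consumption pattern (the class and loader names are illustrative, not part of the snippet): shared.Shared().acquire(fn) runs fn at most once per process and hands every DoFn instance the cached result.

import apache_beam as beam
from apache_beam.utils import shared


class _ModelWrapper(object):
    """Hypothetical stand-in for an expensive-to-load model."""

    def predict(self, element):
        return element


class _PredictDoFn(beam.DoFn):

    def __init__(self, shared_handle):
        self._shared_handle = shared_handle

    def setup(self):
        def load_model():
            # Expensive one-time work (e.g. loading a SavedModel) goes here.
            return _ModelWrapper()

        # acquire() constructs the model at most once per process; subsequent
        # calls from other DoFn instances reuse the cached object.
        self._model = self._shared_handle.acquire(load_model)

    def process(self, element):
        yield self._model.predict(element)


def _build_predict_transform():
    # A single handle, created up front, is what all DoFn instances share.
    return beam.ParDo(_PredictDoFn(shared.Shared()))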
Example #4
def BuildDiagnosticTable(
        # pylint: disable=invalid-name
        examples,
        eval_saved_model_path,
        desired_batch_size=None):
    """Build diagnostics for the spacified EvalSavedModel and example collection.

  Args:
    examples: PCollection of input examples. Can be any format the model accepts
      (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_saved_model_path: Path to EvalSavedModel. This directory should contain
      the saved_model.pb file.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.

  Returns:
    PCollection of ExampleAndExtracts
  """
    return (
        examples
        | 'ToExampleAndExtracts' >>
        beam.Map(lambda x: types.ExampleAndExtracts(example=x, extracts={}))
        | 'Predict' >> predict_extractor.TFMAPredict(
            eval_saved_model_path,
            add_metrics_callbacks=None,
            shared_handle=shared.Shared(),
            desired_batch_size=desired_batch_size)
        | 'ExtractFeatures' >> feature_extractor.ExtractFeatures())
Example #5
def _ExtractOutput(  # pylint: disable=invalid-name
        aggregate_result, eval_saved_model_path, add_metrics_callbacks):
    return aggregate_result | beam.ParDo(
        _ExtractOutputDoFn(eval_saved_model_path=eval_saved_model_path,
                           add_metrics_callbacks=add_metrics_callbacks,
                           shared_handle=shared.Shared())).with_outputs(
                               _ExtractOutputDoFn.OUTPUT_TAG_PLOTS,
                               main=_ExtractOutputDoFn.OUTPUT_TAG_METRICS)
Example #6
    def testConcurrentCallsDeduped(self):
        # Test that only one among many calls to acquire will actually run the
        # initialisation function.

        count = Count()
        shared_handle = shared.Shared()
        other_shared_handle = shared.Shared()

        refs = []
        ref_lock = threading.Lock()

        def dummy_acquire_fn():
            return None

        def acquire_fn():
            time.sleep(1)
            return Marker(count)

        def thread_fn():
            p = shared_handle.acquire(acquire_fn)
            with ref_lock:
                refs.append(p)

        threads = []
        for _ in xrange(100):
            t = threading.Thread(target=thread_fn)
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        self.assertEquals(1, count.get_total())
        self.assertEquals(1, count.get_active())

        other_shared_handle.acquire(
            dummy_acquire_fn)  # Get rid of the keepalive

        with ref_lock:
            del refs[:]
        gc.collect()

        self.assertEquals(0, count.get_active())
Example #7
    def expand(self, inputs):
        input_values, tensor_pcoll_mapping = (
            self._maybe_deep_copy_pcollection_inputs(inputs))

        saved_model_dir = (tensor_pcoll_mapping
                           | 'CreateSavedModelForAnalyzerInputs' >>
                           _ReplaceTensorsWithConstants(
                               self._unbound_saved_model_dir,
                               self._base_temp_dir, input_values.pipeline))

        # Run this saved model on the input dataset to obtain the inputs to the
        # analyzers.
        analyzer_input_values = (
            input_values
            | 'BatchAnalyzerInputs' >> _BatchElements()
            | 'ComputeAnalyzerInputs' >> beam.ParDo(
                _RunMetaGraphDoFn(
                    self._input_schema,
                    self._serialized_tf_config,
                    shared_graph_state_handle=shared.Shared(),
                    passthrough_keys=Context.get_passthrough_keys()),
                saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

        # For each analyzer output, look up its input values (by tensor name)
        # and run the analyzer on these values.
        result = {}
        for analyzer_info in self._analyzer_infos:
            num_outputs = len(analyzer_info.output_infos)
            inputs = (analyzer_input_values
                      | 'ExtractInputs[%s]' % analyzer_info.attributes.name >>
                      beam.Map(lambda batch, keys: tuple(batch[key]
                                                         for key in keys),
                               keys=analyzer_info.input_tensor_names))
            ptransform = common.lookup_registered_ptransform(
                analyzer_info.attributes)
            output_pcolls = (
                (inputs, )
                | ptransform(num_outputs,
                             analyzer_info.attributes,
                             serialized_tf_config=self._serialized_tf_config,
                             base_temp_dir=self._base_temp_dir))

            if len(output_pcolls) != num_outputs:
                raise ValueError(
                    'Analyzer {} has {} outputs but its implementation produced {} '
                    'pcollections'.format(analyzer_info.name, num_outputs,
                                          len(output_pcolls)))

            for index, (output_pcoll, (name, is_asset)) in enumerate(
                    zip(output_pcolls, analyzer_info.output_infos)):
                result[name] = (output_pcoll
                                | 'WrapAsTensorValue[%s][%d]' %
                                (analyzer_info.attributes.name, index) >>
                                beam.Map(_TensorValue, is_asset))
        return result
Example #8
    def testPredict(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            predict_extracts = (
                pipeline
                | beam.Create([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ])
                # Our diagnostic outputs pass types.ExampleAndExtracts throughout;
                # however, our aggregating functions do not use this interface.
                | beam.Map(
                    lambda x: types.ExampleAndExtracts(example=x, extracts={}))
                | 'Predict' >> predict_extractor.TFMAPredict(
                    eval_saved_model_path=eval_export_dir,
                    add_metrics_callbacks=None,
                    shared_handle=shared.Shared(),
                    desired_batch_size=3))

            def check_result(got):
                try:
                    self.assertEqual(4, len(got), 'got: %s' % got)
                    for item in got:
                        extracts_dict = item.extracts
                        self.assertTrue(extracts_dict.has_key('fpl'))
                        fpl = extracts_dict['fpl']
                        # Verify fpl contains features, probabilities, and correct labels.
                        self.assertIn('language', fpl.features)
                        self.assertIn('age', fpl.features)
                        self.assertIn('label', fpl.features)
                        self.assertIn('probabilities', fpl.predictions)
                        self.assertAlmostEqual(fpl.features['label'],
                                               fpl.labels['__labels'])
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result)
Example #9
    def testMultiple(self):
        count = Count()
        shared_handle = shared.Shared()
        other_shared_handle = shared.Shared()

        def dummy_acquire_fn():
            return None

        def acquire_fn():
            return Marker(count)

        p = shared_handle.acquire(acquire_fn)
        other_shared_handle.acquire(
            dummy_acquire_fn)  # Get rid of the keepalive
        self.assertEquals(1, count.get_total())
        self.assertEquals(1, count.get_active())
        del p
        gc.collect()
        self.assertEquals(0, count.get_active())
        # Shared value should be garbage collected.

        # Acquiring multiple times only results in one initialisation
        p1 = shared_handle.acquire(acquire_fn)
        # Since shared value was released, expect a reinitialisation.
        self.assertEquals(2, count.get_total())
        self.assertEquals(1, count.get_active())
        p2 = shared_handle.acquire(acquire_fn)
        self.assertEquals(2, count.get_total())
        self.assertEquals(1, count.get_active())

        other_shared_handle.acquire(
            dummy_acquire_fn)  # Get rid of the keepalive

        # Check that shared object isn't destroyed if there's still a reference to
        # it.
        del p2
        gc.collect()
        self.assertEquals(1, count.get_active())

        del p1
        gc.collect()
        self.assertEquals(0, count.get_active())
Example #10
    def expand(self, dataset_and_transform_fn):
        """Transforms the dataset using the transform_fn.

    Args:
      dataset_and_transform_fn: A tuple of dataset and preprocessing
      function.

    Returns:
      A dataset transformed according to the transform_fn.
    """
        (input_values,
         input_metadata), (transform_fn,
                           output_metadata) = (dataset_and_transform_fn)

        # If exclude_outputs is set, update the output metadata.
        if self._exclude_outputs is not None:
            if isinstance(output_metadata,
                          beam_metadata_io.BeamDatasetMetadata):
                new_metadata = _remove_columns_from_metadata(
                    output_metadata.dataset_metadata, self._exclude_outputs)
                new_deferred_metadata = (
                    output_metadata.deferred_metadata
                    | 'RemoveColumms' >> beam.Map(
                        _remove_columns_from_metadata, self._exclude_outputs))
                output_metadata = beam_metadata_io.BeamDatasetMetadata(
                    new_metadata, new_deferred_metadata)
            else:
                output_metadata = _remove_columns_from_metadata(
                    output_metadata, self._exclude_outputs)

        serialized_tf_config = (
            common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
                self.pipeline.runner))

        output_instances = (
            input_values
            | 'Batch' >> _BatchElements()
            | 'Transform' >> beam.ParDo(
                _RunMetaGraphDoFn(
                    input_metadata.schema,
                    serialized_tf_config,
                    shared_graph_state_handle=shared.Shared(),
                    passthrough_keys=Context.get_passthrough_keys(),
                    exclude_outputs=self._exclude_outputs),
                saved_model_dir=beam.pvalue.AsSingleton(transform_fn))
            | 'ConvertAndUnbatch' >> beam.FlatMap(
                _convert_and_unbatch_to_instance_dicts,
                schema=output_metadata.schema,
                passthrough_keys=Context.get_passthrough_keys()))

        _clear_shared_state_after_barrier(self.pipeline, output_instances)

        return (output_instances, output_metadata)
Example #11
def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  This will only be cleared once input_barrier becomes available.
  """
  empty_pcoll = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
      lambda x: None)
  return (pipeline
          | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
          | 'WaitAndClearSharedKeepAlives' >> beam.Map(
              lambda x, empty_side_input: shared.Shared().acquire(lambda: None),
              beam.pvalue.AsIter(empty_pcoll)))
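A brief, hedged usage sketch for the helper above: the transform's output PCollection acts as the barrier, so the per-process keep-alives are dropped only once that output has been fully produced. The pipeline and DoFn below are illustrative placeholders, not part of the original module.

import apache_beam as beam


class _NoopDoFn(beam.DoFn):

    def process(self, element):
        yield element


with beam.Pipeline() as pipeline:
    output_instances = (
        pipeline
        | 'Create' >> beam.Create([1, 2, 3])
        | 'Transform' >> beam.ParDo(_NoopDoFn()))
    # Shared keep-alives are released only after output_instances is available.
    _clear_shared_state_after_barrier(pipeline, output_instances)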
Example #12
 def __new__(cls,
             model_path,
             add_metrics_callbacks=None,
             example_weight_key=None,
             shared_handle=None):
     if not add_metrics_callbacks:
         add_metrics_callbacks = []
     if not shared_handle:
         shared_handle = shared.Shared()
     return super(EvalSharedModel,
                  cls).__new__(cls, model_path, add_metrics_callbacks,
                               example_weight_key, shared_handle)
Example #13
    def expand(self, inputs):
        input_values, tensor_pcoll_mapping = inputs

        saved_model_dir = (tensor_pcoll_mapping
                           | 'CreateSavedModelForAnalyzerInputs' >>
                           _ReplaceTensorsWithConstants(
                               self._unbound_saved_model_dir,
                               self._base_temp_dir, input_values.pipeline))

        # Run this saved model on the input dataset to obtain the inputs to the
        # analyzers.
        analyzer_input_values = (
            input_values
            | 'BatchAnalyzerInputs' >> _BatchElements()
            | 'ComputeAnalyzerInputs' >> beam.ParDo(
                _RunMetaGraphDoFn(
                    self._input_schema,
                    self._serialized_tf_config,
                    shared_graph_state_handle=shared.Shared(),
                    passthrough_keys=Context.get_passthrough_keys()),
                saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

        # For each analyzer output, look up its input values (by tensor name)
        # and run the analyzer on these values.
        result = {}
        for analyzer_info in self._analyzer_infos:
            temp_assets_dir = _make_unique_temp_dir(self._base_temp_dir)
            tf.gfile.MkDir(temp_assets_dir)
            output_pcolls = (
                analyzer_input_values
                | 'ExtractInputs[%s]' % analyzer_info.name >> beam.Map(
                    lambda batch, keys: [batch[key] for key in keys],
                    keys=analyzer_info.input_tensor_names)
                | 'Analyze[%s]' % analyzer_info.name >>
                analyzer_impls._AnalyzerImpl(analyzer_info.spec,
                                             temp_assets_dir))
            # pylint: enable=protected-access

            if len(output_pcolls) != len(analyzer_info.output_infos):
                raise ValueError(
                    'Analyzer {} has {} outputs but its implementation produced {} '
                    'pcollections'.format(analyzer_info.name,
                                          len(analyzer_info.output_infos),
                                          len(output_pcolls)))

            for index, (output_pcoll, (name, is_asset)) in enumerate(
                    zip(output_pcolls, analyzer_info.output_infos)):
                result[name] = (output_pcoll
                                | 'WrapAsTensorValue[%s][%d]' %
                                (analyzer_info.name, index) >> beam.Map(
                                    _TensorValue, is_asset))
        return result
Example #14
def _Aggregate(  # pylint: disable=invalid-name
    slice_result,
    eval_saved_model_path,
    add_metrics_callbacks,
    desired_batch_size = None,
):
  return (slice_result
          | 'CombinePerKey' >> beam.CombinePerKey(
              _AggregateCombineFn(
                  eval_saved_model_path=eval_saved_model_path,
                  add_metrics_callbacks=add_metrics_callbacks,
                  shared_handle=shared.Shared(),
                  desired_batch_size=desired_batch_size)))
Example #15
def _Predict(  # pylint: disable=invalid-name
        examples,
        eval_saved_model_path,
        desired_batch_size=None):
    batch_args = {}
    if desired_batch_size:
        batch_args = dict(min_batch_size=desired_batch_size,
                          max_batch_size=desired_batch_size)
    return (examples
            | 'Batch' >> beam.BatchElements(**batch_args)
            | beam.ParDo(
                _PredictionDoFn(eval_saved_model_path=eval_saved_model_path,
                                add_metrics_callbacks=None,
                                shared_handle=shared.Shared())))
Example #16
    def testDifferentObjects(self):
        sequence = Sequence()

        def dummy_acquire_fn():
            return None

        first_handle = shared.Shared()
        second_handle = shared.Shared()
        dummy_handle = shared.Shared()

        f1 = first_handle.acquire(sequence.make_acquire_fn())
        s1 = second_handle.acquire(sequence.make_acquire_fn())

        self.assertEquals('sequence1', f1.get_name())
        self.assertEquals('sequence2', s1.get_name())

        f2 = first_handle.acquire(sequence.make_acquire_fn())
        s2 = second_handle.acquire(sequence.make_acquire_fn())

        # Check that the repeated acquisitions return the earlier objects
        self.assertEquals('sequence1', f2.get_name())
        self.assertEquals('sequence2', s2.get_name())

        # Release all references and force garbage-collection
        del f1
        del f2
        del s1
        del s2
        dummy_handle.acquire(dummy_acquire_fn)  # Get rid of the keepalive
        gc.collect()

        # Check that acquiring again after they're released gives new objects
        f3 = first_handle.acquire(sequence.make_acquire_fn())
        s3 = second_handle.acquire(sequence.make_acquire_fn())
        self.assertEquals('sequence3', f3.get_name())
        self.assertEquals('sequence4', s3.get_name())
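The Sequence helper used above is not shown in the excerpt. Below is a hedged reconstruction inferred from the assertions: each acquire function, when actually invoked, produces a new object named 'sequence<N>' with an increasing counter.

class _NamedObject(object):

    def __init__(self, name):
        self._name = name

    def get_name(self):
        return self._name


class Sequence(object):
    """Hands out acquire functions that build sequentially named objects."""

    def __init__(self):
        self._counter = 0

    def make_acquire_fn(self):
        def acquire_fn():
            # The counter only advances when the acquire function is actually
            # run, which is why cached acquisitions reuse earlier names.
            self._counter += 1
            return _NamedObject('sequence%d' % self._counter)

        return acquire_fn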
Example #17
 def __new__(cls,
             model_path=None,
             add_metrics_callbacks=None,
             include_default_metrics=True,
             example_weight_key=None,
             shared_handle=None,
             construct_fn=None):
     if not add_metrics_callbacks:
         add_metrics_callbacks = []
     if not shared_handle:
         shared_handle = shared.Shared()
     return super(EvalSharedModel,
                  cls).__new__(cls, model_path, add_metrics_callbacks,
                               include_default_metrics, example_weight_key,
                               shared_handle, construct_fn)
Example #18
 def __new__(
     cls,
     model_path: Optional[Text] = None,
     add_metrics_callbacks: Optional[List[AddMetricsCallbackType]] = None,
     include_default_metrics: Optional[bool] = True,
     example_weight_key: Optional[Text] = None,
     additional_fetches: Optional[List[Text]] = None,
     shared_handle: Optional[shared.Shared] = None,
     construct_fn: Optional[Callable[..., Any]] = None):
   if not add_metrics_callbacks:
     add_metrics_callbacks = []
   if not shared_handle:
     shared_handle = shared.Shared()
   return super(EvalSharedModel, cls).__new__(
       cls, model_path, add_metrics_callbacks, include_default_metrics,
       example_weight_key, additional_fetches, shared_handle, construct_fn)
Example #19
    def expand(self, dataset_and_transform_fn):
        """Transforms the dataset using the transform_fn.

    Args:
      dataset_and_transform_fn: A tuple of dataset and preprocessing
      function.

    Returns:
      A dataset transformed according to the transform_fn.
    """
        (input_values, input_metadata), (transform_fn,
                                         (output_metadata,
                                          _)) = (dataset_and_transform_fn)

        # If exclude_outputs is set, update the output metadata, which will also
        # cause _RunMetaGraphDoFn not to create the excluded outputs.
        if self._exclude_outputs is not None:
            schema = output_metadata.schema
            output_metadata = dataset_metadata.DatasetMetadata(
                schema=dataset_schema.Schema({
                    key: column_schema
                    for key, column_schema in six.iteritems(
                        schema.column_schemas)
                    if key not in self._exclude_outputs
                }))

        def convert_and_unbatch(batch_dict):
            return impl_helper.to_instance_dicts(
                impl_helper.make_output_dict(output_metadata.schema,
                                             batch_dict))

        serialized_tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(
            self.pipeline.runner)
        output_instances = (
            input_values
            | 'Transform' >> beam.ParDo(
                _RunMetaGraphDoFn(input_metadata.schema,
                                  output_metadata.schema,
                                  serialized_tf_config,
                                  shared_graph_state_handle=shared.Shared(),
                                  exclude_outputs=self._exclude_outputs),
                saved_model_dir=beam.pvalue.AsSingleton(transform_fn))
            | 'ConvertAndUnbatch' >> beam.FlatMap(convert_and_unbatch))

        _clear_shared_state_after_barrier(self.pipeline, output_instances)

        return (output_instances, output_metadata)
Example #20
    def expand(self, inputs):
        # We don't deep_copy pcollections used for the first phase, or when
        # the user defined `Context` disables it.
        if self._phase > 0 and Context.get_use_deep_copy_optimization():
            # Obviates unnecessary data materialization when the input data source is
            # safe to read more than once.
            tf.logging.info('Deep copying inputs for phase: %d', self._phase)
            input_values = deep_copy.deep_copy(self._input_values_pcoll)
        else:
            input_values = self._input_values_pcoll

        return (input_values
                | 'BatchInputs' >> _BatchElements()
                | 'ApplySavedModel' >> beam.ParDo(
                    _RunMetaGraphDoFn(
                        self._input_schema,
                        self._serialized_tf_config,
                        shared_graph_state_handle=shared.Shared(),
                        passthrough_keys=Context.get_passthrough_keys()),
                    saved_model_dir=beam.pvalue.AsSingleton(inputs[0])))
Example #21
 def __new__(cls,
             shared_handle: Optional[shared.Shared] = None,
             construct_fn: Optional[Callable[..., Any]] = None):
   if not shared_handle:
     shared_handle = shared.Shared()
   return super(ModelLoader, cls).__new__(cls, shared_handle, construct_fn)
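A hedged sketch of consuming a ModelLoader like the one above: construct_fn builds the expensive object and the shared handle caches it once per process. The attribute access assumes ModelLoader is a named tuple (as the super().__new__ call suggests); _FakeModel and _load_model are illustrative stand-ins.

class _FakeModel(object):
    """Placeholder for whatever construct_fn would really build."""


def _load_model():
    return _FakeModel()


loader = ModelLoader(construct_fn=_load_model)
# The first acquire() runs construct_fn; later calls in the same process
# return the cached model.
model = loader.shared_handle.acquire(loader.construct_fn)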
Example #22
def Evaluate(
    # pylint: disable=invalid-name
    examples,
    eval_saved_model_path,
    add_metrics_callbacks=None,
    slice_spec=None,
    desired_batch_size=None,
):
    """Evaluate the given EvalSavedModel on the given examples.

  This is for TFMA use only. Users should call tfma.EvaluateAndWriteResults
  instead of this function.

  Args:
    examples: PCollection of input examples. Can be any format the model accepts
      (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_saved_model_path: Path to EvalSavedModel. This directory should contain
      the saved_model.pb file.
    add_metrics_callbacks: Optional list of callbacks for adding additional
      metrics to the graph. The names of the metrics added by the callbacks
      should not conflict with existing metrics, or metrics added by other
      callbacks. See below for more details about what each callback should do.
    slice_spec: Optional list of SingleSliceSpec specifying the slices to slice
      the data into. If None, defaults to the overall slice.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.

  More details on add_metrics_callbacks:

    Each add_metrics_callback should have the following prototype:
      def add_metrics_callback(features_dict, predictions_dict, labels_dict):

    Note that features_dict, predictions_dict and labels_dict are not
    necessarily dictionaries - they might also be Tensors, depending on what the
    model's eval_input_receiver_fn returns.

    It should create and return a metric_ops dictionary, such that
    metric_ops['metric_name'] = (value_op, update_op), just as in the Trainer.

    Short example:

    def add_metrics_callback(features_dict, predictions_dict, labels):
      metric_ops = {}
      metric_ops['mean_label'] = tf.metrics.mean(labels)
      metric_ops['mean_probability'] = tf.metrics.mean(tf.slice(
        predictions_dict['probabilities'], [0, 1], [2, 1]))
      return metric_ops

  Returns:
    DoOutputsTuple. The tuple entries are
    PCollection of (slice key, metrics) and
    PCollection of (slice key, plot metrics).
  """
    if slice_spec is None:
        slice_spec = [slicer.SingleSliceSpec()]

    shared_handle = shared.Shared()

    # pylint: disable=no-value-for-parameter
    return (
        examples
        # Our diagnostic outputs pass types.ExampleAndExtracts throughout;
        # however, our aggregating functions do not use this interface.
        | 'ToExampleAndExtracts' >>
        beam.Map(lambda x: types.ExampleAndExtracts(example=x, extracts={}))

        # Map function which loads and runs the eval_saved_model against every
        # example, yielding an types.ExampleAndExtracts containing a
        # FeaturesPredictionsLabels value (where key is 'fpl').
        | 'Predict' >> predict_extractor.TFMAPredict(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks,
            shared_handle=shared_handle,
            desired_batch_size=desired_batch_size)

        # Input: one example fpl at a time
        # Output: one fpl example per slice key (notice that the example turns
        #         into n, replicated once per applicable slice key)
        | 'Slice' >> slice_api.Slice(slice_spec)

        # Each slice key lands on one shard where metrics are computed for all
        # examples in that shard -- the "map" and "reduce" parts of the
        # computation happen within this shard.
        # Output: Tuple[slicer.SliceKeyType, MetricVariablesType]
        |
        'Aggregate' >> _Aggregate(eval_saved_model_path=eval_saved_model_path,
                                  add_metrics_callbacks=add_metrics_callbacks,
                                  shared_handle=shared_handle,
                                  desired_batch_size=desired_batch_size)

        # Different metrics for a given slice key are brought together.
        | 'ExtractOutput' >> _ExtractOutput(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks,
            shared_handle=shared_handle))
Example #23
  def expand(self, dataset_and_transform_fn):
    """Transforms the dataset using the transform_fn.

    Args:
      dataset_and_transform_fn: A tuple of dataset and preprocessing
      function.

    Returns:
      A dataset transformed according to the transform_fn.
    """
    (input_values, input_metadata), (transform_fn, output_metadata) = (
        dataset_and_transform_fn)

    # If exclude_outputs is set, update the output metadata.
    if self._exclude_outputs is not None:
      if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata):
        # Unwrap BeamDatasetMetadata into DatasetMetadata and pcollections dict.
        output_metadata, pcollections = output_metadata
        schema = output_metadata.schema
        # Update DatasetMetadata to remove excluded outputs
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))
        # Update pcollections to keep only pcollections that resolve futures in
        # the updated metadata.
        unresolved_future_names = set(
            future.name for future in output_metadata.substitute_futures({}))
        pcollections = {
            name: pcollection
            for name, pcollection in six.iteritems(pcollections)
            if name in unresolved_future_names
        }
        # Wrap DatasetMetadata and pcollections as BeamDatasetMetadata
        output_metadata = beam_metadata_io.BeamDatasetMetadata(
            output_metadata, pcollections)
      else:
        schema = output_metadata.schema
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))

    serialized_tf_config = (
        common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            self.pipeline.runner))

    output_instances = (
        input_values
        | 'Batch' >> _BatchElements()
        | 'Transform' >> beam.ParDo(
            _RunMetaGraphDoFn(
                input_metadata.schema,
                serialized_tf_config,
                shared_graph_state_handle=shared.Shared(),
                passthrough_keys=Context.get_passthrough_keys(),
                exclude_outputs=self._exclude_outputs),
            saved_model_dir=beam.pvalue.AsSingleton(transform_fn))
        | 'ConvertAndUnbatch' >> beam.FlatMap(
            _convert_and_unbatch_to_instance_dicts,
            schema=output_metadata.schema,
            passthrough_keys=Context.get_passthrough_keys()))

    _clear_shared_state_after_barrier(self.pipeline, output_instances)

    return (output_instances, output_metadata)
Example #24
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    graph = tf.Graph()
    with graph.as_default():

      with tf.name_scope('inputs'):
        inputs = input_schema.as_batched_placeholders()
      # In order to avoid a bug where import_graph_def fails when the input_map
      # and return_elements of an imported graph are the same (b/34288791), we
      # avoid using the placeholder of an input column as an output of a graph.
      # We do this by applying tf.identity to all inputs of the
      # preprocessing_fn.  Note this applies at the level of raw tensors.
      outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

      # At this point we check that the preprocessing_fn has at least one
      # output. This is because if we allowed the output of preprocessing_fn to
      # be empty, we wouldn't be able to determine how many instances to
      # "unbatch" the output into.
      if not outputs:
        raise ValueError('The preprocessing function returned an empty dict')

      if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        raise ValueError(
            'The preprocessing function contained trainable variables '
            '{}'.format(
                graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

      # NOTE: it's important that create_phases is called directly after
      # preprocessing_fn, because we later mutate the graph's TABLE_INITIALIZERS
      # collection which would break the logic in create_phases.
      phases = impl_helper.create_phases()

      # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
      # names to singleton PCollections containing a _TensorValue.  We compute
      # tensor_pcoll_mapping in phases, where at each phase we compute the
      # analyzers that are ready to run and update tensor_pcoll_mapping.
      tensor_pcoll_mapping = {}
      table_initializers = graph.get_collection_ref(
          tf.GraphKeys.TABLE_INITIALIZERS)
      original_table_initializers = list(table_initializers)
      del table_initializers[:]

      serialized_tf_config = (
          common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
              input_values.pipeline.runner))
      for level, phase in enumerate(phases):
        # Create a SavedModel that describes the mapping from the input data
        # to the inputs of the analyzers at this level.  The column names of the
        # outputs are the tensor names of the analyzer inputs in the graph.
        # This graph has the analyzer outputs computed so far replaced with
        # constants.
        analyzer_inputs = {}
        for analyzer in phase.analyzers:
          for input_tensor in analyzer.inputs:
            analyzer_inputs[input_tensor.name] = input_tensor
        table_initializers.extend(phase.table_initializers)
        unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, analyzer_inputs,
                               unbound_saved_model_dir)
        saved_model_dir = (
            tensor_pcoll_mapping
            | 'CreateSavedModelForAnalyzerInputs[%d]' % level >>
            _ReplaceTensorsWithConstants(unbound_saved_model_dir, base_temp_dir,
                                         input_values.pipeline))

        # Run this saved model on the input dataset to obtain the inputs to the
        # analyzers.
        analyzer_input_values = (
            input_values
            | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements()
            | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
                _RunMetaGraphDoFn(
                    input_schema,
                    serialized_tf_config,
                    shared_graph_state_handle=shared.Shared(),
                    passthrough_keys=Context.get_passthrough_keys()),
                saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

        # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
        # map from tensor names to singleton PCollections of `_TensorValue`s.
        analyzer_outputs_dict = (
            analyzer_input_values
            | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs(
                phase.analyzers, base_temp_dir))

        # Update the mapping for all analyzers.
        tensor_pcoll_mapping.update(analyzer_outputs_dict)

      del table_initializers[:]
      table_initializers.extend(original_table_initializers)
      saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(graph, inputs, outputs, saved_model_dir)
      transform_fn = (
          tensor_pcoll_mapping
          | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
              saved_model_dir, base_temp_dir, input_values.pipeline))

      # Infer metadata.  The metadata may contain Futures that refer to the
      # values of tensors in the graph.  In that case, the tensors must be
      # "constant" in that they don't depend on input data.  The tensors can
      # depend on analyzer outputs though.  This allows us to set metadata that
      # depends on analyzer outputs.
      #
      # We first extract the names of the tensors that are referenced by the
      # Futures, and then compute them by calling _ComputeScalarConstants with
      # the tensor-PCollection mapping representing the analyzer outputs.
      metadata = dataset_metadata.DatasetMetadata(
          schema=impl_helper.infer_feature_schema(outputs))

      deferred_metadata_tensor_names = {
          future.name
          for column_schema in metadata.schema.column_schemas.values()
          for future in column_schema.substitute_futures({})
      }
      name_pcoll_dict = (
          tensor_pcoll_mapping
          | 'ComputeTensorValues' >>
          _ComputeTensorValues(deferred_metadata_tensor_names, saved_model_dir,
                               input_values.pipeline))
      full_metadata = beam_metadata_io.BeamDatasetMetadata(
          metadata, name_pcoll_dict)

      _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

      return transform_fn, full_metadata
Example #25
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """

    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    # NOTE: it's important that create_phases is called directly after
    # run_preprocessing_fn, because we later mutate the graph's
    # TABLE_INITIALIZERS collection which would break the logic in
    # create_phases.
    graph, inputs, outputs = impl_helper.run_preprocessing_fn(
        self._preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)

    # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
    # names to singleton PCollections containing a _TensorValue.  We compute
    # tensor_pcoll_mapping in phases, where at each phase we compute the
    # analyzers that are ready to run and update tensor_pcoll_mapping.
    tensor_pcoll_mapping = {}
    table_initializers = graph.get_collection_ref(
        tf.GraphKeys.TABLE_INITIALIZERS)
    original_table_initializers = list(table_initializers)
    del table_initializers[:]

    serialized_tf_config = (
        analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            input_values.pipeline.runner))
    for level, phase in enumerate(phases):
      # Create a SavedModel that describes the mapping from the input data
      # to the inputs of the analyzers at this level.  The column names of the
      # outputs are the tensor names of the analyzer inputs in the graph.  This
      # graph has the analyzer outputs computed so far replaced with constants.
      analyzer_inputs = {}
      for analyzer in phase.analyzers:
        for input_tensor in analyzer.inputs:
          analyzer_inputs[input_tensor.name] = input_tensor
      table_initializers.extend(phase.table_initializers)
      unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(
          graph, inputs, analyzer_inputs, unbound_saved_model_dir)
      saved_model_dir = (
          tensor_pcoll_mapping
          | 'CreateSavedModelForAnaylzerInputs[%d]' % level
          >> _ReplaceTensorsWithConstants(
              unbound_saved_model_dir, base_temp_dir, input_values.pipeline))

      # Run this saved model on the input dataset to obtain the inputs to the
      # analyzers.
      analyzer_input_values = (
          input_values
          | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
              _RunMetaGraphDoFn(
                  input_schema,
                  serialized_tf_config,
                  shared_graph_state_handle=shared.Shared()),
              saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

      # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
      # map from tensor names to singleton PCollections of `_TensorValue`s.
      analyzer_outputs_dict = (
          analyzer_input_values
          | 'ComputeAnalyzerOutputs[%d]' % level
          >> _ComputeAnalyzerOutputs(phase.analyzers, base_temp_dir))

      # Update the mapping for all analyzers.
      tensor_pcoll_mapping.update(analyzer_outputs_dict)

    del table_initializers[:]
    table_initializers.extend(original_table_initializers)
    saved_model_dir = _make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(graph, inputs, outputs, saved_model_dir)
    transform_fn = (
        tensor_pcoll_mapping
        | 'ReplaceTensorsWithConstants'
        >> _ReplaceTensorsWithConstants(
            saved_model_dir, base_temp_dir, input_values.pipeline))

    # Infer metadata.  The metadata may contain Futures that refer to the values
    # of tensors in the graph.  In that case, the tensors must be "constant" in
    # that they don't depend on input data.  The tensors can depend on analyzer
    # outputs though.  This allows us to set metadata that depends on analyzer
    # outputs.
    #
    # We first extract the names of the tensors that are referenced by the
    # Futures, and then compute them by calling _ComputeScalarConstants with the
    # tensor-PCollection mapping representing the analyzer outputs.
    metadata = dataset_metadata.DatasetMetadata(
        schema=impl_helper.infer_feature_schema(graph, outputs))

    deferred_metadata_tensor_names = [
        future.name
        for column_schema in tft_api.get_column_schemas(graph).values()
        for future in column_schema.substitute_futures({})]
    name_pcoll_dict = (
        tensor_pcoll_mapping
        | 'ComputeTensorValues' >>
        _ComputeTensorValues(
            deferred_metadata_tensor_names, saved_model_dir,
            input_values.pipeline))
    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, name_pcoll_dict)

    _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

    return transform_fn, full_metadata