Example #1
  def _OptimizeRun(
      pipeline: beam.Pipeline, input_cache_dir: Text, output_cache_dir: Text,
      analyze_data_list: List[_Dataset], feature_spec: Mapping[Text, Any],
      preprocessing_fn: Any, cache_source: beam.PTransform
  ) -> Tuple[Dict[Text, Optional[_Dataset]], Optional[Dict[Text, Dict[
      Text, beam.pvalue.PCollection]]], bool]:
    """Utilizes TFT cache if applicable and removes unused datasets."""

    analysis_key_to_dataset = {
        analyzer_cache.make_dataset_key(dataset.file_pattern_suffix): dataset
        for dataset in analyze_data_list
    }
    if input_cache_dir is not None:
      input_cache = pipeline | analyzer_cache.ReadAnalysisCacheFromFS(
          input_cache_dir,
          list(analysis_key_to_dataset.keys()),
          source=cache_source)
    elif output_cache_dir is not None:
      input_cache = {}
    else:
      # Using None here to indicate that this pipeline will not read or write
      # cache.
      input_cache = None

    if input_cache is None:
      # Cache is disabled so we won't be filtering out any datasets, and will
      # always perform a flatten over all of them.
      filtered_analysis_dataset_keys = list(analysis_key_to_dataset.keys())
      flat_data_required = True
    else:
      filtered_analysis_dataset_keys, flat_data_required = (
          tft_beam.analysis_graph_builder.get_analysis_dataset_keys(
              preprocessing_fn, feature_spec,
              list(analysis_key_to_dataset.keys()), input_cache))
    if len(filtered_analysis_dataset_keys) < len(analysis_key_to_dataset):
      tf.logging.info('Not reading the following datasets due to cache: %s', [
          v.file_pattern_suffix
          for k, v in analysis_key_to_dataset.items()
          if k not in filtered_analysis_dataset_keys
      ])

    new_analyze_data_dict = {}
    for key, dataset in six.iteritems(analysis_key_to_dataset):
      if key in filtered_analysis_dataset_keys:
        new_analyze_data_dict[key] = dataset
      else:
        new_analyze_data_dict[key] = None

    return (new_analyze_data_dict, input_cache, flat_data_required)
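The helper above keeps every dataset key in the returned dict and maps the keys whose analysis is fully served by cache to None; the caller is expected to skip those entries and to flatten the remaining data only when flat_data_required is True. A minimal sketch of that consumption pattern follows; read_dataset is a hypothetical stand-in for the executor's own dataset-reading transform and is not part of the code above.

# Sketch only: one way a caller might consume _OptimizeRun's outputs.
# `read_dataset` is a hypothetical helper, not defined in the snippet above.
new_analyze_data_dict, input_cache, flat_data_required = _OptimizeRun(
    pipeline, input_cache_dir, output_cache_dir, analyze_data_list,
    feature_spec, preprocessing_fn, cache_source)

analyze_data_pcolls = {
    key: read_dataset(pipeline, dataset)
    for key, dataset in new_analyze_data_dict.items()
    # A None value means this dataset's analysis is fully served by cache,
    # so its data does not need to be read at all.
    if dataset is not None
}

if flat_data_required:
  # The analysis still needs a single flattened view over all datasets
  # (always the case when cache is disabled).
  flat_data = (
      tuple(analyze_data_pcolls.values())
      | 'FlattenAnalysisData' >> beam.Flatten(pipeline=pipeline))
else:
  flat_data = None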
Example #2
  def test_cache_helpers_with_alternative_io(self):

    class LocalSink(beam.PTransform):

      def __init__(self, path):
        self._path = path

      def expand(self, pcoll):

        def write_to_file(value):
          # `value` is one encoded cache entry; write it under the path this
          # sink was constructed with.
          tf.io.gfile.makedirs(self._path)
          with open(os.path.join(self._path, 'cache'), 'wb') as f:
            f.write(value)

        return pcoll | beam.Map(write_to_file)

    test_cache_dict = {
        analyzer_cache.DatasetKey('a'): {
            'b': [bytes([17, 19, 27, 31])]
        }
    }

    class LocalSource(beam.PTransform):

      def __init__(self, path):
        # The path is ignored; this source serves the in-memory test data.
        del path

      def expand(self, pbegin):
        return pbegin | beam.Create([test_cache_dict['a']['b']])

    dataset_keys = list(test_cache_dict.keys())
    cache_dir = self.get_temp_dir()
    with beam.Pipeline() as p:
      _ = test_cache_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, cache_dir, dataset_keys, sink=LocalSink)

      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          cache_dir, dataset_keys, source=LocalSource)

      self.assertItemsEqual(read_cache.keys(), ['a'])
      self.assertItemsEqual(read_cache['a'].keys(), ['b'])

      beam_test_util.assert_that(
          read_cache['a']['b'],
          beam_test_util.equal_to([test_cache_dict['a']['b']]))
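The test above treats the sink and source hooks as PTransform classes that the cache helpers construct with a path: the sink is applied to a PCollection of encoded cache values, and the source is expected to replay stored values for the same path. A rough in-memory variant following that contract might look like the sketch below; _IN_MEMORY_CACHE, InMemorySink, and InMemorySource are illustrative names, and the approach assumes an in-process runner so both sides see the same dict.

# Sketch only: an in-memory sink/source pair for the same plug-in contract.
_IN_MEMORY_CACHE = {}


class InMemorySink(beam.PTransform):
  """Appends each encoded cache value to an in-process dict keyed by path."""

  def __init__(self, path):
    self._path = path

  def expand(self, pcoll):
    return pcoll | beam.Map(
        lambda value: _IN_MEMORY_CACHE.setdefault(self._path, []).append(value))


class InMemorySource(beam.PTransform):
  """Replays previously stored cache values for the given path."""

  def __init__(self, path):
    self._path = path

  def expand(self, pbegin):
    # The dict is read at pipeline-construction time, so this only works when
    # the cache was written by an earlier pipeline run in the same process.
    return pbegin | beam.Create(_IN_MEMORY_CACHE.get(self._path, []))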
Example #3
  def test_cache_helpers_round_trip(self):
    base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    with beam.Pipeline() as p:
      cache_pcoll_dict = {
          'dataset_key_0': {
              'a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
              'b': p | 'CreateB' >> beam.Create([b'[5]']),
          },
          'dataset_key_1': {
              'c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
          },
      }
      _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
          base_test_dir)

    with beam.Pipeline() as p:
      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          base_test_dir, list(cache_pcoll_dict.keys()))

      def assert_equal_matcher(expected_encoded):

        def _assert_equal(encoded_cache_list):
          (encoded_cache,) = encoded_cache_list
          self.assertEqual(expected_encoded, encoded_cache)

        return _assert_equal

      beam_test_util.assert_that(
          read_cache['dataset_key_0']['a'],
          beam_test_util.equal_to([b'[1, 2, 3]']),
          label='AssertA')
      beam_test_util.assert_that(
          read_cache['dataset_key_0']['b'],
          assert_equal_matcher(b'[5]'),
          label='AssertB')
      beam_test_util.assert_that(
          read_cache['dataset_key_1']['c'],
          assert_equal_matcher(b'[9, 5, 2, 1]'),
          label='AssertC')
Example #4
  def test_cache_helpers_round_trip(self):
    base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    dataset_key_0 = analyzer_cache.DatasetKey('dataset_key_0')
    dataset_key_1 = analyzer_cache.DatasetKey('dataset_key_1')
    dataset_keys = (dataset_key_0, dataset_key_1)

    with beam.Pipeline() as p:
      cache_pcoll_dict = {
          dataset_key_0: {
              b'\x8a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
              b'\x8b': p | 'CreateB' >> beam.Create([b'[5]']),
              b'\x8b1': p | 'CreateB1' >> beam.Create([b'[6]']),
          },
          dataset_key_1: {
              b'\x8c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
          },
      }

      _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, base_test_dir, dataset_keys)

    with beam.Pipeline() as p:
      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          base_test_dir, list(cache_pcoll_dict.keys()),
          [b'\x8a', b'\x8b', b'\x8c'])

      beam_test_util.assert_that(
          read_cache[dataset_key_0][b'\x8a'],
          beam_test_util.equal_to([b'[1, 2, 3]']),
          label='AssertA')
      beam_test_util.assert_that(
          read_cache[dataset_key_0][b'\x8b'],
          beam_test_util.equal_to([b'[5]']),
          label='AssertB')
      beam_test_util.assert_that(
          read_cache[dataset_key_1][b'\x8c'],
          beam_test_util.equal_to([b'[9, 5, 2, 1]']),
          label='AssertC')
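Compared with Example #3, this version keys the cache dict with analyzer_cache.DatasetKey objects and passes ReadAnalysisCacheFromFS an explicit list of cache entry keys, so only those entries are read back (b'\x8b1' is written but never requested). Reusing the names defined above, a read restricted to a single entry might look like this sketch:

# Sketch, reusing the names from the test above: read back just one cache
# entry key for one dataset.
with beam.Pipeline() as p:
  partial_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
      base_test_dir, [dataset_key_0], [b'\x8a'])

  beam_test_util.assert_that(
      partial_cache[dataset_key_0][b'\x8a'],
      beam_test_util.equal_to([b'[1, 2, 3]']))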
Example #5
    def test_single_phase_run_twice(self):

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            _ = tft.vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.io.FixedLenFeature([], tf.float32),
                'y':
                tf.io.FixedLenFeature([], tf.float32),
                's':
                tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'a',
            }, {
                'x': 4,
                'y': -4,
                's': 'a',
            }],
            span_1_key:
            input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # Wrap each value in input_data_dict as a PCollection.
                input_data_pcoll_dict = {}
                for a, b in six.iteritems(input_data_dict):
                    input_data_pcoll_dict[a] = p | a >> beam.Create(b)

                transform_fn_1, cache_output = (
                    (flat_data, input_data_pcoll_dict, {}, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = (cache_output | 'WriteCache' >>
                     analyzer_cache.WriteAnalysisCacheToFS(self._cache_dir))

                transformed_dataset = (
                    ((input_data_pcoll_dict[span_1_key], input_metadata),
                     transform_fn_1)
                    | 'Transform' >> beam_impl.TransformDataset())

                del input_data_pcoll_dict
                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed_data = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='first')

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn_1')
                _ = transform_fn_1 | tft_beam.WriteTransformFn(
                    transform_fn_dir)

                for key in input_data_dict:
                    self.assertIn(key, cache_output)
                    self.assertEqual(6, len(cache_output[key]))

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # Wrap each value in input_data_dict as a PCollection.
                input_data_pcoll_dict = {}
                for a, b in six.iteritems(input_data_dict):
                    input_data_pcoll_dict[a] = p | a >> beam.Create(b)

                input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
                    self._cache_dir, list(input_data_dict.keys()))

                transform_fn_2, second_output_cache = (
                    (flat_data, input_data_pcoll_dict, input_cache,
                     input_metadata)
                    | 'AnalyzeAgain' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn_2)
                    | 'TransformAgain' >> beam_impl.TransformDataset())
                transformed_data, unused_transformed_metadata = transformed_dataset
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='second')

        self.assertFalse(second_output_cache)
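Stripped of the test scaffolding, the two runs follow a small pattern: the first analysis gets an empty cache dict and its cache output is written to disk; the second run reads that cache back, and the final assertFalse checks that no new cache entries were produced. A condensed sketch of just that flow, using the same names as the test (each run builds its PCollections in its own pipeline, and cache_dir stands in for self._cache_dir):

# First run: analyze with an empty input cache and persist the cache output.
transform_fn_1, cache_output = (
    (flat_data, input_data_pcoll_dict, {}, input_metadata)
    | 'Analyze' >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
_ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
    cache_dir)

# Second run (a new pipeline p): read the cache back and analyze again;
# analyzers covered by the cache are served from it instead of recomputed.
input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
    cache_dir, list(input_data_dict.keys()))
transform_fn_2, second_output_cache = (
    (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
    | 'AnalyzeAgain' >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))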
Example #6
  def test_single_phase_run_twice(self):

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):

      _ = tft.vocabulary(inputs['s'], vocab_filename='vocab1')

      _ = tft.bucketize(inputs['x'], 2, name='bucketize')

      return {
          'x_min':
              tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'x_mean':
              tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'y_min':
              tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          'y_mean':
              tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          's_integerized':
              tft.compute_and_apply_vocabulary(
                  inputs['s'],
                  labels=inputs['label'],
                  use_adjusted_mutual_info=True),
      }

    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'a',
            'label': 0,
        }, {
            'x': 4,
            'y': -4,
            's': 'a',
            'label': 1,
        }, {
            'x': 5,
            'y': 11,
            's': 'a',
            'label': 1,
        }, {
            'x': 1,
            'y': -4,
            's': u'ȟαŽ₯π’‹Η©ΔΎαΈΏκž‘Θ―π˜±π‘žπ—‹π˜΄'.encode('utf-8'),
            'label': 1,
        }],
        span_1_key: [{
            'x': 12,
            'y': 1,
            's': u'ȟαŽ₯π’‹Η©ΔΎαΈΏκž‘Θ―π˜±π‘žπ—‹π˜΄'.encode('utf-8'),
            'label': 0
        }, {
            'x': 10,
            'y': 1,
            's': 'c',
            'label': 1
        }],
    }
    expected_vocabulary_contents = np.array(
        [b'a', u'ȟαŽ₯π’‹Η©ΔΎαΈΏκž‘Θ―π˜±π‘žπ—‹π˜΄'.encode('utf-8'), b'c'],
        dtype=object)
    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # Wrap each value in input_data_dict as a PCollection.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      transform_fn_1, cache_output = (
          (flat_data, input_data_pcoll_dict, {}, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = (
          cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
              self._cache_dir))

      transformed_dataset = ((
          (input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_1)
                             | 'Transform' >> beam_impl.TransformDataset())

      del input_data_pcoll_dict
      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed_data = [
          {
              'x_mean': 5.0,
              'x_min': -2.0,
              'y_mean': 1.0,
              'y_min': -4.0,
              's_integerized': 0,
          },
          {
              'x_mean': 5.0,
              'x_min': -2.0,
              'y_mean': 1.0,
              'y_min': -4.0,
              's_integerized': 2,
          },
      ]
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
      _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

      for key in input_data_dict:
        self.assertIn(key, cache_output)
        self.assertEqual(7, len(cache_output[key]))

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    # 6 instances from analyzing the 2 spans, plus 2 from the transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # Wrap each value in input_data_dict as a PCollection.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          self._cache_dir, list(input_data_dict.keys()))

      transform_fn_2, second_output_cache = (
          (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
          | 'AnalyzeAgain' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      dot_string = nodes.get_dot_graph([analysis_graph_builder._ANALYSIS_GRAPH
                                       ]).to_string()
      self.WriteRenderedDotFile(dot_string)

      transformed_dataset = ((
          (input_data_dict[span_1_key], input_metadata), transform_fn_2)
                             | 'TransformAgain' >> beam_impl.TransformDataset())
      transformed_data, unused_transformed_metadata = transformed_dataset
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='second')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_2')
      _ = transform_fn_2 | tft_beam.WriteTransformFn(transform_fn_dir)

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    self.assertFalse(second_output_cache)

    # Only 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 2)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 0)

    # The root CreateSavedModel is optimized away because the data doesn't get
    # processed at all (only cache).
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 1)
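_TestPipeline and _get_counter_value are helpers defined elsewhere in the test module and are not shown here. Assuming p.metrics is the Beam MetricResults of the finished pipeline, a counter-reading helper along the following lines would satisfy the calls above; this is a guess at the helper, not its actual implementation.

from apache_beam.metrics.metric import MetricsFilter


def _get_counter_value(metrics_result, name):
  # Sum the committed values of every counter matching `name`; an empty
  # query result means the counter was never incremented.
  counters = metrics_result.query(MetricsFilter().with_name(name))['counters']
  return sum(counter.committed for counter in counters)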