    def test_cache_merge(self):
        base_test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        dataset_key_0 = analyzer_cache.DatasetKey('dataset_key_0')
        dataset_key_1 = analyzer_cache.DatasetKey('dataset_key_1')
        dataset_keys = (dataset_key_0, dataset_key_1)
        cache_keys = list('abcd')

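        # Helper that reads back the manifest written for each dataset key; a
        # manifest maps cache entry keys to the indices under which they were
        # stored.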
        def read_manifests():
            return [
                analyzer_cache._ManifestFile(
                    analyzer_cache._get_dataset_cache_path(base_test_dir,
                                                           key)).read()
                for key in dataset_keys
            ]

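        # First write: dataset_key_0 holds cache entries 'a'/'b' and
        # dataset_key_1 holds 'c'/'d'.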
        with beam.Pipeline() as p:
            cache_pcoll_dict = {
                dataset_key_0: {
                    'a': p | 'CreateA' >> beam.Create([b'a']),
                    'b': p | 'CreateB' >> beam.Create([b'b']),
                },
                dataset_key_1: {
                    'c': p | 'CreateC' >> beam.Create([b'c']),
                    'd': p | 'CreateD' >> beam.Create([b'd']),
                },
            }
            _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
                p, base_test_dir, dataset_keys)

        first_manifests = read_manifests()

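        # Second write: the cache entries are swapped between the two dataset
        # keys, so each dataset's existing cache should be merged with the new
        # entries.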
        with beam.Pipeline() as p:
            cache_pcoll_dict = {
                dataset_key_0: {
                    'c': p | 'CreateC' >> beam.Create([b'c']),
                    'd': p | 'CreateD' >> beam.Create([b'd']),
                },
                dataset_key_1: {
                    'a': p | 'CreateA' >> beam.Create([b'a']),
                    'b': p | 'CreateB' >> beam.Create([b'b']),
                },
            }
            _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
                p, base_test_dir, dataset_keys)

        second_manifests = read_manifests()
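        # After the merge, every original manifest entry must still be present
        # and each manifest should now cover all four cache keys.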
        self.assertEqual(len(first_manifests), len(second_manifests))
        for manifest_a, manifest_b in zip(first_manifests, second_manifests):
            for key_value_pair in manifest_a.items():
                self.assertIn(key_value_pair, manifest_b.items())

            self.assertEqual(2, len(manifest_a))
            self.assertCountEqual(range(len(manifest_a)), manifest_a.values())

            self.assertEqual(4, len(manifest_b))
            self.assertCountEqual(range(len(manifest_b)), manifest_b.values())
            self.assertCountEqual(cache_keys, manifest_b.keys())
Example #2
  def test_validate_dataset_keys(self):
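    # All of these keys match the allowed pattern and should validate cleanly.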
    analyzer_cache.validate_dataset_keys({
        analyzer_cache.DatasetKey(k)
        for k in ('foo', 'Foo', 'A1', 'A_1', 'A.1', 'A-1', 'foo@1', 'foo*',
                  'foo[]', 'foo/goo')
    })

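    # Keys containing characters such as '^' or whitespace must be rejected.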
    for key in {analyzer_cache.DatasetKey(k) for k in ('^foo^', 'foo 1')}:
      with self.assertRaisesRegex(
          ValueError, 'Dataset key .* does not match allowed pattern:'):
        analyzer_cache.validate_dataset_keys({key})
Example #3
  def test_cache_write_empty(self):
    base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

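    # Writing an empty cache dict is a no-op and must not create the cache
    # directory.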
    with beam.Pipeline() as p:
      _ = {} | analyzer_cache.WriteAnalysisCacheToFS(
          p, base_test_dir, (analyzer_cache.DatasetKey('dataset_key_0'),))
    self.assertFalse(os.path.isdir(base_test_dir))
Example #4
  def test_cache_helpers_round_trip(self):
    base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    dataset_key_0 = analyzer_cache.DatasetKey('dataset_key_0')
    dataset_key_1 = analyzer_cache.DatasetKey('dataset_key_1')
    dataset_keys = (dataset_key_0, dataset_key_1)

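    # Write a few cache entries, keyed by raw byte strings, for both datasets.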
    with beam.Pipeline() as p:
      cache_pcoll_dict = {
          dataset_key_0: {
              b'\x8a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
              b'\x8b': p | 'CreateB' >> beam.Create([b'[5]']),
              b'\x8b1': p | 'CreateB1' >> beam.Create([b'[6]']),
          },
          dataset_key_1: {
              b'\x8c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
          },
      }

      _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, base_test_dir, dataset_keys)

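    # Read the cache back and verify that every entry round-trips unchanged.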
    with beam.Pipeline() as p:
      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          base_test_dir, list(cache_pcoll_dict.keys()),
          [b'\x8a', b'\x8b', b'\x8c'])

      beam_test_util.assert_that(
          read_cache[dataset_key_0][b'\x8a'],
          beam_test_util.equal_to([b'[1, 2, 3]']),
          label='AssertA')
      beam_test_util.assert_that(
          read_cache[dataset_key_0][b'\x8b'],
          beam_test_util.equal_to([b'[5]']),
          label='AssertB')
      beam_test_util.assert_that(
          read_cache[dataset_key_1][b'\x8c'],
          beam_test_util.equal_to([b'[9, 5, 2, 1]']),
          label='AssertC')
Example #5
  def test_cache_helpers_with_alternative_io(self):

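    # A minimal sink that writes each cache value to a single local file,
    # standing in for the default cache sink.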
    class LocalSink(beam.PTransform):

      def __init__(self, path):
        super().__init__()
        self._path = path

      def expand(self, pcoll):

        def write_to_file(value):
          tf.io.gfile.makedirs(self._path)
          with open(os.path.join(self._path, 'cache'), 'wb') as f:
            f.write(value)

        return pcoll | beam.Map(write_to_file)

    dataset_key = analyzer_cache.DatasetKey('a')
    test_cache_dict = {dataset_key: {'b': [bytes([17, 19, 27, 31])]}}

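    # A matching source that ignores the given path and replays the in-memory
    # test data instead.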
    class LocalSource(beam.PTransform):

      def __init__(self, path):
        super().__init__()
        del path

      def expand(self, pbegin):
        return pbegin | beam.Create([test_cache_dict[dataset_key]['b']])

    dataset_keys = list(test_cache_dict.keys())
    cache_dir = self.get_temp_dir()
    with beam.Pipeline() as p:
      _ = test_cache_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, cache_dir, dataset_keys, sink=LocalSink)

      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          cache_dir, dataset_keys, source=LocalSource)

      self.assertCountEqual(read_cache.keys(), [dataset_key])
      self.assertCountEqual(read_cache[dataset_key].keys(), ['b'])

      beam_test_util.assert_that(
          read_cache[dataset_key]['b'],
          beam_test_util.equal_to([test_cache_dict[dataset_key]['b']]))
Example #6
  def __init__(self, file_pattern: Text,
               materialize_output_path: Optional[Text] = None):
    """Initialize a Dataset.

    Args:
      file_pattern: The file pattern of the dataset.
      materialize_output_path: The file path where the materialized dataset
        should be written.
    """
    file_pattern_suffix = os.path.join(
        *file_pattern.split(os.sep)[-self._FILE_PATTERN_SUFFIX_LENGTH:])
    self._file_pattern = file_pattern
    self._materialize_output_path = materialize_output_path
    self._index = None
    self._serialized = None
    self._decoded = None
    self._transformed = None
    self._transformed_and_serialized = None
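    # Some tensorflow_transform versions expose a DatasetKey class, while
    # others only provide make_dataset_key; handle both.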
    if hasattr(analyzer_cache, 'DatasetKey'):
      self._dataset_key = analyzer_cache.DatasetKey(file_pattern_suffix)
    else:
      self._dataset_key = analyzer_cache.make_dataset_key(file_pattern_suffix)
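    # Debug output: show the dataset key derived from the file pattern suffix.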
    print('-'*50)
    print(self._dataset_key)
    print('-'*50)