Ejemplo n.º 1
0
    def test_split_points(self):
        """Check the split points reported while reading a single bundle.

        Writes 12000 records, splits the source with an infinite desired
        bundle size (exactly one bundle is expected), and records the value
        of ``range_tracker.split_points()`` after every record read.
        """
        file_name = self._write_data(count=12000)
        source = AvroSource(file_name)

        splits = list(source.split(desired_bundle_size=float('inf')))
        # A unittest assertion is used instead of a bare ``assert`` so the
        # check still runs when Python is started with -O.
        self.assertEqual(len(splits), 1)
        bundle = splits[0]

        range_tracker = bundle.source.get_range_tracker(
            bundle.start_position, bundle.stop_position)

        # One entry per record read, captured while the read is in progress.
        split_points_report = []
        for _ in bundle.source.read(range_tracker):
            split_points_report.append(range_tracker.split_points())

        # There are a total of three blocks. Each block has more than 10
        # records.

        # When reading records of the first block,
        # range_tracker.split_points() should return
        # (0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN).
        # ``assertEqual`` replaces the deprecated ``assertEquals`` alias.
        self.assertEqual(
            split_points_report[:10],
            [(0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)] * 10)

        # When reading records of the last block,
        # range_tracker.split_points() should return (2, 1).
        self.assertEqual(split_points_report[-10:], [(2, 1)] * 10)
Ejemplo n.º 2
0
 def test_read_reantrant_with_splitting(self):
     """Run the reentrant-read check on the single bundle from split()."""
     avro_source = AvroSource(self._write_data())
     bundles = list(avro_source.split(desired_bundle_size=100000))
     # The check below only looks at the first bundle, so there must be
     # exactly one.
     assert len(bundles) == 1
     only_bundle = bundles[0]
     source_info = (only_bundle.source, only_bundle.start_position,
                    only_bundle.stop_position)
     source_test_utils.assertReentrantReadsSucceed(source_info)
Ejemplo n.º 3
0
  def test_source_display_data(self):
    """Verify the display data items of an AvroSource built unvalidated."""
    pattern = 'some_avro_source'
    dd = DisplayData.create_from(AvroSource(pattern, validate=False))

    # No extra avro parameters for AvroSource.
    matchers = (
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher('file_pattern', pattern),
    )
    # Item order is irrelevant; only the set of items is compared.
    hc.assert_that(dd.items, hc.contains_inanyorder(*matchers))
Ejemplo n.º 4
0
 def test_dynamic_work_rebalancing_exhaustive(self):
   """Run the exhaustive split-at-fraction check on a five-record file.

   avro.datafile.SYNC_INTERVAL is overridden for the duration of the test
   and restored afterwards, whether or not the test passes.
   """
   # Adjust the block size so the exhaustive dynamic work rebalancing test
   # completes within an acceptable amount of time.
   saved_interval = avro.datafile.SYNC_INTERVAL
   try:
     avro.datafile.SYNC_INTERVAL = 2
     avro_source = AvroSource(self._write_data(count=5))
     bundles = list(avro_source.split(desired_bundle_size=float('inf')))
     assert len(bundles) == 1
     source_test_utils.assert_split_at_fraction_exhaustive(bundles[0].source)
   finally:
     # Always put the module-level setting back.
     avro.datafile.SYNC_INTERVAL = saved_interval
Ejemplo n.º 5
0
  def test_corrupted_file(self):
    """Reading a file with a corrupted final sync marker raises ValueError.

    A valid file is written, its last byte is replaced with a different
    byte, and the result is saved to a new temporary file. Reading that file
    must raise a ValueError whose message starts with
    'Unexpected sync marker'.
    """
    file_name = self._write_data()
    with open(file_name, 'rb') as f:
      data = f.read()

    # Corrupt the last byte of the file, which is also the last byte of the
    # last sync_marker. Slicing and bytes literals are used (instead of
    # indexing and str literals) so this works for both Python 2 str and
    # Python 3 bytes.
    replacement = b'A' if data[-1:] == b'B' else b'B'
    corrupted_data = data[:-1] + replacement
    with tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template) as f:
      f.write(corrupted_data)
      corrupted_file_name = f.name

    source = AvroSource(corrupted_file_name)
    with self.assertRaises(ValueError) as exn:
      source_test_utils.read_from_source(source, None, None)
    # This check must live outside the ``with`` block: statements placed
    # after the raising call inside the block are never executed.
    # ``str(exception)`` is used because ``exception.message`` only exists
    # on Python 2.
    self.assertTrue(
        str(exn.exception).startswith('Unexpected sync marker'))
Ejemplo n.º 6
0
    def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                       expected_result):
        """Read an AvroSource for ``pattern`` and verify what it produces.

        Args:
          pattern: file pattern passed to ``AvroSource``.
          desired_bundle_size: bundle size handed to ``source.split``; must
            be truthy when ``perform_splitting`` is set.
          perform_splitting: when true, split the source and compare the
            splits against the unsplit source; otherwise read the source
            directly and compare against ``expected_result``.
          expected_result: records expected from the direct read. Not used
            when ``perform_splitting`` is set.

        Raises:
          ValueError: if splitting yields fewer than two splits.
        """
        source = AvroSource(pattern)

        if not perform_splitting:
            # Direct read: compare the records, ignoring order.
            records = source_test_utils.readFromSource(source, None, None)
            self.assertItemsEqual(expected_result, records)
            return

        assert desired_bundle_size
        bundles = list(source.split(desired_bundle_size=desired_bundle_size))
        if len(bundles) < 2:
            raise ValueError(
                'Test is trivial. Please adjust it so that at least '
                'two splits get generated')

        # The unsplit source acts as the reference for the splits.
        sources_info = [
            (bundle.source, bundle.start_position, bundle.stop_position)
            for bundle in bundles
        ]
        source_test_utils.assertSourcesEqualReferenceSource(
            (source, None, None), sources_info)
Ejemplo n.º 7
0
 def test_read_reentrant_without_splitting(self):
     """Run the reentrant-read check on the whole, unsplit source."""
     # (source, start_position, stop_position); None bounds are passed for
     # both positions.
     whole_source = (AvroSource(self._write_data()), None, None)
     source_test_utils.assertReentrantReadsSucceed(whole_source)