Example #1
    def test_min_bundle_size(self):
        file_name = self._write_data(count=120, row_group_size=20)

        source = _create_parquet_source(file_name,
                                        min_bundle_size=100 * 1024 * 1024)
        splits = [split for split in source.split(desired_bundle_size=1)]
        self.assertEqual(len(splits), 1)

        source = _create_parquet_source(file_name, min_bundle_size=0)
        splits = [split for split in source.split(desired_bundle_size=1)]
        self.assertNotEqual(len(splits), 1)
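The examples in this listing come from Beam's Parquet I/O test suite and lean on two helpers that are never shown. A minimal sketch of what they might look like, assuming _create_parquet_source wraps Beam's internal _ParquetSource and _write_data is backed by pyarrow; the real helpers in apache_beam/io/parquetio_test.py may differ, and the 'number' column is made up:

import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

from apache_beam.io import source_test_utils  # used by later examples
from apache_beam.io.parquetio import _ParquetSource


def _create_parquet_source(
    file_pattern, min_bundle_size=0, validate=True, columns=None):
  # Assumption: a thin wrapper over Beam's internal _ParquetSource, with
  # parameter names mirroring the keyword arguments used in the examples.
  # validate=True as the default is inferred from Example #4 passing
  # validate=False explicitly for a nonexistent path.
  return _ParquetSource(
      file_pattern,
      min_bundle_size=min_bundle_size,
      validate=validate,
      columns=columns)


def _write_data(count, row_group_size):
  # Hypothetical stand-in for the test-class method self._write_data():
  # writes `count` rows to a temporary Parquet file, `row_group_size` rows
  # per row group, and returns the file's path.
  table = pa.Table.from_pydict({'number': list(range(count))})
  with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as f:
    path = f.name
  pq.write_table(table, path, row_group_size=row_group_size)
  return path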
Example #2
    def test_split_points(self):
        file_name = self._write_data(count=12000, row_group_size=3000)
        source = _create_parquet_source(file_name)

        splits = [
            split for split in source.split(desired_bundle_size=float('inf'))
        ]
        assert len(splits) == 1

        range_tracker = splits[0].source.get_range_tracker(
            splits[0].start_position, splits[0].stop_position)

        split_points_report = []

        for _ in splits[0].source.read(range_tracker):
            split_points_report.append(range_tracker.split_points())

        # There are a total of four row groups, each with 3000 records.

        # split_points() reports a (consumed, remaining) pair. While the
        # first three row groups are being read, the number of remaining
        # split points is unknown; once the reader is on the last row group,
        # exactly one split point (the current, no longer claimable one)
        # remains, hence the final (3, 1).
        self.assertEqual(split_points_report, [
            (0, RangeTracker.SPLIT_POINTS_UNKNOWN),
            (1, RangeTracker.SPLIT_POINTS_UNKNOWN),
            (2, RangeTracker.SPLIT_POINTS_UNKNOWN),
            (3, 1),
        ])
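RangeTracker here is Beam's base range tracker class from apache_beam.io.iobase, and SPLIT_POINTS_UNKNOWN is its sentinel for "remaining split points not known". The snippet presumably imports it as:

from apache_beam.io.iobase import RangeTracker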
Example #3
  def test_dynamic_work_rebalancing(self):
    file_name = self._write_data(count=120, row_group_size=20)
    source = _create_parquet_source(file_name)

    splits = [split for split in source.split(desired_bundle_size=float('inf'))]
    assert len(splits) == 1

    source_test_utils.assert_split_at_fraction_exhaustive(
        splits[0].source, splits[0].start_position, splits[0].stop_position)
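assert_split_at_fraction_exhaustive reads the source and, after every element read, attempts a dynamic split at every fraction, asserting that each attempt either fails cleanly or produces primary and residual ranges consistent with the unsplit read. Because the check is roughly quadratic in the number of elements, the test keeps the file small; a heavier variant (hypothetical, same calls with more row groups) would look like:

  # More row groups give the exhaustive check more candidate split points
  # to exercise, at the cost of a slower test.
  file_name = self._write_data(count=600, row_group_size=20)  # 30 row groups
  source = _create_parquet_source(file_name)

  splits = [split for split in source.split(desired_bundle_size=float('inf'))]
  assert len(splits) == 1

  source_test_utils.assert_split_at_fraction_exhaustive(
      splits[0].source, splits[0].start_position, splits[0].stop_position)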
Example #4
  def test_source_display_data(self):
    file_name = 'some_parquet_source'
    source = _create_parquet_source(file_name, validate=False)
    dd = DisplayData.create_from(source)

    expected_items = [
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher('file_pattern', file_name)]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
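This example additionally assumes PyHamcrest and Beam's display-data utilities; the imports matching the names used above would presumably be:

import hamcrest as hc

from apache_beam.transforms.display import DisplayData
from apache_beam.transforms.display_test import DisplayDataItemMatcher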
Example #5
  def _run_parquet_test(self, pattern, columns, desired_bundle_size,
                        perform_splitting, expected_result):
    source = _create_parquet_source(pattern, columns=columns)
    if perform_splitting:
      assert desired_bundle_size
      sources_info = [
          (split.source, split.start_position, split.stop_position)
          for split in source.split(desired_bundle_size=desired_bundle_size)
      ]
      if len(sources_info) < 2:
        raise ValueError('Test is trivial. Please adjust it so that at least '
                         'two splits get generated')

      source_test_utils.assert_sources_equal_reference_source(
          (source, None, None), sources_info)
    else:
      read_records = source_test_utils.read_from_source(source, None, None)
      self.assertCountEqual(expected_result, read_records)
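A hypothetical call site for this helper, from inside the same test class (file_name and expected are placeholders for data produced by self._write_data). Note that expected_result only matters on the non-splitting path; when perform_splitting is set, the splits are verified against the unsplit source itself:

  # Plain read: records are checked against the expected list.
  self._run_parquet_test(file_name, None, None, False, expected)

  # Split into small bundles and compare them against the unsplit reference
  # source; expected_result is unused on this path.
  self._run_parquet_test(file_name, None, 100, True, None)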
Example #6
  def test_read_reentrant(self):
    file_name = self._write_data(count=6, row_group_size=3)
    source = _create_parquet_source(file_name)
    source_test_utils.assert_reentrant_reads_succeed((source, None, None))
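assert_reentrant_reads_succeed takes the same (source, start_position, stop_position) triple as assert_sources_equal_reference_source above; (source, None, None) selects the whole file. The assertion reads i elements, then starts a second, independent read of the same source, and checks that both reads still produce the full record set, for every i. A simplified picture of the property being exercised:

# Sketch only: the real utility checks every interleaving point and
# compares complete result sets.
tracker = source.get_range_tracker(None, None)
first_read = source.read(tracker)
next(first_read)  # a first read is now mid-flight
# ...a second, full read of the same source must still succeed:
records = source_test_utils.read_from_source(source, None, None)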