Beispiel #1
0
 def test_estimate_size(self):
     """Check estimate_size() of three concatenated RangeSource objects.

     The sub-sources cover 0-10, 10-100 and 100-1000; the concatenation
     is expected to report a size of 1000.
     """
     bounds = ((0, 10), (10, 100), (100, 1000))
     concat = ConcatSource([RangeSource(lo, hi) for lo, hi in bounds])
     self.assertEqual(concat.estimate_size(), 1000)
Beispiel #2
0
 def test_estimate_size(self):
     """Check estimate_size() of a ConcatSource over three DummySource objects.

     Each DummySource wraps a range of ten consecutive integers, so the
     concatenation is expected to report a size of 30.
     """
     sources = [
         TestConcatSource.DummySource(range(start, start + 10))
         for start in [0, 10, 20]
     ]
     concat = ConcatSource(sources)
     # assertEqual instead of the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(30, concat.estimate_size())
Beispiel #3
0
 def test_read(self):
   """Read a ConcatSource of three DummySource objects end to end.

   The values produced must match 0..29, compared without regard to order.
   """
   sub_sources = []
   for first in [0, 10, 20]:
     sub_sources.append(
         TestConcatSource.DummySource(range(first, first + 10)))
   concat = ConcatSource(sub_sources)
   # No explicit start/stop position is given to the tracker.
   tracker = concat.get_range_tracker(None, None)
   values = list(concat.read(tracker))
   self.assertCountEqual(list(range(30)), values)
Beispiel #4
0
 def test_read(self):
   """Read a ConcatSource of three DummySource objects end to end.

   The values produced must match 0..29, compared without regard to order.
   """
   sources = [TestConcatSource.DummySource(range(start, start + 10)) for start
              in [0, 10, 20]]
   concat = ConcatSource(sources)
   range_tracker = concat.get_range_tracker(None, None)
   read_data = list(concat.read(range_tracker))
   # assertItemsEqual only exists on Python 2; assertCountEqual is its
   # Python 3 name and performs the same order-insensitive comparison.
   self.assertCountEqual(list(range(30)), read_data)
  def test_position_at_fration(self):
    """Check position_at_fraction() on a weighted ConcatSource.

    Four RangeSource objects (0-4, 4-16, 16-24, 24-32) are wrapped in
    SourceBundle objects weighted by their share of the total length 32.
    Judging by the expected values, positions are
    (sub-source index, position inside that sub-source) pairs -- confirm
    against the ConcatSource documentation.
    """
    ranges = [(0, 4), (4, 16), (16, 24), (24, 32)]
    # Unpack into start/stop rather than naming the loop variable `range`,
    # which would shadow the builtin.
    source = ConcatSource([iobase.SourceBundle((stop - start) / 32.,
                                               RangeSource(start, stop),
                                               None, None)
                           for start, stop in ranges])

    # Tracker over the whole concatenation.
    range_tracker = source.get_range_tracker()
    self.assertEqual(range_tracker.position_at_fraction(0), (0, 0))
    self.assertEqual(range_tracker.position_at_fraction(.01), (0, 1))
    self.assertEqual(range_tracker.position_at_fraction(.1), (0, 4))
    self.assertEqual(range_tracker.position_at_fraction(.125), (1, 4))
    self.assertEqual(range_tracker.position_at_fraction(.2), (1, 7))
    self.assertEqual(range_tracker.position_at_fraction(.7), (2, 23))
    self.assertEqual(range_tracker.position_at_fraction(.75), (3, 24))
    self.assertEqual(range_tracker.position_at_fraction(.8), (3, 26))
    self.assertEqual(range_tracker.position_at_fraction(1), (4, None))

    # Tracker restricted to the span between positions (1, None) and
    # (3, None).
    range_tracker = source.get_range_tracker((1, None), (3, None))
    self.assertEqual(range_tracker.position_at_fraction(0), (1, 4))
    self.assertEqual(range_tracker.position_at_fraction(.01), (1, 5))
    self.assertEqual(range_tracker.position_at_fraction(.5), (1, 14))
    self.assertEqual(range_tracker.position_at_fraction(.599), (1, 16))
    self.assertEqual(range_tracker.position_at_fraction(.601), (2, 17))
    self.assertEqual(range_tracker.position_at_fraction(1), (3, None))
Beispiel #6
0
 def test_fraction_consumed_at_end(self):
     """Check fraction_consumed() for a tracker starting at (2, None).

     The ConcatSource holds two sub-sources; a tracker created with start
     position (2, None) and no stop position must report 1.0.
     """
     sub_sources = [RangeSource(0, 2), RangeSource(2, 4)]
     concat = ConcatSource(sub_sources)
     tracker = concat.get_range_tracker((2, None), None)
     consumed = tracker.fraction_consumed()
     self.assertEqual(consumed, 1.0)
Beispiel #7
0
    def test_position_at_fration(self):
        """Check position_at_fraction() on a weighted ConcatSource.

        Four RangeSource objects (0-4, 4-16, 16-24, 24-32) are wrapped in
        SourceBundle objects weighted by their share of the total length
        32. Each table below pairs a fraction with the position the tracker
        is expected to return for it.
        """
        bounds = [(0, 4), (4, 16), (16, 24), (24, 32)]
        bundles = []
        for lo, hi in bounds:
            weight = (hi - lo) / 32.
            bundles.append(
                iobase.SourceBundle(weight, RangeSource(lo, hi), None, None))
        source = ConcatSource(bundles)

        # Tracker over the whole concatenation.
        whole_cases = [
            (0, (0, 0)),
            (.01, (0, 1)),
            (.1, (0, 4)),
            (.125, (1, 4)),
            (.2, (1, 7)),
            (.7, (2, 23)),
            (.75, (3, 24)),
            (.8, (3, 26)),
            (1, (4, None)),
        ]
        tracker = source.get_range_tracker()
        for fraction, expected in whole_cases:
            self.assertEqual(tracker.position_at_fraction(fraction), expected)

        # Tracker restricted to the span between (1, None) and (3, None).
        partial_cases = [
            (0, (1, 4)),
            (.01, (1, 5)),
            (.5, (1, 14)),
            (.599, (1, 16)),
            (.601, (2, 17)),
            (1, (3, None)),
        ]
        tracker = source.get_range_tracker((1, None), (3, None))
        for fraction, expected in partial_cases:
            self.assertEqual(tracker.position_at_fraction(fraction), expected)
  def test_single_source(self):
    """Read a one-element ConcatSource fully, from a start and up to a stop.

    The wrapped RangeSource covers 0-10; the optional second and third
    arguments of the read helper are the start and stop positions.
    """
    read_all = source_test_utils.readFromSource

    range10 = RangeSource(0, 10)
    # Expected values are materialised as lists: on Python 3 a list never
    # compares equal to a range object, so the bare range(...) form fails.
    self.assertEqual(read_all(ConcatSource([range10])), list(range(10)))
    self.assertEqual(read_all(ConcatSource([range10]), (0, 5)),
                     list(range(5, 10)))
    self.assertEqual(read_all(ConcatSource([range10]), None, (0, 5)),
                     list(range(5)))
  def test_source_with_empty_ranges(self):
    """Check that empty sub-sources do not affect what a ConcatSource yields.

    An empty RangeSource(0, 0) is mixed with RangeSource(0, 10) in several
    positions; the values read must be those of the non-empty sub-sources
    only.
    """
    read_all = source_test_utils.readFromSource

    empty = RangeSource(0, 0)
    self.assertEqual(read_all(empty), [])

    range10 = RangeSource(0, 10)
    # Expected values are built as lists: on Python 3 range objects neither
    # compare equal to lists nor support `+`.
    expected_once = list(range(10))
    self.assertEqual(read_all(ConcatSource([empty, empty, range10])),
                     expected_once)
    self.assertEqual(read_all(ConcatSource([empty, range10, empty])),
                     expected_once)
    self.assertEqual(read_all(ConcatSource([range10, empty, range10, empty])),
                     expected_once + expected_once)
Beispiel #10
0
    def _get_concat_source(self):
        """Return the ConcatSource for this source, building it on first use.

        When ``self._concat_source`` is still ``None``, the value of
        ``self._patterns.get()`` is passed to ``FileSystems.match`` and every
        matched file with a non-zero size is wrapped in a
        ``_SingleFileSource`` spanning offsets 0 to the file size. The
        resulting ``ConcatSource`` is stored on ``self._concat_source`` and
        returned directly by later calls.
        """
        if self._concat_source is not None:
            return self._concat_source

        per_file_sources = []
        for result in FileSystems.match(self._patterns.get()):
            # One pickle round-trip of ``self`` per match result; every file
            # of that result is given the same resulting object.
            source_ref = pickler.loads(pickler.dumps(self))

            for metadata in result.metadata_list:
                path = metadata.path
                size = metadata.size_in_bytes
                if size == 0:
                    # Empty files are ignored.
                    continue

                # Splittability is decided per file: this source must be
                # splittable and the compression-type helper must agree.
                can_split = (
                    self.splittable
                    and _determine_splittability_from_compression_type(
                        path, self._compression_type))

                per_file_sources.append(
                    _SingleFileSource(
                        source_ref,
                        path,
                        0,
                        size,
                        min_bundle_size=self._min_bundle_size,
                        splittable=can_split))

        self._concat_source = ConcatSource(per_file_sources)
        return self._concat_source
Beispiel #11
0
  def test_split(self):
    """Split a ConcatSource of three DummySource objects and read every split.

    Six splits are expected, and reading all of them must yield the values
    0..29, compared without regard to order.
    """
    sources = [TestConcatSource.DummySource(list(range(start, start + 10)))
               for start in [0, 10, 20]]
    concat = ConcatSource(sources)
    splits = list(concat.split())
    # assertEqual instead of the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(6, len(splits))

    # Reading all splits
    read_data = []
    for split in splits:
      range_tracker_for_split = split.source.get_range_tracker(
          split.start_position,
          split.stop_position)
      read_data.extend(split.source.read(range_tracker_for_split))
    self.assertCountEqual(list(range(30)), read_data)
Beispiel #12
0
  def test_split(self):
    """Split a ConcatSource of three DummySource objects and read every split.

    Six splits are expected, and reading all of them must yield the values
    0..29, compared without regard to order.
    """
    sources = [TestConcatSource.DummySource(range(start, start + 10)) for start
               in [0, 10, 20]]
    concat = ConcatSource(sources)
    splits = list(concat.split())
    # assertEqual instead of the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(6, len(splits))

    # Reading all splits
    read_data = []
    for split in splits:
      range_tracker_for_split = split.source.get_range_tracker(
          split.start_position,
          split.stop_position)
      read_data.extend(split.source.read(range_tracker_for_split))
    # assertItemsEqual only exists on Python 2; assertCountEqual is its
    # Python 3 name and performs the same order-insensitive comparison.
    self.assertCountEqual(list(range(30)), read_data)
Beispiel #13
0
 def test_conact_source_exhaustive(self):
     """Run the exhaustive split-at-fraction check on a three-part source.

     The sub-sources cover 0-10, 100-110 and 1000-1010.
     """
     bounds = ((0, 10), (100, 110), (1000, 1010))
     concat = ConcatSource([RangeSource(lo, hi) for lo, hi in bounds])
     source_test_utils.assert_split_at_fraction_exhaustive(concat)
Beispiel #14
0
 def test_run_concat_direct(self):
     """Read a three-part ConcatSource through a TestPipeline.

     The sub-sources cover 0-10, 10-100 and 100-1000; the resulting
     PCollection is asserted to equal the integers 0..999.
     """
     sub_sources = [
         RangeSource(0, 10),
         RangeSource(10, 100),
         RangeSource(100, 1000),
     ]
     concat = ConcatSource(sub_sources)
     with TestPipeline() as p:
         values = p | beam.io.Read(concat)
         expected = list(range(1000))
         assert_that(values, equal_to(expected))
  def test_run_concat_direct(self):
    """Read a three-part ConcatSource through a TestPipeline.

    The sub-sources cover 0-10, 10-100 and 100-1000; the resulting
    PCollection is asserted to equal range(1000) before the pipeline is run
    explicitly.
    """
    sub_sources = [
        RangeSource(0, 10),
        RangeSource(10, 100),
        RangeSource(100, 1000),
    ]
    concat = ConcatSource(sub_sources)
    p = TestPipeline()
    values = p | beam.Read(concat)
    assert_that(values, equal_to(range(1000)))

    p.run()
 def test_source_with_empty_ranges_exhastive(self):
   """Run the exhaustive split-at-fraction check with empty sub-sources.

   The same empty RangeSource(0, 0) instance is placed before, between and
   after the non-empty sub-sources 0-10, 10-13 and 13-17.
   """
   nothing = RangeSource(0, 0)
   sub_sources = [
       nothing,
       RangeSource(0, 10),
       nothing,
       nothing,
       RangeSource(10, 13),
       RangeSource(13, 17),
       nothing,
   ]
   concat = ConcatSource(sub_sources)
   source_test_utils.assertSplitAtFractionExhaustive(concat)
Beispiel #17
0
 def test_concat_source_split(self):
     """Split a restriction through a provider built on a ConcatSource.

     A restriction provider is created for a ConcatSource wrapping
     ``self.initial_range_source`` with a desired chunk size of 2. Every
     split must be a SourceBundle, and the (start, stop) positions of the
     splits must be (0, 2) followed by (2, 4).
     """
     element = None  # Placeholder; the test gives it no meaning.
     concat_source = ConcatSource([self.initial_range_source])
     provider_cls = (
         iobase._SDFBoundedSourceWrapper._SDFBoundedSourceRestrictionProvider)
     concat_provider = provider_cls(concat_source, desired_chunk_size=2)

     # NOTE(review): the initial restriction comes from
     # self.sdf_restriction_provider, not from the ConcatSource-based
     # provider created above -- confirm this is intended.
     restriction = self.sdf_restriction_provider.initial_restriction(element)

     bundles = list(concat_provider.split(element, restriction))
     self.assertTrue(
         all(isinstance(bundle, SourceBundle) for bundle in bundles))

     observed = [
         (bundle.start_position, bundle.stop_position) for bundle in bundles
     ]
     self.assertEqual([(0, 2), (2, 4)], observed)
Beispiel #18
0
  def test_empty_source(self):
    """Check the ConcatSource configurations that must read back as [].

    Covers a ConcatSource with no sub-sources, with only empty sub-sources,
    and non-empty sub-sources read with start/stop positions for which no
    value is expected.
    """
    read_all = source_test_utils.read_from_source

    nothing = RangeSource(0, 0)
    for sub_sources in ([], [nothing], [nothing, nothing]):
      self.assertEqual(read_all(ConcatSource(sub_sources)), [])

    range10 = RangeSource(0, 10)
    # Each entry: (sub-sources, start position, stop position).
    empty_windows = [
        ([range10], (0, None), (0, 0)),
        ([range10], (0, 10), (1, None)),
        ([range10, range10], (0, 10), (1, 0)),
    ]
    for sub_sources, start, stop in empty_windows:
      self.assertEqual(read_all(ConcatSource(sub_sources), start, stop), [])
Beispiel #19
0
    def test_conact_source(self):
        """Exercise reading, fraction lookup, claiming and splitting.

        The source concatenates four RangeSource objects covering 0-4, 4-8,
        8-12 and 12-16. Positions passed to and returned by the range
        tracker are 2-tuples; judging by the assertions they are
        (sub-source index, position inside that sub-source) -- TODO confirm
        against the ConcatSource documentation.

        The claim/split section is stateful: every call acts on the same
        tracker, so the order of the statements matters.
        """
        source = ConcatSource([
            RangeSource(0, 4),
            RangeSource(4, 8),
            RangeSource(8, 12),
            RangeSource(12, 16),
        ])
        # Full read, then a read limited to the span (1, None)..(2, 10).
        self.assertEqual(list(source.read(source.get_range_tracker())),
                         list(range(16)))
        self.assertEqual(
            list(source.read(source.get_range_tracker((1, None), (2, 10)))),
            list(range(4, 10)))
        # Fraction -> position lookups on a tracker over the whole source.
        range_tracker = source.get_range_tracker(None, None)
        self.assertEqual(range_tracker.position_at_fraction(0), (0, 0))
        self.assertEqual(range_tracker.position_at_fraction(.5), (2, 8))
        self.assertEqual(range_tracker.position_at_fraction(.625), (2, 10))

        # Simulate a read.
        # Claim (0, None), then 2 on sub-tracker 0.
        self.assertEqual(range_tracker.try_claim((0, None)), True)
        self.assertEqual(range_tracker.sub_range_tracker(0).try_claim(2), True)
        self.assertEqual(range_tracker.fraction_consumed(), 0.125)

        # Claim (1, None), then 6 on sub-tracker 1; the two split attempts
        # that follow are expected to be refused (None).
        self.assertEqual(range_tracker.try_claim((1, None)), True)
        self.assertEqual(range_tracker.sub_range_tracker(1).try_claim(6), True)
        self.assertEqual(range_tracker.fraction_consumed(), 0.375)
        self.assertEqual(range_tracker.try_split((0, 1)), None)
        self.assertEqual(range_tracker.try_split((1, 5)), None)

        # The split at (3, 14) is expected to return ((3, None), 0.75);
        # afterwards (3, None) can no longer be claimed, while claims on
        # sub-trackers 1 and 2 still succeed.
        self.assertEqual(range_tracker.try_split((3, 14)), ((3, None), 0.75))
        self.assertEqual(range_tracker.try_claim((3, None)), False)
        self.assertEqual(range_tracker.sub_range_tracker(1).try_claim(7), True)
        self.assertEqual(range_tracker.try_claim((2, None)), True)
        self.assertEqual(range_tracker.sub_range_tracker(2).try_claim(9), True)

        # After 9 has been claimed on sub-tracker 2, the split at (2, 8) is
        # expected to be refused and the split at (2, 11) to succeed; 10 is
        # then still claimable on sub-tracker 2 but 11 is not.
        self.assertEqual(range_tracker.try_split((2, 8)), None)
        self.assertEqual(range_tracker.try_split((2, 11)), ((2, 11), 11. / 12))
        self.assertEqual(
            range_tracker.sub_range_tracker(2).try_claim(10), True)
        self.assertEqual(
            range_tracker.sub_range_tracker(2).try_claim(11), False)
  def test_conact_source(self):
    """Exercise reading, fraction lookup, claiming and splitting.

    The source concatenates four RangeSource objects covering 0-4, 4-8,
    8-12 and 12-16. The claim/split section is stateful: every call acts on
    the same tracker, so the order of the statements matters.
    """
    source = ConcatSource([RangeSource(0, 4),
                           RangeSource(4, 8),
                           RangeSource(8, 12),
                           RangeSource(12, 16),
                          ])
    # Expected values are materialised as lists: on Python 3 a list never
    # compares equal to a range object.
    self.assertEqual(list(source.read(source.get_range_tracker())),
                     list(range(16)))
    self.assertEqual(list(source.read(source.get_range_tracker((1, None),
                                                               (2, 10)))),
                     list(range(4, 10)))
    # Fraction -> position lookups on a tracker over the whole source.
    range_tracker = source.get_range_tracker(None, None)
    self.assertEqual(range_tracker.position_at_fraction(0), (0, 0))
    self.assertEqual(range_tracker.position_at_fraction(.5), (2, 8))
    self.assertEqual(range_tracker.position_at_fraction(.625), (2, 10))

    # Simulate a read.
    self.assertEqual(range_tracker.try_claim((0, None)), True)
    self.assertEqual(range_tracker.sub_range_tracker(0).try_claim(2), True)
    self.assertEqual(range_tracker.fraction_consumed(), 0.125)

    # The two split attempts after these claims are expected to be refused
    # (None).
    self.assertEqual(range_tracker.try_claim((1, None)), True)
    self.assertEqual(range_tracker.sub_range_tracker(1).try_claim(6), True)
    self.assertEqual(range_tracker.fraction_consumed(), 0.375)
    self.assertEqual(range_tracker.try_split((0, 1)), None)
    self.assertEqual(range_tracker.try_split((1, 5)), None)

    # After the split at (3, 14), (3, None) can no longer be claimed.
    self.assertEqual(range_tracker.try_split((3, 14)), ((3, None), 0.75))
    self.assertEqual(range_tracker.try_claim((3, None)), False)
    self.assertEqual(range_tracker.sub_range_tracker(1).try_claim(7), True)
    self.assertEqual(range_tracker.try_claim((2, None)), True)
    self.assertEqual(range_tracker.sub_range_tracker(2).try_claim(9), True)

    # After the split at (2, 11), 10 is still claimable on sub-tracker 2
    # but 11 is not.
    self.assertEqual(range_tracker.try_split((2, 8)), None)
    self.assertEqual(range_tracker.try_split((2, 11)), ((2, 11), 11. / 12))
    self.assertEqual(range_tracker.sub_range_tracker(2).try_claim(10), True)
    self.assertEqual(range_tracker.sub_range_tracker(2).try_claim(11), False)
Beispiel #21
0
 def test_estimate_size(self):
   """Check estimate_size() of a ConcatSource over three DummySource objects.

   Each DummySource wraps a range of ten consecutive integers, so the
   concatenation is expected to report a size of 30.
   """
   sources = [TestConcatSource.DummySource(range(start, start + 10)) for start
              in [0, 10, 20]]
   concat = ConcatSource(sources)
   # assertEqual instead of the deprecated assertEquals alias, which was
   # removed from unittest in Python 3.12.
   self.assertEqual(30, concat.estimate_size())
 def test_estimate_size(self):
   """Check estimate_size() of three concatenated RangeSource objects.

   The sub-sources cover 0-10, 10-100 and 100-1000; the concatenation is
   expected to report a size of 1000.
   """
   bounds = ((0, 10), (10, 100), (100, 1000))
   concat = ConcatSource([RangeSource(lo, hi) for lo, hi in bounds])
   self.assertEqual(concat.estimate_size(), 1000)