Example #1
0
 def test_read_from_text_with_file_name_file_pattern(self):
   """Reads two files matching a glob and checks the (filename, line) pairs.

   Both files share a timestamp prefix so the glob matches exactly these two.
   """
   # Use %S (zero-padded seconds). The original %s is a non-portable,
   # platform-specific strftime extension (epoch seconds on glibc,
   # unsupported on Windows) and was almost certainly a typo for %S.
   prefix = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
   file_name_1, data_1 = write_data(5, prefix=prefix)
   file_name_2, data_2 = write_data(5, prefix=prefix)
   expected_data = []
   expected_data.extend([(file_name_1, el) for el in data_1])
   expected_data.extend([(file_name_2, el) for el in data_2])
   folder = file_name_1[:file_name_1.rfind(os.path.sep)]
   pattern = folder + os.path.sep + prefix + '*'
   assert len(expected_data) == 10
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromTextWithFilename(pattern)
   assert_that(pcoll, equal_to(expected_data))
   pipeline.run()
 def test_read_from_text_single_file(self):
   """ReadFromText on one file yields exactly the lines written to it."""
   path, lines = write_data(5)
   assert len(lines) == 5
   p = TestPipeline()
   result = p | 'Read' >> ReadFromText(path)
   assert_that(result, equal_to(lines))
   p.run()
  def test_progress(self):
    """Verifies fraction_consumed and split_points reporting while reading."""
    path, records = write_data(10)
    assert len(records) == 10
    src = TextSource(path, 0, CompressionTypes.UNCOMPRESSED, True,
                     coders.StrUtf8Coder())
    bundles = list(src.split(desired_bundle_size=100000))
    assert len(bundles) == 1
    tracker = bundles[0].source.get_range_tracker(
        bundles[0].start_position, bundles[0].stop_position)
    fractions = []
    split_points = []
    for _ in bundles[0].source.read(tracker):
      fractions.append(tracker.fraction_consumed())
      split_points.append(tracker.split_points())

    self.assertEqual([i / 10.0 for i in range(10)], fractions)

    expected_points = [(i, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)
                       for i in range(9)]
    # At the last split point, the remaining split points callback returns 1
    # since the expected position of the next record becomes equal to the
    # stop position.
    expected_points.append((9, 1))

    self.assertEqual(expected_points, split_points)
 def test_read_all_many_single_files(self):
   """ReadAllFromText over several file names yields all of their lines."""
   path1, lines1 = write_data(5)
   assert len(lines1) == 5
   path2, lines2 = write_data(10)
   assert len(lines2) == 10
   path3, lines3 = write_data(15)
   assert len(lines3) == 15
   all_lines = lines1 + lines2 + lines3
   p = TestPipeline()
   result = (p
             | 'Create' >> Create([path1, path2, path3])
             | 'ReadAll' >> ReadAllFromText())
   assert_that(result, equal_to(all_lines))
   p.run()
Example #5
0
 def test_read_from_text_with_file_name_single_file(self):
   """ReadFromTextWithFilename pairs every line with its source file name."""
   path, lines = write_data(5)
   expected = [(path, line) for line in lines]
   assert len(expected) == 5
   p = TestPipeline()
   result = p | 'Read' >> ReadFromTextWithFilename(path)
   assert_that(result, equal_to(expected))
   p.run()
  def test_read_single_file_with_empty_lines(self):
    """A file of empty, LF-terminated lines reads back as empty strings."""
    path, expected = write_data(
        TextSourceTest.DEFAULT_NUM_RECORDS, no_data=True, eol=EOL.LF)
    assert len(expected) == TextSourceTest.DEFAULT_NUM_RECORDS
    # no_data=True means every written record is an empty string.
    assert not expected[0]
    self._run_read_test(path, expected)
 def test_read_all_single_file(self):
   """ReadAllFromText on a one-element PCollection of file names."""
   path, lines = write_data(5)
   assert len(lines) == 5
   p = TestPipeline()
   result = (p
             | 'Create' >> Create([path])
             | 'ReadAll' >> ReadAllFromText())
   assert_that(result, equal_to(lines))
   p.run()
 def test_dynamic_work_rebalancing(self):
   """Exhaustively checks split-at-fraction behavior on a small file."""
   path, lines = write_data(5)
   assert len(lines) == 5
   src = TextSource(path, 0, CompressionTypes.UNCOMPRESSED, True,
                    coders.StrUtf8Coder())
   bundles = list(src.split(desired_bundle_size=100000))
   assert len(bundles) == 1
   source_test_utils.assert_split_at_fraction_exhaustive(
       bundles[0].source, bundles[0].start_position,
       bundles[0].stop_position)
 def test_read_reentrant_after_splitting(self):
   """Reading the single split twice (reentrantly) must succeed."""
   path, lines = write_data(10)
   assert len(lines) == 10
   src = TextSource(path, 0, CompressionTypes.UNCOMPRESSED, True,
                    coders.StrUtf8Coder())
   bundles = list(src.split(desired_bundle_size=100000))
   assert len(bundles) == 1
   source_test_utils.assert_reentrant_reads_succeed(
       (bundles[0].source, bundles[0].start_position,
        bundles[0].stop_position))
Example #10
0
  def test_read_empty_single_file(self):
    """A single empty line with no trailing EOL reads back as no records."""
    path, written = write_data(
        1, no_data=True, eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)
    assert len(written) == 1
    # The file holds one empty string with no end-of-line character, so the
    # source should not produce anything at all.
    self._run_read_test(path, [])
  def test_read_auto_bzip2(self):
    """Compression is auto-detected from the '.bz2' file suffix."""
    _, lines = write_data(15)
    file_name = self._create_temp_file(suffix='.bz2')
    with bz2.BZ2File(file_name, 'wb') as f:
      # BZ2File opened in binary mode requires bytes, not str, on Python 3.
      f.write('\n'.join(lines).encode('utf-8'))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
Example #12
0
 def test_dynamic_work_rebalancing_windows_eol(self):
   """Split-at-fraction checks on a file with CRLF line endings."""
   path, lines = write_data(15, eol=EOL.CRLF)
   assert len(lines) == 15
   src = TextSource(path, 0, CompressionTypes.UNCOMPRESSED, True,
                    coders.StrUtf8Coder())
   bundles = list(src.split(desired_bundle_size=100000))
   assert len(bundles) == 1
   source_test_utils.assert_split_at_fraction_exhaustive(
       bundles[0].source, bundles[0].start_position,
       bundles[0].stop_position, perform_multi_threaded_test=False)
Example #13
0
 def test_read_all_unavailable_files_ignored(self):
   """ReadAllFromText skips file names that do not exist."""
   path1, lines1 = write_data(5)
   assert len(lines1) == 5
   path2, lines2 = write_data(10)
   assert len(lines2) == 10
   path3, lines3 = write_data(15)
   assert len(lines3) == 15
   missing_path = "/unavailable_file"
   all_lines = lines1 + lines2 + lines3
   p = TestPipeline()
   result = (p
             | 'Create' >> Create([path1, path2, path3, missing_path])
             | 'ReadAll' >> ReadAllFromText())
   assert_that(result, equal_to(all_lines))
   p.run()
 def test_dynamic_work_rebalancing_mixed_eol(self):
   """Split-at-fraction checks on a file with mixed line endings."""
   file_name, expected_data = write_data(5, eol=EOL.MIXED)
   assert len(expected_data) == 5
   source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                       coders.StrUtf8Coder())
   # list(...) instead of the redundant identity comprehension.
   splits = list(source.split(desired_bundle_size=100000))
   assert len(splits) == 1
   # assertSplitAtFractionExhaustive was the camelCase spelling; the sibling
   # rebalancing tests in this file call the snake_case name, so use it here
   # too for consistency.
   source_test_utils.assert_split_at_fraction_exhaustive(
       splits[0].source, splits[0].start_position, splits[0].stop_position,
       perform_multi_threaded_test=False)
Example #15
0
  def test_read_single_file_without_striping_eol_crlf(self):
    """With strip_trailing_newlines=False, CRLF terminators are preserved."""
    path, written = write_data(TextSourceTest.DEFAULT_NUM_RECORDS,
                               eol=EOL.CRLF)
    assert len(written) == TextSourceTest.DEFAULT_NUM_RECORDS
    src = TextSource(path, 0, CompressionTypes.UNCOMPRESSED, False,
                     coders.StrUtf8Coder())

    tracker = src.get_range_tracker(None, None)
    actual = list(src.read(tracker))
    self.assertCountEqual([line + '\r\n' for line in written], actual)
Example #16
0
 def test_read_skip_header_single(self):
   """Skipping one header line drops exactly the first record."""
   path, expected = write_data(TextSourceTest.DEFAULT_NUM_RECORDS)
   assert len(expected) == TextSourceTest.DEFAULT_NUM_RECORDS
   header_count = 1
   expected = self._remove_lines(
       expected, [TextSourceTest.DEFAULT_NUM_RECORDS], header_count)
   actual = self._read_skip_header_lines(path, header_count)
   self.assertEqual(len(expected), len(actual))
   self.assertCountEqual(expected, actual)
Example #17
0
  def test_read_single_file_single_line_no_eol_gzip(self):
    """A one-line gzip file without a trailing EOL still yields the line."""
    file_name, expected_data = write_data(
        1, eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)

    gzip_file_name = file_name + '.gz'
    # Open the source in binary mode: the gzip destination is binary, and
    # copying text-mode str lines into it raises TypeError on Python 3.
    with open(file_name, 'rb') as src, gzip.open(gzip_file_name, 'wb') as dst:
      dst.writelines(src)

    assert len(expected_data) == 1
    self._run_read_test(gzip_file_name, expected_data,
                        compression=CompressionTypes.GZIP)
  def test_read_bzip2(self):
    """Reads a bzip2 file when the compression type is given explicitly."""
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with bz2.BZ2File(file_name, 'wb') as f:
      # BZ2File opened in binary mode requires bytes, not str, on Python 3.
      f.write('\n'.join(lines).encode('utf-8'))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name,
        compression_type=CompressionTypes.BZIP2)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
  def test_read_gzip_with_skip_lines(self):
    """skip_header_lines=2 drops the first two records of a gzip file."""
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
      # GzipFile opened in binary mode requires bytes, not str, on Python 3.
      f.write('\n'.join(lines).encode('utf-8'))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP,
        True, coders.StrUtf8Coder(), skip_header_lines=2)
    assert_that(pcoll, equal_to(lines[2:]))
    pipeline.run()
 def test_read_all_gzip(self):
   """ReadAllFromText reads a gzip file with an explicit compression type."""
   _, lines = write_data(100)
   file_name = self._create_temp_file()
   with gzip.GzipFile(file_name, 'wb') as f:
     # GzipFile opened in binary mode requires bytes, not str, on Python 3.
     f.write('\n'.join(lines).encode('utf-8'))
   pipeline = TestPipeline()
   pcoll = (pipeline
            | Create([file_name])
            | 'ReadAll' >> ReadAllFromText(
                compression_type=CompressionTypes.GZIP))
   assert_that(pcoll, equal_to(lines))
   pipeline.run()
Example #21
0
  def test_read_auto_gzip(self):
    """Compression is auto-detected from the '.gz' file suffix."""
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file(suffix='.gz')

      with gzip.GzipFile(file_name, 'wb') as f:
        # GzipFile opened in binary mode requires bytes, not str (Python 3).
        f.write('\n'.join(lines).encode('utf-8'))

      pipeline = TestPipeline()
      pcoll = pipeline | 'Read' >> ReadFromText(file_name)
      assert_that(pcoll, equal_to(lines))
      pipeline.run()
Example #22
0
  def test_read_after_splitting(self):
    """Splitting into small bundles must not change what gets read."""
    path, lines = write_data(10)
    assert len(lines) == 10
    src = TextSource(path, 0, CompressionTypes.UNCOMPRESSED, True,
                     coders.StrUtf8Coder())
    bundles = list(src.split(desired_bundle_size=33))

    reference_info = (src, None, None)
    bundle_infos = [(b.source, b.start_position, b.stop_position)
                    for b in bundles]
    source_test_utils.assert_sources_equal_reference_source(
        reference_info, bundle_infos)
Example #23
0
  def test_read_empty_single_file_no_eol_gzip(self):
    """An empty line with no trailing EOL, gzipped, yields no records."""
    file_name, written_data = write_data(
        1, no_data=True, eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)

    gzip_file_name = file_name + '.gz'
    # Open the source in binary mode: the gzip destination is binary, and
    # copying text-mode str lines into it raises TypeError on Python 3.
    with open(file_name, 'rb') as src, gzip.open(gzip_file_name, 'wb') as dst:
      dst.writelines(src)

    assert len(written_data) == 1
    # written data has a single entry with an empty string. Reading the source
    # should not produce anything since we only wrote a single empty string
    # without an end of line character.
    self._run_read_test(gzip_file_name, [], compression=CompressionTypes.GZIP)
Example #24
0
  def test_read_gzip(self):
    """Reads a gzip file when the compression type is given explicitly."""
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file()
      with gzip.GzipFile(file_name, 'wb') as f:
        # GzipFile opened in binary mode requires bytes, not str (Python 3).
        f.write('\n'.join(lines).encode('utf-8'))

      pipeline = TestPipeline()
      pcoll = pipeline | 'Read' >> ReadFromText(
          file_name,
          0, CompressionTypes.GZIP,
          True, coders.StrUtf8Coder())
      assert_that(pcoll, equal_to(lines))
      pipeline.run()
Example #25
0
  def test_read_from_text_single_file_with_coder(self):
    """A custom coder's decode() is applied to every record that is read."""
    class DoublingCoder(coders.Coder):
      # encode() should never run on the read path.
      def encode(self, x):
        raise ValueError

      def decode(self, x):
        return x * 2

    path, lines = write_data(5)
    assert len(lines) == 5
    p = TestPipeline()
    result = p | 'Read' >> ReadFromText(path, coder=DoublingCoder())
    assert_that(result, equal_to([line * 2 for line in lines]))
    p.run()
  def test_progress(self):
    """Verifies fraction_consumed reporting while reading a single split.

    NOTE(review): a method with this same name appears earlier in the file;
    if both live in one class, the later definition silently shadows the
    earlier one — confirm whether both are meant to exist.
    """
    file_name, expected_data = write_data(10)
    assert len(expected_data) == 10
    source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                        coders.StrUtf8Coder())
    # list(...) instead of the redundant identity comprehension.
    splits = list(source.split(desired_bundle_size=100000))
    assert len(splits) == 1
    fraction_consumed_report = []
    range_tracker = splits[0].source.get_range_tracker(
        splits[0].start_position, splits[0].stop_position)
    for _ in splits[0].source.read(range_tracker):
      fraction_consumed_report.append(range_tracker.fraction_consumed())

    self.assertEqual(
        [float(i) / 10 for i in range(0, 10)], fraction_consumed_report)
  def test_read_corrupted_bzip2_fails(self):
    """Overwriting a bzip2 file with garbage makes the read pipeline fail."""
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with bz2.BZ2File(file_name, 'wb') as f:
      # Binary-mode files require bytes, not str, on Python 3.
      f.write('\n'.join(lines).encode('utf-8'))

    # Clobber the compressed file with data that is not valid bzip2.
    with open(file_name, 'wb') as f:
      f.write(b'corrupt')

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name,
        compression_type=CompressionTypes.BZIP2)
    assert_that(pcoll, equal_to(lines))
    with self.assertRaises(Exception):
      pipeline.run()
  def test_read_corrupted_gzip_fails(self):
    """Overwriting a gzip file with garbage makes the read pipeline fail."""
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
      # Binary-mode files require bytes, not str, on Python 3.
      f.write('\n'.join(lines).encode('utf-8'))

    # Clobber the compressed file with data that is not valid gzip.
    with open(file_name, 'wb') as f:
      f.write(b'corrupt')

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name,
        0, CompressionTypes.GZIP,
        True, coders.StrUtf8Coder())
    assert_that(pcoll, equal_to(lines))

    with self.assertRaises(Exception):
      pipeline.run()
Example #29
0
  def test_read_after_splitting_skip_header(self):
    """Header skipping is applied once, not once per split bundle."""
    path, expected = write_data(100)
    assert len(expected) == 100
    src = TextSource(path, 0, CompressionTypes.UNCOMPRESSED, True,
                     coders.StrUtf8Coder(), skip_header_lines=2)
    bundles = list(src.split(desired_bundle_size=33))

    reference_info = (src, None, None)
    bundle_infos = [(b.source, b.start_position, b.stop_position)
                    for b in bundles]
    self.assertGreater(len(bundle_infos), 1)
    reference_lines = source_test_utils.read_from_source(*reference_info)
    bundle_lines = []
    for info in bundle_infos:
      bundle_lines.extend(source_test_utils.read_from_source(*info))

    # The two header lines are dropped from the reference read, and the
    # concatenated bundle reads must agree with it.
    self.assertEqual(expected[2:], reference_lines)
    self.assertEqual(reference_lines, bundle_lines)
  def run_sdf_read_pipeline(
      self, num_files, num_records_per_file, resume_count=None):
    """Writes num_files files and reads them back through the ReadFiles SDF.

    resume_count, when set, is forwarded to ReadFiles to exercise
    checkpoint/resume behavior.
    """
    file_names = []
    expected_data = []
    for _ in range(num_files):
      path, records = filebasedsource_test.write_data(num_records_per_file)
      assert len(records) == num_records_per_file
      file_names.append(path)
      expected_data.extend(records)

    assert expected_data

    with TestPipeline() as p:
      results = (p
                 | 'Create1' >> beam.Create(file_names)
                 | 'SDF' >> beam.ParDo(ReadFiles(resume_count)))

      assert_that(results, equal_to(expected_data))
Example #31
0
 def test_read_single_file_larger_than_default_buffer(self):
   """A file as large as the default read buffer is read correctly."""
   path, expected = write_data(TextSource.DEFAULT_READ_BUFFER_SIZE)
   self._run_read_test(path, expected,
                       buffer_size=TextSource.DEFAULT_READ_BUFFER_SIZE)
Example #32
0
 def test_read_single_file_last_line_no_eol(self):
   """The final record is produced even without a trailing end-of-line."""
   path, expected = write_data(
       TextSourceTest.DEFAULT_NUM_RECORDS,
       eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)
   assert len(expected) == TextSourceTest.DEFAULT_NUM_RECORDS
   self._run_read_test(path, expected)
Example #33
0
    def test_read_single_file_single_line_no_eol(self):
        """A one-line file without a trailing EOL still yields that line."""
        path, expected = write_data(1, eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)
        assert len(expected) == 1
        self._run_read_test(path, expected)
Example #34
0
 def test_read_reentrant_without_splitting(self):
   """Reading the whole source twice (reentrantly) must succeed."""
   file_name, expected_data = write_data(10)
   assert len(expected_data) == 10
   source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                       coders.StrUtf8Coder())
   # assertReentrantReadsSucceed was the camelCase spelling; use the
   # snake_case name that test_read_reentrant_after_splitting already calls.
   source_test_utils.assert_reentrant_reads_succeed((source, None, None))
Example #35
0
 def test_read_single_file_mixed_eol(self):
   """A file mixing LF and CRLF terminators reads back all records."""
   path, expected = write_data(
       TextSourceTest.DEFAULT_NUM_RECORDS, eol=EOL.MIXED)
   assert len(expected) == TextSourceTest.DEFAULT_NUM_RECORDS
   self._run_read_test(path, expected)