def test_read_from_text_with_file_name_file_pattern(self):
    """Read two files through one glob pattern, expecting (file name, line).

    Both files are written with the same timestamp string passed as
    ``prefix`` to ``write_data``; the pattern is the first file's folder
    joined with ``<prefix>*``.
    """
    # '%S' (zero-padded seconds) is the documented, portable directive;
    # lowercase '%s' is a platform-specific extension that is not available
    # on every platform.
    prefix = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    file_name_1, data_1 = write_data(5, prefix=prefix)
    file_name_2, data_2 = write_data(5, prefix=prefix)

    expected_data = [(file_name_1, el) for el in data_1]
    expected_data.extend((file_name_2, el) for el in data_2)
    assert len(expected_data) == 10

    # Build "<folder>/<prefix>*" with os.path helpers rather than manual
    # slicing on the separator.
    pattern = os.path.join(os.path.dirname(file_name_1), prefix + '*')

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromTextWithFilename(pattern)
    assert_that(pcoll, equal_to(expected_data))
    pipeline.run()
def test_read_from_text_single_file(self):
    """ReadFromText over one five-record file should emit those records."""
    path, records = write_data(5)
    assert len(records) == 5

    p = TestPipeline()
    lines = p | 'Read' >> ReadFromText(path)
    assert_that(lines, equal_to(records))
    p.run()
def test_progress(self):
    """Check fraction-consumed and split-point reports over ten records.

    NOTE(review): this file contains a second ``test_progress`` definition;
    if both live in the same class the later one replaces this one — confirm.
    """
    path, records = write_data(10)
    assert len(records) == 10

    source = TextSource(
        path, 0, CompressionTypes.UNCOMPRESSED, True, coders.StrUtf8Coder())
    bundles = list(source.split(desired_bundle_size=100000))
    assert len(bundles) == 1
    bundle = bundles[0]

    tracker = bundle.source.get_range_tracker(
        bundle.start_position, bundle.stop_position)

    # Sample the tracker once per record yielded by the reader.
    fractions = []
    split_points = []
    for _ in bundle.source.read(tracker):
        fractions.append(tracker.fraction_consumed())
        split_points.append(tracker.split_points())

    self.assertEqual([float(i) / 10 for i in range(0, 10)], fractions)

    expected_split_points = [
        (consumed, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)
        for consumed in range(0, 9)]
    # At last split point, the remaining split points callback returns 1
    # since the expected position of next record becomes equal to the stop
    # position.
    expected_split_points.append((9, 1))
    self.assertEqual(expected_split_points, split_points)
def test_read_all_many_single_files(self):
    """ReadAllFromText over three file names should emit all their records."""
    paths = []
    expected = []
    # Write files of 5, 10 and 15 records, in that order.
    for count in (5, 10, 15):
        path, records = write_data(count)
        assert len(records) == count
        paths.append(path)
        expected.extend(records)

    p = TestPipeline()
    lines = (
        p
        | 'Create' >> Create(paths)
        | 'ReadAll' >> ReadAllFromText())
    assert_that(lines, equal_to(expected))
    p.run()
def test_read_from_text_with_file_name_single_file(self):
    """ReadFromTextWithFilename on one file should emit (file name, line)."""
    path, records = write_data(5)
    expected = []
    for record in records:
        expected.append((path, record))
    assert len(expected) == 5

    p = TestPipeline()
    pairs = p | 'Read' >> ReadFromTextWithFilename(path)
    assert_that(pairs, equal_to(expected))
    p.run()
def test_read_single_file_with_empty_lines(self):
    """Read a file written with ``no_data=True`` and LF line endings."""
    num_records = TextSourceTest.DEFAULT_NUM_RECORDS
    path, records = write_data(num_records, no_data=True, eol=EOL.LF)
    assert len(records) == num_records
    # The first written record is expected to be empty (falsy).
    assert not records[0]
    self._run_read_test(path, records)
def test_read_all_single_file(self):
    """ReadAllFromText fed a single file name should emit its five records."""
    path, records = write_data(5)
    assert len(records) == 5

    p = TestPipeline()
    file_names = p | 'Create' >> Create([path])
    lines = file_names | 'ReadAll' >> ReadAllFromText()
    assert_that(lines, equal_to(records))
    p.run()
def test_dynamic_work_rebalancing(self):
    """Run the exhaustive split-at-fraction check on a single-bundle source."""
    path, records = write_data(5)
    assert len(records) == 5

    source = TextSource(
        path, 0, CompressionTypes.UNCOMPRESSED, True, coders.StrUtf8Coder())
    bundles = list(source.split(desired_bundle_size=100000))
    assert len(bundles) == 1

    bundle = bundles[0]
    source_test_utils.assert_split_at_fraction_exhaustive(
        bundle.source, bundle.start_position, bundle.stop_position)
def test_read_reentrant_after_splitting(self):
    """Check reentrant reads succeed on the single bundle produced by split."""
    path, records = write_data(10)
    assert len(records) == 10

    source = TextSource(
        path, 0, CompressionTypes.UNCOMPRESSED, True, coders.StrUtf8Coder())
    bundles = list(source.split(desired_bundle_size=100000))
    assert len(bundles) == 1

    bundle = bundles[0]
    source_info = (bundle.source, bundle.start_position, bundle.stop_position)
    source_test_utils.assert_reentrant_reads_succeed(source_info)
def test_read_empty_single_file(self):
    """Reading a file holding one empty, unterminated line yields nothing."""
    path, written = write_data(
        1, no_data=True, eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)
    assert len(written) == 1
    # written data has a single entry with an empty string. Reading the
    # source should not produce anything since we only wrote a single empty
    # string without an end of line character.
    nothing_expected = []
    self._run_read_test(path, nothing_expected)
def test_read_auto_bzip2(self):
    """Read a '.bz2'-suffixed bzip2 file without naming a compression type."""
    _, lines = write_data(15)
    file_name = self._create_temp_file(suffix='.bz2')
    with bz2.BZ2File(file_name, 'wb') as f:
        # The file is opened in binary mode, so the joined text must be
        # encoded to bytes before writing.
        f.write('\n'.join(lines).encode('utf-8'))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_dynamic_work_rebalancing_windows_eol(self):
    """Exhaustive split-at-fraction check on a CRLF-terminated file.

    The multi-threaded part of the check is switched off.
    """
    path, records = write_data(15, eol=EOL.CRLF)
    assert len(records) == 15

    source = TextSource(
        path, 0, CompressionTypes.UNCOMPRESSED, True, coders.StrUtf8Coder())
    bundles = list(source.split(desired_bundle_size=100000))
    assert len(bundles) == 1

    bundle = bundles[0]
    source_test_utils.assert_split_at_fraction_exhaustive(
        bundle.source,
        bundle.start_position,
        bundle.stop_position,
        perform_multi_threaded_test=False)
def test_read_all_unavailable_files_ignored(self):
    """ReadAllFromText should emit only the records of the written files.

    The extra path "/unavailable_file" contributes nothing to the expected
    output.
    """
    paths = []
    expected = []
    # Write files of 5, 10 and 15 records, in that order.
    for count in (5, 10, 15):
        path, records = write_data(count)
        assert len(records) == count
        paths.append(path)
        expected.extend(records)
    paths.append("/unavailable_file")

    p = TestPipeline()
    lines = (
        p
        | 'Create' >> Create(paths)
        | 'ReadAll' >> ReadAllFromText())
    assert_that(lines, equal_to(expected))
    p.run()
def test_dynamic_work_rebalancing_mixed_eol(self):
    """Exhaustive split-at-fraction check on a file with mixed line endings.

    The multi-threaded part of the check is switched off.
    """
    file_name, expected_data = write_data(5, eol=EOL.MIXED)
    assert len(expected_data) == 5

    source = TextSource(
        file_name, 0, CompressionTypes.UNCOMPRESSED, True,
        coders.StrUtf8Coder())
    splits = list(source.split(desired_bundle_size=100000))
    assert len(splits) == 1

    # Use the snake_case helper name, matching the other dynamic work
    # rebalancing tests in this file.
    source_test_utils.assert_split_at_fraction_exhaustive(
        splits[0].source,
        splits[0].start_position,
        splits[0].stop_position,
        perform_multi_threaded_test=False)
def test_read_single_file_without_striping_eol_crlf(self):
    """With the fourth TextSource argument False, lines keep their '\\r\\n'."""
    num_records = TextSourceTest.DEFAULT_NUM_RECORDS
    path, written = write_data(num_records, eol=EOL.CRLF)
    assert len(written) == num_records

    source = TextSource(
        path, 0, CompressionTypes.UNCOMPRESSED, False, coders.StrUtf8Coder())
    tracker = source.get_range_tracker(None, None)
    read_data = list(source.read(tracker))

    expected = [line + '\r\n' for line in written]
    self.assertCountEqual(expected, read_data)
def test_read_skip_header_single(self):
    """Read one file while skipping a single header line."""
    num_records = TextSourceTest.DEFAULT_NUM_RECORDS
    path, records = write_data(num_records)
    assert len(records) == num_records

    header_lines = 1
    expected = self._remove_lines(records, [num_records], header_lines)
    actual = self._read_skip_header_lines(path, header_lines)

    self.assertEqual(len(expected), len(actual))
    self.assertCountEqual(expected, actual)
def test_read_single_file_single_line_no_eol_gzip(self):
    """Read a gzip copy of a one-line file that has no trailing newline."""
    file_name, expected_data = write_data(
        1, eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)

    gzip_file_name = file_name + '.gz'
    # Open the source in binary mode: the gzip file is binary, so it must be
    # fed bytes rather than decoded text lines.
    with open(file_name, 'rb') as src, \
            gzip.open(gzip_file_name, 'wb') as dst:
        dst.writelines(src)

    assert len(expected_data) == 1
    self._run_read_test(
        gzip_file_name, expected_data, compression=CompressionTypes.GZIP)
def test_read_bzip2(self):
    """Read a bzip2 file with the compression type given explicitly."""
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with bz2.BZ2File(file_name, 'wb') as f:
        # Binary-mode file: encode the joined text to bytes before writing.
        f.write('\n'.join(lines).encode('utf-8'))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, compression_type=CompressionTypes.BZIP2)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_gzip_with_skip_lines(self):
    """Read a gzip file with ``skip_header_lines=2``; expect ``lines[2:]``."""
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
        # Binary-mode file: encode the joined text to bytes before writing.
        f.write('\n'.join(lines).encode('utf-8'))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder(),
        skip_header_lines=2)
    assert_that(pcoll, equal_to(lines[2:]))
    pipeline.run()
def test_read_all_gzip(self):
    """ReadAllFromText with GZIP compression should emit all 100 lines."""
    _, lines = write_data(100)
    file_name = self._create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
        # Binary-mode file: encode the joined text to bytes before writing.
        f.write('\n'.join(lines).encode('utf-8'))

    pipeline = TestPipeline()
    pcoll = (
        pipeline
        | Create([file_name])
        | 'ReadAll' >> ReadAllFromText(
            compression_type=CompressionTypes.GZIP))
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_auto_gzip(self):
    """Read a '.gz'-suffixed gzip file without naming a compression type."""
    _, lines = write_data(15)
    # The pipeline runs inside the TempDir context so the file is still
    # available while it is being read.
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file(suffix='.gz')
        with gzip.GzipFile(file_name, 'wb') as f:
            # Binary-mode file: encode the joined text to bytes first.
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
def test_read_after_splitting(self):
    """Sources produced by split should match the unsplit reference source."""
    path, records = write_data(10)
    assert len(records) == 10

    source = TextSource(
        path, 0, CompressionTypes.UNCOMPRESSED, True, coders.StrUtf8Coder())

    split_infos = []
    for bundle in source.split(desired_bundle_size=33):
        split_infos.append(
            (bundle.source, bundle.start_position, bundle.stop_position))

    reference_info = (source, None, None)
    source_test_utils.assert_sources_equal_reference_source(
        reference_info, split_infos)
def test_read_empty_single_file_no_eol_gzip(self):
    """A gzip copy of one empty, unterminated line should yield nothing."""
    file_name, written_data = write_data(
        1, no_data=True, eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)

    gzip_file_name = file_name + '.gz'
    # Open the source in binary mode: the gzip file is binary, so it must be
    # fed bytes rather than decoded text lines.
    with open(file_name, 'rb') as src, \
            gzip.open(gzip_file_name, 'wb') as dst:
        dst.writelines(src)

    assert len(written_data) == 1
    # written data has a single entry with an empty string. Reading the
    # source should not produce anything since we only wrote a single empty
    # string without an end of line character.
    self._run_read_test(gzip_file_name, [], compression=CompressionTypes.GZIP)
def test_read_gzip(self):
    """Read a gzip file with the compression type given explicitly."""
    _, lines = write_data(15)
    # The pipeline runs inside the TempDir context so the file is still
    # available while it is being read.
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file()
        with gzip.GzipFile(file_name, 'wb') as f:
            # Binary-mode file: encode the joined text to bytes first.
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
def test_read_from_text_single_file_with_coder(self):
    """ReadFromText should pass each line through the supplied coder.

    The local coder doubles whatever it decodes, so every expected record is
    the written record repeated twice.
    """

    class DummyCoder(coders.Coder):
        """Coder whose decode doubles its input and whose encode raises."""

        def encode(self, x):
            raise ValueError

        def decode(self, x):
            return x * 2

    path, records = write_data(5)
    assert len(records) == 5
    doubled = [record * 2 for record in records]

    p = TestPipeline()
    lines = p | 'Read' >> ReadFromText(path, coder=DummyCoder())
    assert_that(lines, equal_to(doubled))
    p.run()
def test_progress(self):
    """Check the fraction consumed reported after each of ten records.

    NOTE(review): this file contains another ``test_progress`` definition;
    if both live in the same class only the later one runs — confirm.
    """
    path, records = write_data(10)
    assert len(records) == 10

    source = TextSource(
        path, 0, CompressionTypes.UNCOMPRESSED, True, coders.StrUtf8Coder())
    bundles = list(source.split(desired_bundle_size=100000))
    assert len(bundles) == 1
    bundle = bundles[0]

    tracker = bundle.source.get_range_tracker(
        bundle.start_position, bundle.stop_position)

    # Sample the tracker once per record yielded by the reader.
    fractions = []
    for _ in bundle.source.read(tracker):
        fractions.append(tracker.fraction_consumed())

    expected_fractions = [float(i) / 10 for i in range(0, 10)]
    self.assertEqual(expected_fractions, fractions)
def test_read_corrupted_bzip2_fails(self):
    """Running a BZIP2 read over a corrupted file should raise."""
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with bz2.BZ2File(file_name, 'wb') as f:
        # Binary-mode file: encode the joined text to bytes before writing.
        f.write('\n'.join(lines).encode('utf-8'))

    # Overwrite the compressed file with bytes that are not valid bzip2.
    with open(file_name, 'wb') as f:
        f.write(b'corrupt')

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, compression_type=CompressionTypes.BZIP2)
    assert_that(pcoll, equal_to(lines))
    with self.assertRaises(Exception):
        pipeline.run()
def test_read_corrupted_gzip_fails(self):
    """Running a GZIP read over a corrupted file should raise."""
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
        # Binary-mode file: encode the joined text to bytes before writing.
        f.write('\n'.join(lines).encode('utf-8'))

    # Overwrite the compressed file with bytes that are not valid gzip.
    with open(file_name, 'wb') as f:
        f.write(b'corrupt')

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
    assert_that(pcoll, equal_to(lines))
    with self.assertRaises(Exception):
        pipeline.run()
def test_read_after_splitting_skip_header(self):
    """With two header lines skipped, split reads must match the unsplit read.

    Both the unsplit read and the concatenated split reads are expected to
    equal the written records minus the first two.
    """
    path, records = write_data(100)
    assert len(records) == 100

    source = TextSource(
        path, 0, CompressionTypes.UNCOMPRESSED, True, coders.StrUtf8Coder(),
        skip_header_lines=2)

    split_infos = []
    for bundle in source.split(desired_bundle_size=33):
        split_infos.append(
            (bundle.source, bundle.start_position, bundle.stop_position))
    # The bundle size must actually split the file into several pieces.
    self.assertGreater(len(split_infos), 1)

    reference_lines = source_test_utils.read_from_source(source, None, None)
    split_lines = []
    for info in split_infos:
        split_lines.extend(source_test_utils.read_from_source(*info))

    self.assertEqual(records[2:], reference_lines)
    self.assertEqual(reference_lines, split_lines)
def run_sdf_read_pipeline(
        self, num_files, num_records_per_file, resume_count=None):
    """Write files, read them with the ``ReadFiles`` ParDo, check the output.

    Args:
      num_files: number of files to write.
      num_records_per_file: number of records written to each file.
      resume_count: forwarded unchanged to ``ReadFiles``.
    """
    paths = []
    all_records = []
    for _ in range(num_files):
        path, records = filebasedsource_test.write_data(num_records_per_file)
        assert len(records) == num_records_per_file
        paths.append(path)
        all_records.extend(records)
    assert len(all_records) > 0

    with TestPipeline() as p:
        file_names = p | 'Create1' >> beam.Create(paths)
        read = file_names | 'SDF' >> beam.ParDo(ReadFiles(resume_count))
        assert_that(read, equal_to(all_records))
def test_read_single_file_larger_than_default_buffer(self):
    """Read a file of DEFAULT_READ_BUFFER_SIZE records with that buffer size."""
    buffer_size = TextSource.DEFAULT_READ_BUFFER_SIZE
    path, records = write_data(buffer_size)
    self._run_read_test(path, records, buffer_size=buffer_size)
def test_read_single_file_last_line_no_eol(self):
    """Read a file written with EOL.LF_WITH_NOTHING_AT_LAST_LINE."""
    num_records = TextSourceTest.DEFAULT_NUM_RECORDS
    path, records = write_data(
        num_records, eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)
    assert len(records) == num_records
    self._run_read_test(path, records)
def test_read_single_file_single_line_no_eol(self):
    """Read a one-record file written with LF_WITH_NOTHING_AT_LAST_LINE."""
    path, records = write_data(1, eol=EOL.LF_WITH_NOTHING_AT_LAST_LINE)
    assert len(records) == 1
    self._run_read_test(path, records)
def test_read_reentrant_without_splitting(self):
    """Check reentrant reads succeed on an unsplit ten-record source."""
    file_name, expected_data = write_data(10)
    assert len(expected_data) == 10

    source = TextSource(
        file_name, 0, CompressionTypes.UNCOMPRESSED, True,
        coders.StrUtf8Coder())
    # Use the snake_case helper name, matching the reentrant-after-splitting
    # test in this file.
    source_test_utils.assert_reentrant_reads_succeed((source, None, None))
def test_read_single_file_mixed_eol(self):
    """Read a file written with EOL.MIXED line endings."""
    num_records = TextSourceTest.DEFAULT_NUM_RECORDS
    path, records = write_data(num_records, eol=EOL.MIXED)
    assert len(records) == num_records
    self._run_read_test(path, records)