def test_read_auto_bzip2(self):
  _, lines = write_data(15)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file(suffix='.bz2')
    with bz2.BZ2File(file_name, 'wb') as f:
      f.write('\n'.join(lines).encode('utf-8'))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_process_auto(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result.gz')
    _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
    with TestPipeline() as p:
      result = (p
                | Create([path])
                | ReadAllFromTFRecord(
                    coder=coders.BytesCoder(),
                    compression_type=CompressionTypes.AUTO))
      # BytesCoder yields bytes, so the expected values are byte strings.
      assert_that(result, equal_to([b'foo', b'bar']))
def test_process_glob(self):
  with TempDir() as temp_dir:
    self._write_glob(temp_dir, 'result')
    glob = temp_dir.get_path() + os.path.sep + '*result'
    with TestPipeline() as p:
      result = (p
                | Create([glob])
                | ReadAllFromTFRecord(
                    coder=coders.BytesCoder(),
                    compression_type=CompressionTypes.AUTO))
      # BytesCoder yields bytes, so the expected values are byte strings.
      assert_that(result, equal_to([b'foo', b'bar'] * 3))
def test_read_bzip2(self):
  _, lines = write_data(15)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file()
    with bz2.BZ2File(file_name, 'wb') as f:
      f.write('\n'.join(lines).encode('utf-8'))

    with TestPipeline() as pipeline:
      pcoll = pipeline | 'Read' >> ReadFromText(
          file_name,
          compression_type=CompressionTypes.BZIP2)
      assert_that(pcoll, equal_to(lines))
def test_read_auto_gzip(self):
  _, lines = write_data(15)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file(suffix='.gz')
    with gzip.GzipFile(file_name, 'wb') as f:
      f.write('\n'.join(lines).encode('utf-8'))

    with TestPipeline() as pipeline:
      pcoll = pipeline | 'Read' >> ReadFromText(file_name)
      assert_that(pcoll, equal_to(lines))
def test_read_gzip(self):
  _, lines = write_data(15)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
      f.write('\n'.join(lines).encode('utf-8'))

    with TestPipeline() as pipeline:
      pcoll = pipeline | 'Read' >> ReadFromText(
          file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
      assert_that(pcoll, equal_to(lines))
def test_read_all_gzip(self):
  _, lines = write_data(100)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
      f.write('\n'.join(lines).encode('utf-8'))

    with TestPipeline() as pipeline:
      pcoll = (pipeline
               | Create([file_name])
               | 'ReadAll' >> ReadAllFromText(
                   compression_type=CompressionTypes.GZIP))
      assert_that(pcoll, equal_to(lines))
def test_process_gzip(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result')
    _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
    with TestPipeline() as p:
      result = (p
                | ReadFromTFRecord(
                    path,
                    coder=coders.BytesCoder(),
                    compression_type=CompressionTypes.GZIP,
                    validate=True))
      assert_that(result, equal_to([b'foo', b'bar']))
def test_read_reentrant_after_splitting(self):
  with TempDir() as tempdir:
    file_name = self._create_temp_vcf_file(
        _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
    source = VcfSource(file_name)
    splits = [split for split in source.split(desired_bundle_size=100000)]
    assert len(splits) == 1
    source_test_utils.assert_reentrant_reads_succeed(
        (splits[0].source, splits[0].start_position, splits[0].stop_position))
def test_dynamic_work_rebalancing(self):
  with TempDir() as tempdir:
    file_name = self._create_temp_vcf_file(
        _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
    source = VcfSource(file_name)
    splits = [split for split in source.split(desired_bundle_size=100000)]
    assert len(splits) == 1
    source_test_utils.assert_split_at_fraction_exhaustive(
        splits[0].source, splits[0].start_position, splits[0].stop_position)
def test_file_pattern_verify_details(self):
  variant_1, vcf_line_1 = self._get_sample_variant_1()
  variant_2, vcf_line_2 = self._get_sample_variant_2()
  variant_3, vcf_line_3 = self._get_sample_variant_3()
  with TempDir() as tempdir:
    self._create_temp_vcf_file(_SAMPLE_HEADER_LINES + [vcf_line_1], tempdir)
    self._create_temp_vcf_file(
        _SAMPLE_HEADER_LINES + [vcf_line_2, vcf_line_3], tempdir)
    read_data = self._read_records(os.path.join(tempdir.get_path(), '*.vcf'))
    self.assertEqual(3, len(read_data))
    self._assert_variants_equal([variant_1, variant_2, variant_3], read_data)
def test_end2end(self):
  with TempDir() as temp_dir:
    file_path_prefix = temp_dir.create_temp_file('result')

    # Generate a TFRecord file.
    with TestPipeline() as p:
      expected_data = [self.create_inputs() for _ in range(0, 10)]
      _ = p | beam.Create(expected_data) | WriteToTFRecord(file_path_prefix)

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = p | ReadFromTFRecord(file_path_prefix + '-*')
      assert_that(actual_data, equal_to(expected_data))
def test_read_deflate(self):
  _, lines = write_data(15)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file()
    with open(file_name, 'wb') as f:
      f.write(zlib.compress('\n'.join(lines).encode('utf-8')))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.DEFLATE, True, coders.StrUtf8Coder())
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_gzip_with_skip_lines(self):
  _, lines = write_data(15)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
      # GzipFile is opened in binary mode, so the text must be encoded.
      f.write('\n'.join(lines).encode('utf-8'))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder(),
        skip_header_lines=2)
    assert_that(pcoll, equal_to(lines[2:]))
    pipeline.run()
def test_end2end_read_write_read(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result')
    with TestPipeline() as p:
      # Initial read to validate the pipeline doesn't fail before the file is
      # created.
      _ = p | ReadFromTFRecord(path + '-*', validate=False)
      expected_data = [self.create_inputs() for _ in range(0, 10)]
      _ = p | beam.Create(expected_data) | WriteToTFRecord(
          path, file_name_suffix='.gz')

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = p | ReadFromTFRecord(path + '-*', validate=True)
      assert_that(actual_data, equal_to(expected_data))
def test_write_record_multiple(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result')
    record = binascii.a2b_base64(FOO_BAR_RECORD_BASE64)
    sink = _TFRecordSink(
        path,
        coder=coders.BytesCoder(),
        file_name_suffix='',
        num_shards=0,
        shard_name_template=None,
        compression_type=CompressionTypes.UNCOMPRESSED)
    self._write_lines(sink, path, [b'foo', b'bar'])

    with open(path, 'rb') as f:
      self.assertEqual(f.read(), record)
def test_write_record_auto(self):
  with TempDir() as temp_dir:
    file_path_prefix = temp_dir.create_temp_file('result')
    with TestPipeline() as p:
      input_data = [b'foo', b'bar']
      _ = p | beam.Create(input_data) | WriteToTFRecord(
          file_path_prefix, file_name_suffix='.gz')

    actual = []
    file_name = glob.glob(file_path_prefix + '-*.gz')[0]
    for r in tf.python_io.tf_record_iterator(
        file_name,
        options=tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.GZIP)):
      actual.append(r)
    self.assertEqual(actual, input_data)
def test_process_multiple_globs(self):
  with TempDir() as temp_dir:
    globs = []
    for i in range(3):
      suffix = 'result' + str(i)
      self._write_glob(temp_dir, suffix)
      globs.append(temp_dir.get_path() + os.path.sep + '*' + suffix)

    with TestPipeline() as p:
      result = (p
                | Create(globs)
                | ReadAllFromTFRecord(
                    coder=coders.BytesCoder(),
                    compression_type=CompressionTypes.AUTO))
      assert_that(result, equal_to([b'foo', b'bar'] * 9))
def test_read_corrupted_deflate_fails(self):
  _, lines = write_data(15)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file()
    with open(file_name, 'wb') as f:
      f.write(zlib.compress('\n'.join(lines).encode('utf-8')))

    with open(file_name, 'wb') as f:
      f.write(b'corrupt')

    with self.assertRaises(Exception):
      with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.DEFLATE, True,
            coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
def test_read_corrupted_bzip2_fails(self):
  _, lines = write_data(15)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file()
    with bz2.BZ2File(file_name, 'wb') as f:
      f.write('\n'.join(lines).encode('utf-8'))

    with open(file_name, 'wb') as f:
      f.write(b'corrupt')

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name,
        compression_type=CompressionTypes.BZIP2)
    assert_that(pcoll, equal_to(lines))
    with self.assertRaises(Exception):
      pipeline.run()
def test_read_corrupted_gzip_fails(self):
  _, lines = write_data(15)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
      # write_data returns text lines; encode before writing to the binary
      # GzipFile handle.
      f.write('\n'.join(lines).encode('utf-8'))

    with open(file_name, 'wb') as f:
      f.write(b'corrupt')

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
    assert_that(pcoll, equal_to(lines))
    with self.assertRaises(Exception):
      pipeline.run()
def test_end2end_example_proto(self):
  with TempDir() as temp_dir:
    file_path_prefix = temp_dir.create_temp_file('result')

    example = tf.train.Example()
    example.features.feature['int'].int64_list.value.extend(list(range(3)))
    example.features.feature['bytes'].bytes_list.value.extend(
        [b'foo', b'bar'])

    with TestPipeline() as p:
      _ = p | beam.Create([example]) | WriteToTFRecord(
          file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = (p | ReadFromTFRecord(
          file_path_prefix + '-*',
          coder=beam.coders.ProtoCoder(example.__class__)))
      assert_that(actual_data, equal_to([example]))
def test_pubsub_to_gcs():
    PubSubToGCS.run(
        input_topic="unused",  # mocked by TestStream
        output_path="gs://{}/pubsub/{}/output".format(BUCKET, UUID),
        window_size=1,  # 1 minute
        pipeline_args=[
            "--project",
            PROJECT,
            "--temp_location",
            TempDir().get_path(),
        ],
    )

    # Check for output files on GCS.
    gcs_client = beam.io.gcp.gcsio.GcsIO()
    files = gcs_client.list_prefix("gs://{}/pubsub/{}".format(BUCKET, UUID))
    assert len(files) > 0

    # Clean up.
    gcs_client.delete_batch(list(files))
def test_read_bzip2_concat(self):
  with TempDir() as tempdir:
    bzip2_file_name1 = tempdir.create_temp_file()
    lines = ['a', 'b', 'c']
    with bz2.BZ2File(bzip2_file_name1, 'wb') as dst:
      data = '\n'.join(lines) + '\n'
      dst.write(data.encode('utf-8'))

    bzip2_file_name2 = tempdir.create_temp_file()
    lines = ['p', 'q', 'r']
    with bz2.BZ2File(bzip2_file_name2, 'wb') as dst:
      data = '\n'.join(lines) + '\n'
      dst.write(data.encode('utf-8'))

    bzip2_file_name3 = tempdir.create_temp_file()
    lines = ['x', 'y', 'z']
    with bz2.BZ2File(bzip2_file_name3, 'wb') as dst:
      data = '\n'.join(lines) + '\n'
      dst.write(data.encode('utf-8'))

    final_bzip2_file = tempdir.create_temp_file()
    with open(bzip2_file_name1, 'rb') as src, \
        open(final_bzip2_file, 'wb') as dst:
      dst.writelines(src.readlines())

    with open(bzip2_file_name2, 'rb') as src, \
        open(final_bzip2_file, 'ab') as dst:
      dst.writelines(src.readlines())

    with open(bzip2_file_name3, 'rb') as src, \
        open(final_bzip2_file, 'ab') as dst:
      dst.writelines(src.readlines())

    pipeline = TestPipeline()
    lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
        final_bzip2_file,
        compression_type=beam.io.filesystem.CompressionTypes.BZIP2)

    expected = ['a', 'b', 'c', 'p', 'q', 'r', 'x', 'y', 'z']
    assert_that(lines, equal_to(expected))
    pipeline.run()
def test_read_deflate_concat(self):
  with TempDir() as tempdir:
    deflate_file_name1 = tempdir.create_temp_file()
    lines = ['a', 'b', 'c']
    with open(deflate_file_name1, 'wb') as dst:
      data = '\n'.join(lines) + '\n'
      dst.write(zlib.compress(data.encode('utf-8')))

    deflate_file_name2 = tempdir.create_temp_file()
    lines = ['p', 'q', 'r']
    with open(deflate_file_name2, 'wb') as dst:
      data = '\n'.join(lines) + '\n'
      dst.write(zlib.compress(data.encode('utf-8')))

    deflate_file_name3 = tempdir.create_temp_file()
    lines = ['x', 'y', 'z']
    with open(deflate_file_name3, 'wb') as dst:
      data = '\n'.join(lines) + '\n'
      dst.write(zlib.compress(data.encode('utf-8')))

    final_deflate_file = tempdir.create_temp_file()
    with open(deflate_file_name1, 'rb') as src, \
        open(final_deflate_file, 'wb') as dst:
      dst.writelines(src.readlines())

    with open(deflate_file_name2, 'rb') as src, \
        open(final_deflate_file, 'ab') as dst:
      dst.writelines(src.readlines())

    with open(deflate_file_name3, 'rb') as src, \
        open(final_deflate_file, 'ab') as dst:
      dst.writelines(src.readlines())

    with TestPipeline() as pipeline:
      lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
          final_deflate_file,
          compression_type=beam.io.filesystem.CompressionTypes.DEFLATE)

      expected = ['a', 'b', 'c', 'p', 'q', 'r', 'x', 'y', 'z']
      assert_that(lines, equal_to(expected))
def test_pubsub_to_gcs():
    PubSubToGCS.run(
        input_topic="unused",  # mocked by TestStream
        output_path=f"gs://{BUCKET}/pubsub/{UUID}/output",
        window_size=1,  # 1 minute
        num_shards=1,
        pipeline_args=[
            "--project",
            PROJECT,
            "--temp_location",
            TempDir().get_path(),
        ],
    )

    # Check for output files on GCS.
    gcs_client = GcsIO()
    files = gcs_client.list_prefix(f"gs://{BUCKET}/pubsub/{UUID}")
    assert len(files) > 0

    # Clean up.
    gcs_client.delete_batch(list(files))
def test_read_gzip_large_after_splitting(self):
  _, lines = write_data(10000)
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
      f.write('\n'.join(lines).encode('utf-8'))

    source = TextSource(file_name, 0, CompressionTypes.GZIP, True,
                        coders.StrUtf8Coder())
    splits = list(source.split(desired_bundle_size=1000))

    if len(splits) > 1:
      raise ValueError('FileBasedSource generated more than one initial '
                       'split for a compressed file.')

    reference_source_info = (source, None, None)
    sources_info = ([
        (split.source, split.start_position, split.stop_position)
        for split in splits])
    source_test_utils.assert_sources_equal_reference_source(
        reference_source_info, sources_info)
def test_read_gzip_concat(self):
  with TempDir() as tempdir:
    gzip_file_name1 = tempdir.create_temp_file()
    lines = ['a', 'b', 'c']
    with gzip.open(gzip_file_name1, 'wb') as dst:
      # gzip.open in 'wb' mode expects bytes, so encode the text first.
      data = '\n'.join(lines) + '\n'
      dst.write(data.encode('utf-8'))

    gzip_file_name2 = tempdir.create_temp_file()
    lines = ['p', 'q', 'r']
    with gzip.open(gzip_file_name2, 'wb') as dst:
      data = '\n'.join(lines) + '\n'
      dst.write(data.encode('utf-8'))

    gzip_file_name3 = tempdir.create_temp_file()
    lines = ['x', 'y', 'z']
    with gzip.open(gzip_file_name3, 'wb') as dst:
      data = '\n'.join(lines) + '\n'
      dst.write(data.encode('utf-8'))

    final_gzip_file = tempdir.create_temp_file()
    with open(gzip_file_name1, 'rb') as src, \
        open(final_gzip_file, 'wb') as dst:
      dst.writelines(src.readlines())

    with open(gzip_file_name2, 'rb') as src, \
        open(final_gzip_file, 'ab') as dst:
      dst.writelines(src.readlines())

    with open(gzip_file_name3, 'rb') as src, \
        open(final_gzip_file, 'ab') as dst:
      dst.writelines(src.readlines())

    pipeline = TestPipeline()
    lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
        final_gzip_file,
        compression_type=beam.io.filesystem.CompressionTypes.GZIP)

    expected = ['a', 'b', 'c', 'p', 'q', 'r', 'x', 'y', 'z']
    assert_that(lines, equal_to(expected))
    # Run the pipeline so the assert_that check actually executes.
    pipeline.run()
def _create_temp_file_and_read_records(self, lines):
  with TempDir() as tempdir:
    file_name = tempdir.create_temp_file(suffix='.vcf', lines=lines)
    return self._read_records(file_name)
def test_read_reentrant_without_splitting(self):
  with TempDir() as tempdir:
    file_name = self._create_temp_vcf_file(
        _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
    source = VcfSource(file_name)
    source_test_utils.assert_reentrant_reads_succeed((source, None, None))