def test_pcolls_to_pcoll_id(self):
    """pcolls_to_pcoll_id maps str(pcoll) to its id in the runner API context."""
    runner = interactive_runner.InteractiveRunner()
    pipeline = beam.Pipeline(runner)
    # pylint: disable=range-builtin-not-iterating
    init_pcoll = pipeline | 'Init Create' >> beam.Impulse()
    _, ctx = pipeline.to_runner_api(use_fake_coders=True, return_context=True)

    actual = instr.pcolls_to_pcoll_id(pipeline, ctx)
    expected = {str(init_pcoll): 'ref_PCollection_PCollection_1'}
    self.assertEqual(actual, expected)
def source(self, *labels):
    """Returns the StreamingCacheManager source.

    The source is always a fresh beam.Impulse(); the given labels are not
    consulted. Unbounded sources will be marked with this transform and the
    PipelineInstrument will then replace these with a TestStream.
    """
    placeholder = beam.Impulse()
    return placeholder
def test_side_inputs(self):
    """Visit a pipeline with one AsList side input and check what was recorded."""

    class SplitNumbersFn(DoFn):
        def process(self, element):
            if element < 0:
                yield pvalue.TaggedOutput('tag_negative', element)
            else:
                yield element

    class ProcessNumbersFn(DoFn):
        def process(self, element, negatives):
            yield element

    # Build the graph one step at a time: read -> split -> consume side input.
    root_read = beam.Impulse()
    read_pcoll = self.pipeline | 'read' >> root_read
    result = read_pcoll | ParDo(SplitNumbersFn()).with_outputs(
        'tag_negative', main='positive')
    positive, negative = result
    _ = positive | ParDo(ProcessNumbersFn(), AsList(negative))

    self.pipeline.visit(self.visitor)

    root_transforms = []
    for applied in self.visitor.root_transforms:
        root_transforms.append(applied.transform)
    self.assertEqual(root_transforms, [root_read])
    self.assertEqual(len(self.visitor.step_names), 3)
    self.assertEqual(len(self.visitor.views), 1)
    self.assertTrue(isinstance(self.visitor.views[0], pvalue.AsList))
def test_pcoll_to_pcoll_id(self):
    """pcoll_to_pcoll_id maps str(pcoll) to its id in the runner API context."""
    runner = interactive_runner.InteractiveRunner()
    pipeline = beam.Pipeline(runner)
    # An in-memory cache manager is registered for this pipeline before the
    # graph is built.
    ie.current_env().set_cache_manager(InMemoryCache(), pipeline)
    # pylint: disable=bad-option-value
    init_pcoll = pipeline | 'Init Create' >> beam.Impulse()
    _, ctx = pipeline.to_runner_api(return_context=True)

    actual = instr.pcoll_to_pcoll_id(pipeline, ctx)
    expected = {str(init_pcoll): 'ref_PCollection_PCollection_1'}
    self.assertEqual(actual, expected)
def expand(self, pbegin):
    """Expand into Impulse -> GenerateKeys -> GenerateLoad.

    Args:
      pbegin: the pipeline start; must be a pvalue.PBegin.

    Returns:
      The output of the 'GenerateLoad' step, which is configured with
      ``num_records // num_keys`` and ``value_size``.
    """
    assert isinstance(pbegin, pvalue.PBegin), (
        'Input to transform must be a PBegin but found %s' % pbegin)

    impulse = pbegin | 'Impulse' >> beam.Impulse()

    key_fn = StatefulLoadGenerator.GenerateKeys(self.num_keys, self.key_size)
    keys = impulse | 'GenerateKeys' >> beam.ParDo(key_fn)

    records_per_key = self.num_records // self.num_keys
    load_fn = StatefulLoadGenerator.GenerateLoad(
        records_per_key, self.value_size)
    return keys | 'GenerateLoad' >> beam.ParDo(load_fn)
def build_write_pipeline(self, pipeline):
    """Attach the Kafka write steps to ``pipeline``.

    Generates range(NUM_RECORDS), reshuffles, maps every number to an
    (empty bytes key, encoded number) pair and writes the pairs to
    ``self.topic`` through WriteToKafka.
    """
    numbers = (
        pipeline
        | 'Impulse' >> beam.Impulse()
        | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS)))  # pylint: disable=range-builtin-not-iterating
    shuffled = numbers | 'Reshuffle' >> beam.Reshuffle()

    key_values = shuffled | 'MakeKV' >> beam.Map(
        lambda x: (b'', str(x).encode())).with_output_types(
            typing.Tuple[bytes, bytes])

    kafka_sink = WriteToKafka(
        producer_config={'bootstrap.servers': self.bootstrap_servers},
        topic=self.topic,
        expansion_service=self.expansion_service)
    _ = key_values | 'WriteToKafka' >> kafka_sink
def main(options):
    """Run a pipeline applying the GetEnv, Log, EnvTransform and NeedsEnv steps.

    Args:
      options: pipeline options handed to beam.Pipeline.
    """
    with beam.Pipeline(options=options) as pipe:
        impulse = pipe | 'Impulse' >> beam.Impulse()

        # Steps using the local environment.
        local_env = impulse | 'LocalEnv' >> GetEnv('FOO')
        local_logged = local_env | 'Local' >> Log(color='green')

        # Steps wrapped in EnvTransform with FOO=BAR.
        remote_env = local_logged | 'RemoteEnv' >> EnvTransform(
            GetEnv, {'FOO': 'BAR'}, 'FOO')
        remote_logged = remote_env | 'Remote' >> Log(color='cyan')
        asserted = remote_logged | 'AssertEnv' >> EnvTransform(
            AssertEnv, {'FOO': 'BAR'}, FOO='BAR')

        _ = asserted | 'ClsLvlNeedsEnv' >> NeedsEnv()
        # TODO:
        # | 'InstLvlNeedsEnv' >> NeedsEnv().with_env(FOO='BAZ')
def run_write_pipeline(
        self, num_rows, to_row_fn, row_type, spanner_transform=None):
    """Write ``num_rows`` generated rows through ``spanner_transform``.

    Args:
      num_rows: how many integers (``range(num_rows)``) to generate.
      to_row_fn: callable mapping each integer to a row.
      row_type: output type declared for the mapped rows.
      spanner_transform: factory called with the instance, database, project,
        table and emulator host keyword arguments. It is invoked
        unconditionally, so callers have to supply it despite the None default.
    """
    with TestPipeline(is_integration_test=True) as p:
        p.not_use_test_runner_api = True

        numbers = (
            p
            | 'Impulse' >> beam.Impulse()
            | 'Generate' >> beam.FlatMap(lambda x: range(num_rows)))  # pylint: disable=range-builtin-not-iterating
        rows = numbers | 'Map to row' >> beam.Map(to_row_fn).with_output_types(
            row_type)

        writer = spanner_transform(
            instance_id=self.instance_id,
            database_id=self.database_id,
            project_id=self.project_id,
            table=self.table,
            emulator_host=self.spanner_helper.get_emulator_host(),
        )
        _ = rows | 'Write to Spanner' >> writer
def send(pipeline_options):
    """Run a pipeline that keys the impulse element with 1 and logs it.

    Args:
      pipeline_options: options handed to beam.Pipeline.
    """
    with beam.Pipeline(options=pipeline_options) as pipe:
        impulse = pipe | beam.Impulse()
        keyed = impulse | beam.Map(lambda x: (1, x))
        _ = keyed | Log()
        # | 'KafkaWrite' >> WriteToKafka(
        #     producer_config={
        #         'bootstrap.servers': 'localhost:9092',
        #     },
        #     topic=TOPIC,
        #     # key_serializer='org.apache.kafka.common.serialization.ByteArraySerializer',
        #     # value_serializer='org.apache.kafka.common.serialization.ByteArraySerializer',
        #     expansion_service='localhost:8097',
        # )
def test_visit_entire_graph(self):
    """Visiting the pipeline reports every PCollection and both composites."""
    pipeline = Pipeline()
    root = pipeline | 'pcoll' >> beam.Impulse()
    after_do1 = root | 'do1' >> FlatMap(lambda x: [x + 1])
    after_do2 = after_do1 | 'do2' >> FlatMap(lambda x: [x + 1])
    after_do3 = after_do1 | 'do3' >> FlatMap(lambda x: [x + 1])
    transform = PipelineTest.CustomTransform()
    after_custom = after_do3 | transform

    visitor = PipelineTest.Visitor(visited=[])
    pipeline.visit(visitor)

    expected_visited = {root, after_do1, after_do2, after_do3, after_custom}
    self.assertEqual(expected_visited, set(visitor.visited))

    entered = visitor.enter_composite
    left = visitor.leave_composite
    self.assertEqual(set(entered), set(left))
    self.assertEqual(2, len(entered))
    self.assertEqual(entered[1].transform, transform)
    self.assertEqual(left[0].transform, transform)
def test_pardo_with_unbounded_per_element_dofn(self):
    """ParDo output is unbounded only for the unbounded_per_element DoFn."""

    class UnboundedDoFn(beam.DoFn):
        @beam.DoFn.unbounded_per_element()
        def process(self, element):
            pass

    class BoundedDoFn(beam.DoFn):
        def process(self, element):
            pass

    with TestPipeline() as p:
        source = p | beam.Impulse()
        unbounded_step = beam.ParDo(UnboundedDoFn())
        unbounded_pcoll = source | unbounded_step
        bounded_step = beam.ParDo(BoundedDoFn())
        bounded_pcoll = source | bounded_step

        self.assertEqual(unbounded_pcoll.is_bounded, False)
        self.assertEqual(bounded_pcoll.is_bounded, True)
def run_kinesis_write(self):
    """Write NUM_RECORDS records (RECORD + encoded index) via WriteToKinesis."""
    options = PipelineOptions(self.pipeline_args)
    with TestPipeline(options=options) as p:
        p.not_use_test_runner_api = True

        numbers = (
            p
            | 'Impulse' >> beam.Impulse()
            | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS)))  # pylint: disable=bad-option-value
        payloads = numbers | 'Map to bytes' >> beam.Map(
            lambda x: RECORD + str(x).encode()).with_output_types(bytes)

        kinesis_sink = WriteToKinesis(
            stream_name=self.aws_kinesis_stream,
            aws_access_key=self.aws_access_key,
            aws_secret_key=self.aws_secret_key,
            region=self.aws_region,
            service_endpoint=self.aws_service_endpoint,
            # Certificate verification is switched off when use_localstack is set.
            verify_certificate=(not self.use_localstack),
            partition_key='1',
            producer_properties=self.producer_properties,
        )
        _ = payloads | 'WriteToKinesis' >> kinesis_sink
def test_schema_autodetect_not_allowed_with_avro_file_loads(self):
    """AVRO temp files need an explicit schema: None and autodetect both fail."""
    missing_schema_error = '^A schema must be provided'
    autodetect_error = '^Schema auto-detection is not supported'

    with TestPipeline() as p:
        pc = p | beam.Impulse()

        # The transform is both built and applied inside each assertRaisesRegex
        # block.
        with self.assertRaisesRegex(ValueError, missing_schema_error):
            _ = pc | 'No Schema' >> beam.io.gcp.bigquery.WriteToBigQuery(
                "dataset.table",
                schema=None,
                temp_file_format=bigquery_tools.FileFormat.AVRO)

        with self.assertRaisesRegex(ValueError, autodetect_error):
            _ = pc | 'Schema Autodetected' >> beam.io.gcp.bigquery.WriteToBigQuery(
                "dataset.table",
                schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
                temp_file_format=bigquery_tools.FileFormat.AVRO)
def test_passthrough(self):
    """
    Test that PTransforms which pass through their input PCollection can be
    used with PipelineInfo.

    Passes when cache_label() raises for none of the reported PCollections.
    """

    class Passthrough(beam.PTransform):
        def expand(self, pcoll):
            return pcoll

    p = beam.Pipeline(runner=self.runner)
    impulse = p | beam.Impulse()
    _ = impulse | Passthrough()

    components = to_stable_runner_api(p).components
    info = pipeline_analyzer.PipelineInfo(components)
    for pcoll_id in info.all_pcollections():
        # FIXME: If PipelineInfo does not support passthrough PTransforms, this
        #        will only fail some of the time, depending on the ordering of
        #        transforms in the Pipeline proto.

        # Should not throw exception
        info.cache_label(pcoll_id)
def test_root_transforms(self):
    """The visitor records the Impulse and the Flatten as root transforms."""
    root_read = beam.Impulse()
    root_flatten = Flatten(pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_read = pbegin | 'read' >> root_read
    _ = pcoll_read | FlatMap(lambda x: x)
    # Flatten is applied to an empty list of inputs.
    _ = [] | 'flatten' >> root_flatten

    self.pipeline.visit(self.visitor)

    root_transforms = []
    for applied in self.visitor.root_transforms:
        root_transforms.append(applied.transform)
    self.assertCountEqual(root_transforms, [root_read, root_flatten])

    pbegin_consumers = []
    for consumer in self.visitor.value_to_consumers[pbegin]:
        pbegin_consumers.append(consumer.transform)
    self.assertCountEqual(pbegin_consumers, [root_read])
    self.assertEqual(len(self.visitor.step_names), 3)
def test_side_inputs(self):
    """One AsList view consumed by two ParDo steps is recorded as one view."""

    class SplitNumbersFn(DoFn):
        def process(self, element):
            if element < 0:
                yield pvalue.TaggedOutput('tag_negative', element)
            else:
                yield element

    class ProcessNumbersFn(DoFn):
        def process(self, element, negatives):
            yield element

    root_read = beam.Impulse()
    read_pcoll = self.pipeline | 'read' >> root_read
    result = read_pcoll | ParDo(SplitNumbersFn()).with_outputs(
        'tag_negative', main='positive')
    positive, negative = result

    # The same AsList view object is handed to both processing steps.
    negatives = AsList(negative)
    first_output = positive | 'process numbers step 1' >> ParDo(
        ProcessNumbersFn(), negatives)
    second_output = first_output | 'process numbers step 2' >> ParDo(
        ProcessNumbersFn(), negatives)
    _ = (first_output, second_output) | 'flatten results' >> beam.Flatten()

    self.pipeline.visit(self.visitor)

    root_transforms = []
    for applied in self.visitor.root_transforms:
        root_transforms.append(applied.transform)
    self.assertEqual(root_transforms, [root_read])
    self.assertEqual(len(self.visitor.step_names), 5)
    self.assertEqual(len(self.visitor.views), 1)
    self.assertTrue(isinstance(self.visitor.views[0], pvalue.AsList))
def test_pardo_state_with_custom_key_coder(self):
    """Tests that state requests work correctly when the key coder is an
    SDK-specific coder, i.e. non standard coder. This is additionally enforced
    by Java's ProcessBundleDescriptorsTest and by Flink's
    ExecutableStageDoFnOperator which detects invalid encoding by checking for
    the correct key group of the encoded key."""
    index_state_spec = userstate.CombiningValueStateSpec('index', sum)

    # Test params
    # Ensure decent amount of elements to serve all partitions
    n = 200
    duplicates = 1
    split = n // (duplicates + 1)

    inputs = []
    expected = []
    for i in range(n):
        key = i % split
        inputs.append((key, str(key)))
        # Each key occurs once per block of `split` elements, so the expected
        # running index is the number of the block the element falls in.
        expected.append((key, str(key), i // split + 1))

    # Use a DoFn which has to use FastPrimitivesCoder because the type cannot
    # be inferred
    class Input(beam.DoFn):
        def process(self, impulse):
            for i in inputs:
                yield i

    class AddIndex(beam.DoFn):
        def process(self, kv, index=beam.DoFn.StateParam(index_state_spec)):
            k, v = kv
            index.add(1)
            yield k, v, index.read()

    with self.create_pipeline() as p:
        indexed = (
            p
            | beam.Impulse()
            | beam.ParDo(Input())
            | beam.ParDo(AddIndex()))
        assert_that(indexed, equal_to(expected))
def run_write(self):
    """Write NUM_RECORDS generated TestRow values through WriteToSnowflake."""

    def user_data_mapper(test_row):
        return [
            str(test_row.number_column).encode('utf-8'),
            str(test_row.boolean_column).encode('utf-8'),
            binascii.hexlify(test_row.bytes_column),
        ]

    options = PipelineOptions(self.pipeline_args)
    with TestPipeline(options=options) as p:
        p.not_use_test_runner_api = True

        numbers = (
            p
            | 'Impulse' >> beam.Impulse()
            | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS)))  # pylint: disable=range-builtin-not-iterating
        rows = numbers | 'Map to TestRow' >> beam.Map(
            lambda num: TestRow(
                num, num % 2 == 0, b"test" + str(num).encode()))

        snowflake_sink = WriteToSnowflake(
            server_name=self.server_name,
            username=self.username,
            password=self.password,
            o_auth_token=self.o_auth_token,
            private_key_path=self.private_key_path,
            raw_private_key=self.raw_private_key,
            private_key_passphrase=self.private_key_passphrase,
            schema=self.schema,
            database=self.database,
            role=self.role,
            warehouse=self.warehouse,
            staging_bucket_name=self.staging_bucket_name,
            storage_integration_name=self.storage_integration_name,
            create_disposition=CreateDisposition.CREATE_IF_NEEDED,
            write_disposition=WriteDisposition.TRUNCATE,
            table_schema=SCHEMA_STRING,
            user_data_mapper=user_data_mapper,
            table=self.table,
            query=None,
            expansion_service=self.expansion_service,
        )
        _ = rows | snowflake_sink
def test(self):
    """Build the side-input load test pipeline.

    Reads synthetic records, optionally samples and windows them to form the
    side input, then attaches a DoFn that touches
    ``side_input_size * access_percentage // 100`` side-input elements. The
    access pattern (mapping lookups vs. sequential iteration) depends on the
    view type returned by ``self.materialize_as()``.
    """

    class SequenceSideInputTestDoFn(beam.DoFn):
        """Iterate over first n side_input elements."""
        def __init__(self, first_n):
            self._first_n = first_n

        def process(self, unused_element, side_input):
            i = 0
            it = iter(side_input)
            while i < self._first_n:
                i += 1
                try:
                    # No-op. We only make sure that the element is accessed.
                    next(it)
                except StopIteration:
                    return

    class MappingSideInputTestDoFn(beam.DoFn):
        """Take a sequence of keys as an additional side input and for each
        key in the sequence checks the value for key in the dictionary."""
        def process(self, unused_element, dict_side_input, keys_to_check):
            for key in keys_to_check:
                # No-op. We only make sure that the element is accessed.
                dict_side_input[key]

    class GetRandomKeys(beam.DoFn):
        """Emit up to n randomly chosen keys of the dictionary side input."""
        def __init__(self, n):
            self._n = n

        def process(self, unused_element, dict_side_input):
            import random
            n = min(self._n, len(dict_side_input))
            # random.sample() requires a sequence; a dict keys view is rejected
            # with TypeError on Python 3.11+, so materialize the keys first.
            return random.sample(list(dict_side_input.keys()), n)

    class AddEventTimestamps(beam.DoFn):
        """Assign timestamp to each element of PCollection."""
        def setup(self):
            self._timestamp = 0

        def process(self, element):
            from apache_beam.transforms.combiners import window
            yield window.TimestampedValue(element, self._timestamp)
            self._timestamp += 1

    input_pc = (
        self.pipeline
        | 'Read synthetic' >> beam.io.Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Collect start time metrics' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))

    # Sample only when the requested side-input size differs from the number
    # of records configured in the input options; otherwise reuse the input.
    if self.side_input_size != self.input_options.get('num_records'):
        side_input = (
            input_pc
            | 'Sample {} elements'.format(self.side_input_size) >>
            beam.combiners.Sample.FixedSizeGlobally(self.side_input_size)
            | 'Flatten a sequence' >> beam.FlatMap(lambda x: x))
    else:
        side_input = input_pc

    if self.windows > 0:
        window_size = self.side_input_size / self.windows
        logging.info('Fixed windows of %s seconds will be applied', window_size)
        side_input = (
            side_input
            | 'Add event timestamps' >> beam.ParDo(AddEventTimestamps())
            | 'Apply windows' >> beam.WindowInto(
                beam.combiners.window.FixedWindows(window_size)))

    side_input_type = self.materialize_as()
    elements_to_access = self.side_input_size * self.access_percentage // 100
    logging.info(
        '%s out of %s total elements in the side input will be '
        'accessed.',
        elements_to_access,
        self.side_input_size)

    if side_input_type is beam.pvalue.AsDict:
        # Mapping access: pick random keys first, then look each of them up.
        random_keys = (
            self.pipeline
            | beam.Impulse()
            | 'Get random keys' >> beam.ParDo(
                GetRandomKeys(elements_to_access),
                beam.pvalue.AsDict(side_input)))
        pc = input_pc | beam.ParDo(
            MappingSideInputTestDoFn(),
            side_input_type(side_input),
            beam.pvalue.AsList(random_keys))
    else:
        # Sequence access: iterate over the first elements of the side input.
        pc = input_pc | beam.ParDo(
            SequenceSideInputTestDoFn(elements_to_access),
            side_input_type(side_input))

    _ = pc | 'Collect end time metrics' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))
def test_impulse(self):
    """Impulse on the BundleBasedDirectRunner yields one empty bytes element."""
    with test_pipeline.TestPipeline(runner='BundleBasedDirectRunner') as p:
        impulse_output = p | beam.Impulse()
        expected = equal_to([b''])
        assert_that(impulse_output, expected)
def run(argv=None):
    """Run a pipeline that applies MysqlDoFn to a single impulse element.

    Args:
      argv: command-line arguments used to build the PipelineOptions.
    """
    options = PipelineOptions(argv)
    setup_options = options.view_as(SetupOptions)
    setup_options.save_main_session = True

    with Pipeline(options=options) as p:
        impulse = p | beam.Impulse()
        _ = impulse | beam.ParDo(MysqlDoFn())