def test(self):
  def join_fn(element, side_input, iterations):
    result = []
    for i in range(iterations):
      for key, value in side_input:
        if i == iterations - 1:
          result.append({key: element[1] + value})
    yield result

  main_input = (
      self.pipeline
      | "Read pcoll 1" >> beam.io.Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Measure time: Start pcoll 1' >> beam.ParDo(
          MeasureTime(self.metrics_namespace)))

  side_input = (
      self.pipeline
      | "Read pcoll 2" >> beam.io.Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Measure time: Start pcoll 2' >> beam.ParDo(
          MeasureTime(self.metrics_namespace)))
  # pylint: disable=expression-not-assigned
  (
      main_input
      | "Merge" >> beam.ParDo(join_fn, AsIter(side_input), self.iterations)
      | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace)))
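# A minimal, self-contained sketch of the same AsIter join pattern, swapping
# SyntheticSource for an in-memory Create so it runs on any runner. The
# element values and iteration count are illustrative assumptions, not part
# of the load test. Note that beam.ParDo accepts a plain callable, which it
# wraps like FlatMap, so the generator function works here as above.
import apache_beam as beam
from apache_beam.pvalue import AsIter


def _join_fn_sketch(element, side_input, iterations):
  result = []
  for i in range(iterations):
    for key, value in side_input:  # the side input is re-iterated each pass
      if i == iterations - 1:
        result.append({key: element[1] + value})
  yield result


with beam.Pipeline() as p:
  main = p | 'Main' >> beam.Create([(b'k', b'main-')])
  side = p | 'Side' >> beam.Create([(b'a', b'1'), (b'b', b'2')])
  _ = main | beam.ParDo(_join_fn_sketch, AsIter(side), 3)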
def test(self):
  class CounterOperation(beam.DoFn):
    def __init__(self, number_of_counters, number_of_operations):
      self.number_of_operations = number_of_operations
      self.counters = []
      for i in range(number_of_counters):
        self.counters.append(
            Metrics.counter('do-not-publish', 'name-{}'.format(i)))

    def process(self, element):
      for _ in range(self.number_of_operations):
        for counter in self.counters:
          counter.inc()
      yield element

  pc = (
      self.pipeline
      | 'Read synthetic' >> beam.io.Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Measure time: Start' >> beam.ParDo(
          MeasureTime(self.metrics_namespace)))

  for i in range(self.iterations):
    pc = (
        pc
        | 'Step: %d' % i >> beam.ParDo(
            CounterOperation(
                self.number_of_counters, self.number_of_operations)))

  # pylint: disable=expression-not-assigned
  (
      pc
      | 'Measure time: End' >> beam.ParDo(
          MeasureTime(self.metrics_namespace)))
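# Hedged sketch: after a pipeline like the one above runs, the
# 'do-not-publish' counters can be inspected through the metrics API. The
# pipeline below is an in-memory stand-in and the counter name is
# illustrative.
import apache_beam as beam
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter


def _inc(element):
  Metrics.counter('do-not-publish', 'name-0').inc()
  return element


p = beam.Pipeline()
_ = p | beam.Create([1, 2, 3]) | beam.Map(_inc)
result = p.run()
result.wait_until_finish()
for counter in result.metrics().query(
    MetricsFilter().with_namespace('do-not-publish'))['counters']:
  print(counter.key.metric.name, counter.committed)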
def _create_input_data(self):
  """
  Runs an additional pipeline which creates test data and waits for its
  completion.
  """
  SCHEMA = parse_table_schema_from_json(
      '{"fields": [{"name": "data", "type": "BYTES"}]}')

  def format_record(record):
    # Synthetic Source emits (key, value) records; only the value part is
    # written out.
    import base64
    return {'data': base64.b64encode(record[1])}

  with TestPipeline() as p:
    (  # pylint: disable=expression-not-assigned
        p
        | 'Produce rows' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Format' >> Map(format_record)
        | 'Write to BigQuery' >> WriteToBigQuery(
            dataset=self.input_dataset,
            table=self.input_table,
            schema=SCHEMA,
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_EMPTY))
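# Hedged sketch of consuming the table created above. The table spec string
# is an assumed placeholder; ReadFromBigQuery also accepts a TableReference,
# and actual execution needs GCP credentials and a temp location, just like
# the writer.
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.io.ReadFromBigQuery(table='input_dataset.input_table')
      | beam.Map(lambda row: row['data']))  # each row is a dict by column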
def test(self):
  if self.get_option_or_default('use_stateful_load_generator', False):
    source = (
        self.pipeline
        | 'LoadGenerator' >> StatefulLoadGenerator(self.input_options)
        | beam.ParDo(AssignTimestamps())
        | beam.WindowInto(window.FixedWindows(20)))
  else:
    source = (
        self.pipeline
        | 'Read synthetic' >> beam.io.Read(
            SyntheticSource(self.parse_synthetic_source_options())))

  pc = (
      source
      | 'Measure time: Start' >> beam.ParDo(
          MeasureTime(self.metrics_namespace)))

  for branch in range(self.fanout):
    (  # pylint: disable=expression-not-assigned
        pc
        | 'Combine with Top %i' % branch >> beam.CombineGlobally(
            beam.combiners.TopCombineFn(self.top_count)).without_defaults()
        | 'Consume %i' % branch >> beam.ParDo(self._GetElement())
        | 'Measure time: End %i' % branch >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))
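# Note (hedged): .without_defaults() above matters on the windowed branch.
# CombineGlobally normally emits a default value for empty input, which is
# only defined in the global window; for a windowed PCollection, Beam
# requires .without_defaults() or .as_singleton_view(). A minimal
# illustration with in-memory, explicitly timestamped data:
import apache_beam as beam
from apache_beam.transforms import window

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('k', 1), ('k', 2)])
      | beam.Map(lambda kv: window.TimestampedValue(kv, kv[1]))
      | beam.WindowInto(window.FixedWindows(20))
      | beam.CombineGlobally(
          beam.combiners.TopCombineFn(1)).without_defaults())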
def test(self):
  def format_record(record):
    import base64
    return base64.b64encode(record[1])

  def make_insert_mutations(element):
    import uuid  # pylint: disable=reimported
    from apache_beam.io.gcp.experimental.spannerio import WriteMutation
    ins_mutation = WriteMutation.insert(
        table='test',
        columns=('id', 'data'),
        values=[(str(uuid.uuid1()), element)])
    return [ins_mutation]

  (  # pylint: disable=expression-not-assigned
      self.pipeline
      | 'Produce rows' >> Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
      | 'Format' >> Map(format_record)
      | 'Make mutations' >> FlatMap(make_insert_mutations)
      | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
      | 'Write to Spanner' >> WriteToSpanner(
          project_id=self.project,
          instance_id=self.spanner_instance,
          database_id=self.TEST_DATABASE,
          max_batch_size_bytes=5120))
def _create_input_data(self):
  """
  Runs an additional pipeline which creates test data and waits for its
  completion.
  """
  def format_record(record):
    import base64
    return base64.b64encode(record[1])

  def make_insert_mutations(element):
    import uuid
    from apache_beam.io.gcp.experimental.spannerio import WriteMutation
    ins_mutation = WriteMutation.insert(
        table='test_data',
        columns=('id', 'data'),
        values=[(str(uuid.uuid1()), element)])
    return [ins_mutation]

  with TestPipeline() as p:
    (  # pylint: disable=expression-not-assigned
        p
        | 'Produce rows' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Format' >> Map(format_record)
        | 'Make mutations' >> FlatMap(make_insert_mutations)
        | 'Write to Spanner' >> WriteToSpanner(
            project_id=self.project,
            instance_id=self.spanner_instance,
            database_id=self.spanner_database,
            max_batch_size_bytes=5120))
def test(self):
  class CounterOperation(beam.DoFn):
    def __init__(self, number_of_counters, number_of_operations):
      self.number_of_operations = number_of_operations
      self.counters = []
      for i in range(number_of_counters):
        self.counters.append(
            Metrics.counter('do-not-publish', 'name-{}'.format(i)))

    # The class body executes in the scope of the enclosing test method, so
    # `self` here refers to the test instance, not to a CounterOperation.
    state_param = beam.DoFn.StateParam(
        userstate.CombiningValueStateSpec(
            'count',
            beam.coders.IterableCoder(beam.coders.VarIntCoder()),
            sum)) if self.stateful else None

    def process(self, element, state=state_param):
      for _ in range(self.number_of_operations):
        for counter in self.counters:
          counter.inc()
      if state:
        state.add(1)
      yield element

  if self.get_option_or_default('streaming', False):
    source = (
        self.pipeline
        | 'LoadGenerator' >> StatefulLoadGenerator(self.input_options))
  else:
    source = (
        self.pipeline
        | 'Read synthetic' >> beam.io.Read(
            SyntheticSource(self.parse_synthetic_source_options())))

  pc = (
      source
      | 'Measure time: Start' >> beam.ParDo(
          MeasureTime(self.metrics_namespace))
      | 'Assign timestamps' >> beam.ParDo(AssignTimestamps()))

  for i in range(self.iterations):
    pc = (
        pc
        | 'Step: %d' % i >> beam.ParDo(
            CounterOperation(
                self.number_of_counters, self.number_of_operations)))

  # pylint: disable=expression-not-assigned
  (
      pc
      | 'Measure latency' >> beam.ParDo(
          MeasureLatency(self.metrics_namespace))
      | 'Measure time: End' >> beam.ParDo(
          MeasureTime(self.metrics_namespace)))
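# Hedged sketch of the user-state mechanics used above, on keyed in-memory
# data: state is partitioned per key and window, so elements must be
# (key, value) pairs. The combine_fn-only spec matches the semantics, though
# not the exact coder setup, of the test.
import apache_beam as beam
from apache_beam.transforms import userstate


class CountPerKey(beam.DoFn):
  COUNT = userstate.CombiningValueStateSpec('count', combine_fn=sum)

  def process(self, element, count=beam.DoFn.StateParam(COUNT)):
    count.add(1)  # accumulated separately for each key
    yield element


with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('k', 1), ('k', 2), ('j', 3)])
      | beam.ParDo(CountPerKey()))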
def test(self):
  input = (
      self.pipeline
      | beam.io.Read(SyntheticSource(self.parse_synthetic_source_options()))
      | 'Measure time: Start' >> beam.ParDo(
          MeasureTime(self.metrics_namespace)))

  for branch in range(self.fanout):
    (  # pylint: disable=expression-not-assigned
        input
        | 'Combine with Top %i' % branch >> beam.CombineGlobally(
            beam.combiners.TopCombineFn(self.top_count))
        | 'Consume %i' % branch >> beam.ParDo(self._GetElement())
        | 'Measure time: End %i' % branch >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))
def test(self):
  pc = (
      self.pipeline
      | beam.io.Read(SyntheticSource(self.parse_synthetic_source_options()))
      | 'Measure time: Start' >> beam.ParDo(
          MeasureTime(self.metrics_namespace)))

  for branch in range(self.fanout):
    (  # pylint: disable=expression-not-assigned
        pc
        | 'GroupByKey %i' % branch >> beam.GroupByKey()
        | 'Ungroup %i' % branch >> beam.ParDo(
            self._UngroupAndReiterate(), self.iterations)
        | 'Measure time: End %i' % branch >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))
def test(self):
  SCHEMA = parse_table_schema_from_json(
      '{"fields": [{"name": "data", "type": "BYTES"}]}')

  def format_record(record):
    # Synthetic Source emits (key, value) records; only the value part is
    # written out.
    import base64
    return {'data': base64.b64encode(record[1])}

  # pylint: disable=expression-not-assigned
  (
      self.pipeline
      | 'ProduceRows' >> Read(
          SyntheticSource(self.parseTestPipelineOptions()))
      | 'Format' >> Map(format_record)
      | 'WriteToBigQuery' >> WriteToBigQuery(
          self.output_dataset + '.' + self.output_table,
          schema=SCHEMA,
          create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=BigQueryDisposition.WRITE_EMPTY))
def test(self):
  def to_pubsub_message(element):
    import uuid
    from apache_beam.io import PubsubMessage
    return PubsubMessage(
        data=element[1],
        attributes={'id': str(uuid.uuid1()).encode('utf-8')},
    )

  _ = (
      self.pipeline
      | 'Create input' >> Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Format to pubsub message in bytes' >> beam.Map(to_pubsub_message)
      | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      | 'Write to Pubsub' >> beam.io.WriteToPubSub(
          self.topic_name,
          with_attributes=True,
          id_label='id',
      ))
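# Hedged sketch of the matching consumer: reading the messages back with the
# same id_label so the runner can deduplicate on the 'id' attribute. The
# subscription path is an assumed placeholder, and id_label is a
# runner-dependent feature (the Python DirectRunner does not honor it, so
# tests like this typically run on Dataflow in streaming mode).
import apache_beam as beam

p = beam.Pipeline()  # requires --streaming on a runner supporting id_label
_ = (
    p
    | beam.io.ReadFromPubSub(
        subscription='projects/<project>/subscriptions/<sub>',
        with_attributes=True,
        id_label='id')
    | beam.Map(lambda msg: msg.data))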
def test(self):
  SCHEMA = parse_table_schema_from_json(
      '{"fields": [{"name": "data", "type": "BYTES"}]}')

  def format_record(record):
    # Synthetic Source emits (key, value) records; only the value part is
    # written out.
    import base64
    return {'data': base64.b64encode(record[1])}

  (  # pylint: disable=expression-not-assigned
      self.pipeline
      | 'Produce rows' >> Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
      | 'Format' >> Map(format_record)
      | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
      | 'Write to BigQuery' >> WriteToBigQuery(
          dataset=self.output_dataset,
          table=self.output_table,
          schema=SCHEMA,
          create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
def test(self):
  class SequenceSideInputTestDoFn(beam.DoFn):
    """Iterates over the first n side input elements."""
    def __init__(self, first_n):
      self._first_n = first_n

    def process(self, unused_element, side_input):
      i = 0
      it = iter(side_input)
      while i < self._first_n:
        i += 1
        try:
          # No-op. We only make sure that the element is accessed.
          next(it)
        except StopIteration:
          return

  class MappingSideInputTestDoFn(beam.DoFn):
    """Takes a sequence of keys as an additional side input and looks up
    each key in the dictionary side input."""
    def process(self, unused_element, dict_side_input, keys_to_check):
      for key in keys_to_check:
        # No-op. We only make sure that the element is accessed.
        dict_side_input[key]

  class GetRandomKeys(beam.DoFn):
    def __init__(self, n):
      self._n = n

    def process(self, unused_element, dict_side_input):
      import random
      n = min(self._n, len(dict_side_input))
      return random.sample(list(dict_side_input), n)

  class AddEventTimestamps(beam.DoFn):
    """Assigns a timestamp to each element of the PCollection."""
    def setup(self):
      self._timestamp = 0

    def process(self, element):
      from apache_beam.transforms import window
      yield window.TimestampedValue(element, self._timestamp)
      self._timestamp += 1

  input_pc = (
      self.pipeline
      | 'Read synthetic' >> beam.io.Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Collect start time metrics' >> beam.ParDo(
          MeasureTime(self.metrics_namespace)))

  if self.side_input_size != self.input_options.get('num_records'):
    side_input = (
        input_pc
        | 'Sample {} elements'.format(self.side_input_size) >>
        beam.combiners.Sample.FixedSizeGlobally(self.side_input_size)
        | 'Flatten a sequence' >> beam.FlatMap(lambda x: x))
  else:
    side_input = input_pc

  if self.windows > 0:
    window_size = self.side_input_size / self.windows
    logging.info('Fixed windows of %s seconds will be applied', window_size)
    side_input = (
        side_input
        | 'Add event timestamps' >> beam.ParDo(AddEventTimestamps())
        | 'Apply windows' >> beam.WindowInto(
            beam.window.FixedWindows(window_size)))

  side_input_type = self.materialize_as()
  elements_to_access = self.side_input_size * self.access_percentage // 100
  logging.info(
      '%s out of %s total elements in the side input will be accessed.',
      elements_to_access,
      self.side_input_size)
  if side_input_type is beam.pvalue.AsDict:
    random_keys = (
        self.pipeline
        | beam.Impulse()
        | 'Get random keys' >> beam.ParDo(
            GetRandomKeys(elements_to_access),
            beam.pvalue.AsDict(side_input)))
    pc = input_pc | beam.ParDo(
        MappingSideInputTestDoFn(),
        side_input_type(side_input),
        beam.pvalue.AsList(random_keys))
  else:
    pc = input_pc | beam.ParDo(
        SequenceSideInputTestDoFn(elements_to_access),
        side_input_type(side_input))

  _ = pc | 'Collect end time metrics' >> beam.ParDo(
      MeasureTime(self.metrics_namespace))
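# Hedged sketch of the materialization strategies the test selects among via
# materialize_as() (assumed to return one of beam.pvalue.AsIter, AsList, or
# AsDict): AsIter streams the side input, AsList loads it into memory as a
# list, and AsDict builds a lookup table from (key, value) pairs.
import apache_beam as beam

with beam.Pipeline() as p:
  side = p | 'Side' >> beam.Create([('a', 1), ('b', 2)])
  main = p | 'Main' >> beam.Create([None])
  _ = main | 'Lookup' >> beam.Map(
      lambda _, d: d['a'], beam.pvalue.AsDict(side))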