def testCoGroupByKey(self):
    """Join two synthetic inputs with CoGroupByKey and log distributions.

    Both inputs are read from a ``SyntheticSource`` and every element ``x``
    is mapped to the pair ``(x, x)``. The two collections are joined under
    the keys ``INPUT_TAG`` and ``CO_INPUT_TAG``; the joined output is passed
    through ``self._Ungroup()`` and ``MeasureTime()``.

    The pipeline is run exactly once, by the explicit ``run()`` call below.
    It is deliberately not wrapped in ``with self.pipeline``: leaving a
    pipeline ``with`` block runs the pipeline as well, so combining it with
    an explicit ``run()`` executed the whole load test twice.
    """
    pipeline = self.pipeline

    pc1 = (
        pipeline
        | 'Read ' + INPUT_TAG >> beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self.parseTestPipelineOptions(self.inputOptions)))
        | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x)))

    pc2 = (
        pipeline
        | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self.parseTestPipelineOptions(self.coInputOptions)))
        | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x)))

    # pylint: disable=expression-not-assigned
    (
        {INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
        | 'CoGroupByKey: ' >> beam.CoGroupByKey()
        | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
        | 'Measure time' >> beam.ParDo(MeasureTime()))

    result = pipeline.run()
    result.wait_until_finish()

    # Only the distribution metrics are reported by this test.
    metrics = result.metrics().query()
    for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
def test(self):
    """Assemble the CoGroupByKey load-test pipeline.

    Two synthetic inputs are read and each is passed through a
    ``MeasureTime`` step. They are then joined with ``CoGroupByKey`` under
    ``self.INPUT_TAG`` / ``self.CO_INPUT_TAG``, consumed by
    ``self._UngroupAndReiterate`` (given ``self.iterations`` as an extra
    argument) and passed through a final ``MeasureTime`` step.

    This method only builds the pipeline; it does not call ``run()``.
    """
    def read_and_time(tag, source_options, start_label):
        """Read one synthetic input and attach its start-time step."""
        source = synthetic_pipeline.SyntheticSource(source_options)
        records = self.pipeline | 'Read ' + tag >> beam.io.Read(source)
        return records | start_label >> beam.ParDo(
            MeasureTime(self.metrics_namespace))

    pc1 = read_and_time(
        self.INPUT_TAG,
        self.parse_synthetic_source_options(),
        'Measure time: Start pc1')
    pc2 = read_and_time(
        self.CO_INPUT_TAG,
        self.parse_synthetic_source_options(self.co_input_options),
        'Measure time: Start pc2')

    tagged_inputs = {self.INPUT_TAG: pc1, self.CO_INPUT_TAG: pc2}
    joined = tagged_inputs | 'CoGroupByKey ' >> beam.CoGroupByKey()
    consumed = joined | 'Consume Joined Collections' >> beam.ParDo(
        self._UngroupAndReiterate(self.INPUT_TAG, self.CO_INPUT_TAG),
        self.iterations)

    # pylint: disable=expression-not-assigned
    (consumed | 'Measure time: End' >> beam.ParDo(
        MeasureTime(self.metrics_namespace)))
def testCoGroupByKey(self):
    """Run the CoGroupByKey load test and publish its metrics.

    Each synthetic input is read, every element ``x`` is mapped to
    ``(x, x)``, and a ``MeasureTime`` step is attached. The two inputs are
    joined with ``CoGroupByKey``, consumed by ``self._Ungroup()`` and timed
    again. The pipeline is then run to completion, and the result is handed
    to ``self.metrics_monitor`` when one is configured.
    """
    def keyed_input(tag, options, start_label):
        """Read one synthetic input, pair each element with itself, time it."""
        source = synthetic_pipeline.SyntheticSource(
            self.parseTestPipelineOptions(options))
        records = self.pipeline | 'Read ' + tag >> beam.io.Read(source)
        keyed = records | 'Make ' + tag + ' iterable' >> beam.Map(
            lambda x: (x, x))
        return keyed | start_label >> beam.ParDo(
            MeasureTime(self.metrics_namespace))

    pc1 = keyed_input(INPUT_TAG, self.input_options, 'Measure time: Start pc1')
    pc2 = keyed_input(
        CO_INPUT_TAG, self.co_input_options, 'Measure time: Start pc2')

    joined = (
        {INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
        | 'CoGroupByKey: ' >> beam.CoGroupByKey())
    consumed = joined | 'Consume Joined Collections' >> beam.ParDo(
        self._Ungroup())

    # pylint: disable=expression-not-assigned
    (consumed | 'Measure time: End' >> beam.ParDo(
        MeasureTime(self.metrics_namespace)))

    result = self.pipeline.run()
    result.wait_until_finish()

    monitor = self.metrics_monitor
    if monitor is not None:
        monitor.send_metrics(result)
def testSideInput(self):
    """Assemble the side-input load-test pipeline.

    A main synthetic input is combined with a second synthetic input that
    is supplied as an iterable side input (``AsIter``). Both inputs, and
    the merged output, are passed through ``MeasureTime`` steps.

    This method only builds the pipeline; it does not call ``run()``.
    """
    def join_fn(element, side_input, iterations):
        """Iterate the side input ``iterations`` times and join the last pass.

        On the final pass one ``{key: element[1] + value}`` dict is
        collected per side-input pair; the collected list is yielded as a
        single output element.
        """
        # Named ``joined`` rather than ``list`` so the builtin is not shadowed.
        joined = []
        for i in range(iterations):
            for key, value in side_input:
                if i == iterations - 1:
                    joined.append({key: element[1] + value})
        yield joined

    main_input = (
        self.pipeline
        | "Read pcoll 1" >> beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self.parseTestPipelineOptions()))
        | 'Measure time: Start pcoll 1' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))

    side_input = (
        self.pipeline
        | "Read pcoll 2" >> beam.io.Read(
            synthetic_pipeline.SyntheticSource(self._getSideInput()))
        | 'Measure time: Start pcoll 2' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))

    # pylint: disable=expression-not-assigned
    (
        main_input
        | "Merge" >> beam.ParDo(join_fn, AsIter(side_input), self.iterations)
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace)))
def testCoGroupByKey(self):
    """Assemble the CoGroupByKey load-test pipeline.

    Each synthetic input is read, every element ``x`` is mapped to
    ``(x, x)``, and a ``MeasureTime`` step is attached. The inputs are
    joined with ``CoGroupByKey``, consumed by ``self._UngroupAndReiterate()``
    (given ``self.iterations`` as an extra argument) and timed again.

    This method only builds the pipeline; it does not call ``run()``.
    """
    def keyed_input(tag, options, start_label):
        """Read one synthetic input, pair each element with itself, time it."""
        source = synthetic_pipeline.SyntheticSource(
            self.parseTestPipelineOptions(options))
        records = self.pipeline | 'Read ' + tag >> beam.io.Read(source)
        keyed = records | 'Make ' + tag + ' iterable' >> beam.Map(
            lambda x: (x, x))
        return keyed | start_label >> beam.ParDo(
            MeasureTime(self.metrics_namespace))

    pc1 = keyed_input(INPUT_TAG, self.input_options, 'Measure time: Start pc1')
    pc2 = keyed_input(
        CO_INPUT_TAG, self.co_input_options, 'Measure time: Start pc2')

    joined = (
        {INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
        | 'CoGroupByKey ' >> beam.CoGroupByKey())
    consumed = joined | 'Consume Joined Collections' >> beam.ParDo(
        self._UngroupAndReiterate(), self.iterations)

    # pylint: disable=expression-not-assigned
    (consumed | 'Measure time: End' >> beam.ParDo(
        MeasureTime(self.metrics_namespace)))
def testSideInput(self):
    """Run the side-input load test and publish its metrics.

    A main synthetic input is combined with a second synthetic input that
    is supplied as an iterable side input (``AsIter``). Both inputs, and
    the merged output, are passed through ``MeasureTime`` steps. The
    result is handed to ``self.metrics_monitor`` when one is configured.

    The pipeline is run exactly once, by the explicit ``run()`` call below.
    It is deliberately not wrapped in ``with self.pipeline``: leaving a
    pipeline ``with`` block runs the pipeline as well, so combining it with
    an explicit ``run()`` executed the whole load test twice.
    """
    def join_fn(element, side_input, iterations):
        """Iterate the side input ``iterations`` times and join the last pass.

        On the final pass one ``{key: element[1] + value}`` dict is
        collected per side-input pair; the collected list is yielded as a
        single output element.
        """
        # Named ``joined`` rather than ``list`` so the builtin is not shadowed.
        joined = []
        for i in range(iterations):
            for key, value in side_input:
                if i == iterations - 1:
                    joined.append({key: element[1] + value})
        yield joined

    pipeline = self.pipeline

    main_input = (
        pipeline
        | "Read pcoll 1" >> beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self._parseTestPipelineOptions()))
        | 'Measure time: Start pcoll 1' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))

    side_input = (
        pipeline
        | "Read pcoll 2" >> beam.io.Read(
            synthetic_pipeline.SyntheticSource(self._getSideInput()))
        | 'Measure time: Start pcoll 2' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))

    # pylint: disable=expression-not-assigned
    (
        main_input
        | "Merge" >> beam.ParDo(join_fn, AsIter(side_input), self.iterations)
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace)))

    result = pipeline.run()
    result.wait_until_finish()

    if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
def testParDo(self):
    """Assemble a ParDo load-test pipeline that exercises metric counters.

    A synthetic input is read and timed, then passed through
    ``self.iterations`` consecutive ``CounterOperation`` steps and a final
    ``MeasureTime`` step.

    This method only builds the pipeline; it does not call ``run()``.
    """
    class CounterOperation(beam.DoFn):
        """DoFn that increments every counter a fixed number of times."""

        def __init__(self, number_of_counters, number_of_operations):
            self.number_of_operations = number_of_operations
            self.counters = [
                Metrics.counter('do-not-publish', 'name-{}'.format(i))
                for i in range(number_of_counters)
            ]

        def process(self, element):
            # Each element triggers number_of_operations increments of
            # every counter before being emitted unchanged.
            for _ in range(self.number_of_operations):
                for counter in self.counters:
                    counter.inc()
            yield element

    source = synthetic_pipeline.SyntheticSource(
        self.parseTestPipelineOptions())
    records = self.pipeline | 'Read synthetic' >> beam.io.Read(source)
    pc = records | 'Measure time: Start' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))

    # Chain one CounterOperation step per iteration.
    for i in range(self.iterations):
        pc = pc | 'Step: %d' % i >> beam.ParDo(
            CounterOperation(
                self.number_of_counters, self.number_of_operations))

    # pylint: disable=expression-not-assigned
    (pc | 'Measure time: End' >> beam.ParDo(
        MeasureTime(self.metrics_namespace)))
def testParDo(self):
    """Run the ParDo load test and log its counters and distributions.

    A synthetic input is read, passed through ``MeasureTime()`` and then
    through ``num_runs`` consecutive ``self._GetElement()`` steps
    (``num_runs`` is ``int(self.iterations)``, or 1 when
    ``self.iterations`` is ``None``). When ``self.output`` is set, the
    result is also written with ``WriteToText``.

    The pipeline is run exactly once, by the explicit ``run()`` call below.
    It is deliberately not wrapped in ``with self.pipeline``: leaving a
    pipeline ``with`` block runs the pipeline as well, so combining it with
    an explicit ``run()`` executed the whole load test twice.
    """
    num_runs = 1 if self.iterations is None else int(self.iterations)

    pipeline = self.pipeline
    pc = (
        pipeline
        | 'Read synthetic' >> beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self.parseTestPipelineOptions()))
        | 'Measure time' >> beam.ParDo(MeasureTime()))

    for i in range(num_runs):
        label = 'Step: %d' % i
        pc = pc | label >> beam.ParDo(self._GetElement())

    if self.output is not None:
        # pylint: disable=expression-not-assigned
        (pc | "Write" >> beam.io.WriteToText(self.output))

    result = pipeline.run()
    result.wait_until_finish()

    metrics = result.metrics().query()
    for counter in metrics['counters']:
        logging.info("Counter: %s", counter)
    for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
def testParDo(self):
    """Assemble the ParDo load-test pipeline.

    A synthetic input is read and timed, then passed through ``num_runs``
    consecutive ``_GetElement`` steps; only the last step is told to emit
    its elements. When ``self.output`` is truthy the result is written with
    ``WriteToText`` before the final ``MeasureTime`` step is attached.

    This method only builds the pipeline; it does not call ``run()``.
    """
    class _GetElement(beam.DoFn):
        from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

        @count_bytes
        def process(self, element, namespace, is_returning):
            # Elements are emitted only when the step was asked to return.
            if is_returning:
                yield element

    # A falsy ``iterations`` (None, 0, empty) means a single step.
    num_runs = int(self.iterations) if self.iterations else 1

    source = synthetic_pipeline.SyntheticSource(
        self.parseTestPipelineOptions())
    records = self.pipeline | 'Read synthetic' >> beam.io.Read(source)
    pc = records | 'Measure time: Start' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))

    last_step = num_runs - 1
    for i in range(num_runs):
        is_returning = i == last_step
        pc = pc | 'Step: %d' % i >> beam.ParDo(
            _GetElement(), self.metrics_namespace, is_returning)

    if self.output:
        pc = pc | "Write" >> beam.io.WriteToText(self.output)

    # pylint: disable=expression-not-assigned
    (pc | 'Measure time: End' >> beam.ParDo(
        MeasureTime(self.metrics_namespace)))
def test_synthetic_source_split_uneven(self):
    """Check that a 'zipf'-configured source splits into 10 valid bundles.

    The source is split with a desired bundle size of 100; the test expects
    exactly 10 splits and verifies them against the unsplit source with
    ``assert_sources_equal_reference_source``.
    """
    spec = input_spec(1000, 1, 1, 'zipf', 3, 10)
    source = synthetic_pipeline.SyntheticSource(spec)

    sources_info = [
        (bundle.source, bundle.start_position, bundle.stop_position)
        for bundle in source.split(100)
    ]

    self.assertEqual(10, len(sources_info))

    reference = (source, None, None)
    source_test_utils.assert_sources_equal_reference_source(
        reference, sources_info)
def test_synthetic_source(self):
    """Read a synthetic source and check element sizes and element count.

    The first item of every element must have length 5, the second item
    length 15, and the source must produce 300 elements in total.
    """
    def assert_size(element, expected_size):
        assert len(element) == expected_size

    with beam.Pipeline() as p:
        source = synthetic_pipeline.SyntheticSource(input_spec(300, 5, 15))
        pcoll = p | beam.io.Read(source)

        # NOTE: the two unlabeled Map steps below must stay on separate
        # source lines; their default labels are derived from the lambda's
        # location and would collide otherwise.
        keys = pcoll | beam.Map(lambda elm: elm[0])
        keys | 'key' >> beam.Map(assert_size, 5)  # pylint: disable=expression-not-assigned

        values = pcoll | beam.Map(lambda elm: elm[1])
        values | 'value' >> beam.Map(assert_size, 15)  # pylint: disable=expression-not-assigned

        total = pcoll | beam.combiners.Count.Globally()
        assert_that(total, equal_to([300]))
def testGroupByKey(self):
    """Assemble the GroupByKey load-test pipeline with ``self.fanout`` branches.

    A synthetic input is read and timed once. Each branch then applies
    ``GroupByKey``, consumes the groups with ``self._UngroupAndReiterate()``
    (given ``self.iterations`` as an extra argument) and attaches a final
    ``MeasureTime`` step. Branch labels carry the branch index so that
    every step label is unique.

    This method only builds the pipeline; it does not call ``run()``.
    """
    # Named ``pcoll`` rather than ``input`` so the builtin is not shadowed.
    pcoll = (
        self.pipeline
        | beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self.parseTestPipelineOptions()))
        | 'Measure time: Start' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))

    for branch in range(self.fanout):
        # pylint: disable=expression-not-assigned
        (
            pcoll
            | 'GroupByKey %i' % branch >> beam.GroupByKey()
            | 'Ungroup %i' % branch >> beam.ParDo(
                self._UngroupAndReiterate(), self.iterations)
            | 'Measure time: End %i' % branch >> beam.ParDo(
                MeasureTime(self.metrics_namespace)))
def testCombineGlobally(self):
    """Assemble the CombineGlobally load-test pipeline with ``self.fanout`` branches.

    A synthetic input is read and timed once. Each branch then applies
    ``CombineGlobally`` with ``TopCombineFn(1000)``, passes the result to
    ``self._GetElement()`` and attaches a final ``MeasureTime`` step.
    Branch labels carry the branch index so that every step label is
    unique.

    This method only builds the pipeline; it does not call ``run()``.
    """
    # Named ``pcoll`` rather than ``input`` so the builtin is not shadowed.
    pcoll = (
        self.pipeline
        | beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self.parseTestPipelineOptions()))
        | 'Measure time: Start' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))

    for branch in range(self.fanout):
        # pylint: disable=expression-not-assigned
        (
            pcoll
            | 'Combine with Top %i' % branch >> beam.CombineGlobally(
                beam.combiners.TopCombineFn(1000))
            | 'Consume %i' % branch >> beam.ParDo(self._GetElement())
            | 'Measure time: End %i' % branch >> beam.ParDo(
                MeasureTime(self.metrics_namespace)))
def testParDo(self):
    """Run the ParDo load test and publish its metrics.

    A synthetic input is read and timed, then passed through ``num_runs``
    consecutive ``_GetElement`` steps (``num_runs`` is
    ``int(self.iterations)``, or 1 when ``self.iterations`` is ``None``);
    only the last step is told to emit its elements. When ``self.output``
    is set the result is written with ``WriteToText`` before the final
    ``MeasureTime`` step. The result is handed to ``self.metrics_monitor``
    when one is configured.

    The pipeline is run exactly once, by the explicit ``run()`` call below.
    It is deliberately not wrapped in ``with self.pipeline``: leaving a
    pipeline ``with`` block runs the pipeline as well, so combining it with
    an explicit ``run()`` executed the whole load test twice.
    """
    class _GetElement(beam.DoFn):
        from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

        @count_bytes(COUNTER_LABEL)
        def process(self, element, namespace, is_returning):
            # Elements are emitted only when the step was asked to return.
            if is_returning:
                yield element

    num_runs = 1 if self.iterations is None else int(self.iterations)

    pipeline = self.pipeline
    pc = (
        pipeline
        | 'Read synthetic' >> beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self.parseTestPipelineOptions()))
        | 'Measure time: Start' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))

    for i in range(num_runs):
        is_returning = (i == (num_runs - 1))
        pc = pc | 'Step: %d' % i >> beam.ParDo(
            _GetElement(), self.metrics_namespace, is_returning)

    if self.output is not None:
        pc = pc | "Write" >> beam.io.WriteToText(self.output)

    # pylint: disable=expression-not-assigned
    (pc | 'Measure time: End' >> beam.ParDo(
        MeasureTime(self.metrics_namespace)))

    result = pipeline.run()
    result.wait_until_finish()

    if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
def testCombineGlobally(self):
    """Run the CombineGlobally load test and log distribution metrics.

    A synthetic input is read, passed through ``MeasureTime()``, combined
    globally with ``TopCombineFn(1000)`` and consumed by
    ``self._GetElement()``.

    The pipeline is run exactly once, by the explicit ``run()`` call below.
    It is deliberately not wrapped in ``with self.pipeline``: leaving a
    pipeline ``with`` block runs the pipeline as well, so combining it with
    an explicit ``run()`` executed the whole load test twice.
    """
    pipeline = self.pipeline

    # pylint: disable=expression-not-assigned
    (
        pipeline
        | beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self.parseTestPipelineOptions()))
        | 'Measure time' >> beam.ParDo(MeasureTime())
        | 'Combine with Top' >> beam.CombineGlobally(
            beam.combiners.TopCombineFn(1000))
        | 'Consume' >> beam.ParDo(self._GetElement()))

    result = pipeline.run()
    result.wait_until_finish()

    # Only the distribution metrics are reported by this test.
    metrics = result.metrics().query()
    for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
def testGroupByKey(self):
    """Run the GroupByKey load test and publish its metrics.

    A synthetic input is read and timed, grouped by key, flattened back
    into ``(key, value)`` pairs and timed again. The pipeline is then run
    to completion, and the result is handed to ``self.metrics_monitor``
    when one is configured.
    """
    source = synthetic_pipeline.SyntheticSource(
        self.parseTestPipelineOptions())
    records = self.pipeline | beam.io.Read(source)
    timed = records | 'Measure time: Start' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))

    grouped = timed | 'GroupByKey' >> beam.GroupByKey()
    # Expand every (key, values) group back into one pair per value.
    ungrouped = grouped | 'Ungroup' >> beam.FlatMap(
        lambda elm: [(elm[0], v) for v in elm[1]])

    # pylint: disable=expression-not-assigned
    (ungrouped | 'Measure time: End' >> beam.ParDo(
        MeasureTime(self.metrics_namespace)))

    result = self.pipeline.run()
    result.wait_until_finish()

    monitor = self.metrics_monitor
    if monitor is not None:
        monitor.send_metrics(result)
def testCombineGlobally(self):
    """Run the CombineGlobally load test and publish its metrics.

    A synthetic input is read and timed, combined globally with
    ``TopCombineFn(1000)``, consumed by ``self._GetElement()`` and timed
    again. The pipeline is then run to completion, and the result is
    handed to ``self.metrics_monitor`` when one is configured.
    """
    source = synthetic_pipeline.SyntheticSource(
        self.parseTestPipelineOptions())
    records = self.pipeline | beam.io.Read(source)
    timed = records | 'Measure time: Start' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))

    combined = timed | 'Combine with Top' >> beam.CombineGlobally(
        beam.combiners.TopCombineFn(1000))
    consumed = combined | 'Consume' >> beam.ParDo(self._GetElement())

    # pylint: disable=expression-not-assigned
    (consumed | 'Measure time: End' >> beam.ParDo(
        MeasureTime(self.metrics_namespace)))

    result = self.pipeline.run()
    result.wait_until_finish()

    monitor = self.metrics_monitor
    if monitor is not None:
        monitor.send_metrics(result)
def testGroupByKey(self):
    """Run the GroupByKey load test and log distribution metrics.

    A synthetic input is read, passed through ``MeasureTime()``, grouped by
    key and flattened back into ``(key, value)`` pairs.

    The pipeline is run exactly once, by the explicit ``run()`` call below.
    It is deliberately not wrapped in ``with self.pipeline``: leaving a
    pipeline ``with`` block runs the pipeline as well, so combining it with
    an explicit ``run()`` executed the whole load test twice.
    """
    pipeline = self.pipeline

    # pylint: disable=expression-not-assigned
    (
        pipeline
        | beam.io.Read(
            synthetic_pipeline.SyntheticSource(
                self.parseTestPipelineOptions()))
        | 'Measure time' >> beam.ParDo(MeasureTime())
        | 'GroupByKey' >> beam.GroupByKey()
        # Expand every (key, values) group back into one pair per value.
        | 'Ungroup' >> beam.FlatMap(
            lambda elm: [(elm[0], v) for v in elm[1]]))

    result = pipeline.run()
    result.wait_until_finish()

    # Only the distribution metrics are reported by this test.
    metrics = result.metrics().query()
    for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
def test_split_at_fraction(self):
    """Exercise split-at-fraction behaviour of a 10-record synthetic source.

    Runs the exhaustive split check, then asserts that splitting at
    fraction 0.3 fails after 5 records have been read and succeeds
    consistently after 1 record has been read.
    """
    spec = input_spec(10, 1, 1)
    source = synthetic_pipeline.SyntheticSource(spec)

    source_test_utils.assert_split_at_fraction_exhaustive(source)

    fraction = 0.3
    source_test_utils.assert_split_at_fraction_fails(source, 5, fraction)
    source_test_utils.assert_split_at_fraction_succeeds_and_consistent(
        source, 1, fraction)