def test_using_fast_impl(self):
  """Smoke test that the compiled (Cython) coder modules are importable.

  Skips (rather than fails) when the Cython extensions are not built,
  signalled by `utils.check_compiled` raising RuntimeError.
  """
  try:
    utils.check_compiled('apache_beam.coders')
  except RuntimeError:
    self.skipTest('Cython is not installed')
  # Import placed after the skip guard so an uncompiled environment skips
  # instead of erroring on import.
  # pylint: disable=wrong-import-order, wrong-import-position
  # pylint: disable=unused-variable
  import apache_beam.coders.stream
  # NOTE(review): fragment — the enclosing function header and its loop over
  # benchmark runs are outside this chunk (the sibling copy of this code
  # starts at `for i in range(num_runs):`); indentation below reconstructs
  # that structure — confirm against the full file.
    counter_factory = CounterFactory()
    state_sampler = statesampler.StateSampler('basic', counter_factory)
    state_sampler.start()
    # Time one full pass over the fake sources under a scoped sampler state.
    with state_sampler.scoped_state('step1', 'state'):
      # The second assignment replaces the side-input read counter with a
      # no-op counter — presumably deliberate, to measure iteration cost
      # without counter overhead; note SideInputReadCounter's construction
      # may still register counters in counter_factory as a side effect.
      si_counter = opcounters.SideInputReadCounter(
          counter_factory, state_sampler, 'step1', 1)
      si_counter = opcounters.NoOpTransformIOCounter()
      sources = [
          FakeSource(long_generator(i, input_per_source))
          for i in range(num_sources)]
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, read_counter=si_counter)
      start = time.time()
      # Drain the iterator; the materialized list itself is discarded.
      list(iterator_fn())
      time_cost = time.time() - start
      times.append(time_cost)
    state_sampler.stop()

  print("Runtimes:", times)

  avg_runtime = sum(times) / len(times)
  print("Average runtime:", avg_runtime)
  print("Time per element:", avg_runtime / (input_per_source * num_sources))


if __name__ == '__main__':
  utils.check_compiled(
      'apache_beam.runners.worker.opcounters')
  run_benchmark()
for i in range(num_input): values.append(random.randint(lower_bound, upper_bound)) return values def run_benchmark(num_runs=100, num_input=10000, seed=time.time()): total_time = 0 random.seed(seed) lower_bound = 0 upper_bound = sys.maxsize inputs = generate_input_values(num_input, lower_bound, upper_bound) from apache_beam.transforms import DataflowDistributionCounter print("Number of runs:", num_runs) print("Input size:", num_input) print("Input sequence from %d to %d" % (lower_bound, upper_bound)) print("Random seed:", seed) for i in range(num_runs): counter = DataflowDistributionCounter() start = time.time() counter.add_inputs_for_test(inputs) time_cost = time.time() - start print("Run %d: Total time cost %g sec" % (i+1, time_cost)) total_time += time_cost / num_input print("Per element update time cost:", total_time / num_runs) if __name__ == '__main__': utils.check_compiled( 'apache_beam.transforms.cy_dataflow_distribution_counter') run_benchmark()
coders.GlobalWindowCoder()), globally_windowed_value), coder_benchmark_factory( coders.LengthPrefixCoder(coders.FastPrimitivesCoder()), small_int) ] suite = [utils.BenchmarkConfig(b, input_size, num_runs) for b in benchmarks if re.search(filter_regex, b.__name__, flags=re.I)] utils.run_benchmarks(suite, verbose=verbose) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--filter', default='.*') parser.add_argument('--num_runs', default=20, type=int) parser.add_argument('--num_elements_per_benchmark', default=1000, type=int) parser.add_argument('--seed', default=42, type=int) options = parser.parse_args() utils.check_compiled("apache_beam.coders.coder_impl") num_runs = 20 num_elements_per_benchmark = 1000 seed = 42 # Fix the seed for better consistency run_coder_benchmarks( options.num_runs, options.num_elements_per_benchmark, options.seed, verbose=True, filter_regex=options.filter)
def setUp(self):
  """Skip every test in this case when the Cython coders are not built.

  `utils.check_compiled` raises RuntimeError for an uncompiled module, which
  is translated into a test skip rather than a failure.
  """
  try:
    utils.check_compiled('apache_beam.coders')
  except RuntimeError:
    self.skipTest('Cython is not installed')
for i in range(num_runs): counter_factory = CounterFactory() state_sampler = statesampler.StateSampler('basic', counter_factory) state_sampler.start() with state_sampler.scoped_state('step1', 'state'): si_counter = opcounters.SideInputReadCounter( counter_factory, state_sampler, 'step1', 1) si_counter = opcounters.NoOpTransformIOCounter() sources = [ FakeSource(long_generator(i, input_per_source)) for i in range(num_sources) ] iterator_fn = sideinputs.get_iterator_fn_for_sources( sources, read_counter=si_counter) start = time.time() list(iterator_fn()) time_cost = time.time() - start times.append(time_cost) state_sampler.stop() print("Runtimes:", times) avg_runtime = sum(times) // len(times) print("Average runtime:", avg_runtime) print("Time per element:", avg_runtime // (input_per_source * num_sources)) if __name__ == '__main__': utils.check_compiled('apache_beam.runners.worker.opcounters') run_benchmark()
import apache_beam as beam
from apache_beam.tools import utils
from scipy import stats


def run_benchmark(num_maps=100, num_runs=10, num_elements_step=1000):
  """Time chains of FlatMaps over growing inputs and fit a linear model.

  Each trial builds a pipeline of `num_maps` chained FlatMap stages over an
  input whose size grows with the trial index, then regresses wall time
  against input size to separate fixed from per-element cost.
  """
  elapsed = {}
  for trial in range(num_runs):
    num_elements = 1 + trial * num_elements_step
    t0 = time.time()
    with beam.Pipeline() as pipeline:
      stage_out = pipeline | beam.Create(list(range(num_elements)))
      for stage in range(num_maps):
        stage_out = stage_out | 'Map%d' % stage >> beam.FlatMap(
            lambda x: (None,))
    duration = time.time() - t0
    elapsed[num_elements] = duration
    suffix = " " if num_elements == 1 else "s"
    print("%6d element%s %g sec" % (num_elements, suffix, duration))

  print()
  sizes, seconds = zip(*elapsed.items())
  # pylint: disable=unused-variable
  gradient, intercept, r_value, p_value, std_err = stats.linregress(
      sizes, seconds)
  print("Fixed cost ", intercept)
  print("Per-element ", gradient / num_maps)
  print("R^2 ", r_value**2)


if __name__ == '__main__':
  utils.check_compiled('apache_beam.runners.common')
  run_benchmark()
from apache_beam.tools import utils
from scipy import stats


def run_benchmark(num_maps=100, num_runs=10, num_elements_step=1000):
  """Time chains of FlatMaps over growing inputs and fit a linear model.

  Each run builds a pipeline of `num_maps` chained FlatMap stages over an
  input whose size grows with the run index, then regresses wall time
  against input size to separate fixed from per-element cost.
  """
  # NOTE(review): `beam` is bound outside this chunk — presumably
  # `import apache_beam as beam` at the top of the file; confirm.
  timings = {}
  for run in range(num_runs):
    num_elements = num_elements_step * run + 1
    start = time.time()
    with beam.Pipeline() as p:
      pc = p | beam.Create(list(range(num_elements)))
      for ix in range(num_maps):
        pc = pc | 'Map%d' % ix >> beam.FlatMap(lambda x: (None, ))
    timings[num_elements] = time.time() - start
    print("%6d element%s %g sec" % (num_elements,
                                    " " if num_elements == 1 else "s",
                                    timings[num_elements]))

  print()
  # pylint: disable=unused-variable
  gradient, intercept, r_value, p_value, std_err = stats.linregress(
      *list(zip(*list(timings.items()))))
  print("Fixed cost ", intercept)
  print("Per-element ", gradient / num_maps)
  print("R^2 ", r_value**2)


if __name__ == '__main__':
  utils.check_compiled('apache_beam.runners.common')
  run_benchmark()
values.append(random.randint(lower_bound, upper_bound)) return values def run_benchmark(num_runs=100, num_input=10000, seed=time.time()): total_time = 0 random.seed(seed) lower_bound = 0 upper_bound = sys.maxsize inputs = generate_input_values(num_input, lower_bound, upper_bound) from apache_beam.transforms import DataflowDistributionCounter print("Number of runs:", num_runs) print("Input size:", num_input) print("Input sequence from %d to %d" % (lower_bound, upper_bound)) print("Random seed:", seed) for i in range(num_runs): counter = DataflowDistributionCounter() start = time.time() counter.add_inputs_for_test(inputs) time_cost = time.time() - start print("Run %d: Total time cost %g sec" % (i + 1, time_cost)) total_time += time_cost / num_input print("Per element update time cost:", total_time / num_runs) if __name__ == '__main__': logging.basicConfig() utils.check_compiled( 'apache_beam.transforms.cy_dataflow_distribution_counter') run_benchmark()
# pylint: disable=unused-variable for i in range(num_input): values.append(random.randint(lower_bound, upper_bound)) return values def run_benchmark(num_runs=100, num_input=10000, seed=time.time()): total_time = 0 random.seed(seed) lower_bound = 0 upper_bound = sys.maxint inputs = generate_input_values(num_input, lower_bound, upper_bound) print("Number of runs:", num_runs) print("Input size:", num_input) print("Input sequence from %d to %d" % (lower_bound, upper_bound)) print("Random seed:", seed) for i in range(num_runs): counter = DistributionAccumulator() start = time.time() for value in inputs: counter.add_input(value) time_cost = time.time() - start print("Run %d: Total time cost %g sec" % (i + 1, time_cost)) total_time += time_cost / num_input print("Per element update time cost:", total_time / num_runs) if __name__ == '__main__': utils.check_compiled('apache_beam.transforms.cy_combiners') run_benchmark()