Example #1
  def test_using_fast_impl(self):
    try:
      utils.check_compiled('apache_beam.coders')
    except RuntimeError:
      self.skipTest('Cython is not installed')
    # pylint: disable=wrong-import-order, wrong-import-position
    # pylint: disable=unused-variable
    import apache_beam.coders.stream
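
For reference, the same guard works outside a test case. A minimal standalone sketch, assuming only what the test above relies on: utils.check_compiled raises RuntimeError when the named module has no compiled (Cython) counterpart.

from apache_beam.tools import utils

try:
  utils.check_compiled('apache_beam.coders')
except RuntimeError:
  # Fall back to the pure-Python implementations.
  print('Cython is not installed; using the slow coder implementations.')
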
Example #2

def run_benchmark(num_runs=50, input_per_source=4000, num_sources=4):
  # Enclosing signature and loop reconstructed from context; the defaults
  # shown here are assumptions.
  times = []
  for _ in range(num_runs):
    counter_factory = CounterFactory()
    state_sampler = statesampler.StateSampler('basic', counter_factory)
    state_sampler.start()
    with state_sampler.scoped_state('step1', 'state'):
      si_counter = opcounters.SideInputReadCounter(
          counter_factory, state_sampler, 'step1', 1)
      # The no-op counter replaces the real one, so counter overhead is
      # excluded from the measurement.
      si_counter = opcounters.NoOpTransformIOCounter()
      sources = [
          FakeSource(long_generator(i, input_per_source))
          for i in range(num_sources)]
      iterator_fn = sideinputs.get_iterator_fn_for_sources(
          sources, read_counter=si_counter)
      start = time.time()
      list(iterator_fn())
      time_cost = time.time() - start
      times.append(time_cost)
    state_sampler.stop()

  print("Runtimes:", times)

  avg_runtime = sum(times) / len(times)
  print("Average runtime:", avg_runtime)
  print("Time per element:", avg_runtime / (input_per_source *
                                            num_sources))


if __name__ == '__main__':
  utils.check_compiled(
      'apache_beam.runners.worker.opcounters')
  run_benchmark()
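
time.time() is a wall clock and can have coarse resolution on some platforms. If one wanted a monotonic high-resolution timer instead, the timing loop could be built on time.perf_counter, as in this standalone sketch (the benchmarked callable is a stand-in):

import time

def time_runs(fn, num_runs=50):
  # Collect one measurement per run, as the benchmark above does, but with
  # the standard library's monotonic high-resolution clock.
  times = []
  for _ in range(num_runs):
    start = time.perf_counter()
    fn()
    times.append(time.perf_counter() - start)
  return times

print(time_runs(lambda: sum(range(10000))))
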
Example #3

def generate_input_values(num_input, lower_bound, upper_bound):
  # Function head reconstructed from the call site in run_benchmark below.
  values = []
  for i in range(num_input):
    values.append(random.randint(lower_bound, upper_bound))
  return values


def run_benchmark(num_runs=100, num_input=10000, seed=None):
  if seed is None:
    seed = time.time()  # evaluate per call, not once at import time
  total_time = 0
  random.seed(seed)
  lower_bound = 0
  upper_bound = sys.maxsize
  inputs = generate_input_values(num_input, lower_bound, upper_bound)
  from apache_beam.transforms import DataflowDistributionCounter
  print("Number of runs:", num_runs)
  print("Input size:", num_input)
  print("Input sequence from %d to %d" % (lower_bound, upper_bound))
  print("Random seed:", seed)
  for i in range(num_runs):
    counter = DataflowDistributionCounter()
    start = time.time()
    counter.add_inputs_for_test(inputs)
    time_cost = time.time() - start
    print("Run %d: Total time cost %g sec" % (i+1, time_cost))
    total_time += time_cost / num_input
  print("Per element update time cost:", total_time / num_runs)


if __name__ == '__main__':
  utils.check_compiled(
      'apache_beam.transforms.cy_dataflow_distribution_counter')
  run_benchmark()
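
The per-element figure can also be obtained with the standard-library timeit module. A self-contained sketch, with a do-nothing stand-in for the counter (DataflowDistributionCounter itself is not constructed here):

import random
import timeit

inputs = [random.randint(0, 2**32) for _ in range(10000)]

def add_inputs(values):
  # Stand-in for counter.add_inputs_for_test(inputs).
  for v in values:
    pass

num_runs = 100
total = timeit.timeit(lambda: add_inputs(inputs), number=num_runs)
print("Per element update time cost:", total / num_runs / len(inputs))
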
Example #4
                                    coders.GlobalWindowCoder()),
          globally_windowed_value),
      coder_benchmark_factory(
          coders.LengthPrefixCoder(coders.FastPrimitivesCoder()),
          small_int)
  ]

  suite = [utils.BenchmarkConfig(b, input_size, num_runs) for b in benchmarks
           if re.search(filter_regex, b.__name__, flags=re.I)]
  utils.run_benchmarks(suite, verbose=verbose)


if __name__ == "__main__":

  parser = argparse.ArgumentParser()
  parser.add_argument('--filter', default='.*')
  parser.add_argument('--num_runs', default=20, type=int)
  parser.add_argument('--num_elements_per_benchmark', default=1000, type=int)
  parser.add_argument('--seed', default=42, type=int)
  options = parser.parse_args()

  utils.check_compiled("apache_beam.coders.coder_impl")

  run_coder_benchmarks(
      options.num_runs, options.num_elements_per_benchmark, options.seed,
      verbose=True, filter_regex=options.filter)
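
The --filter flag feeds re.search with re.I, so benchmarks are selected by a case-insensitive match against their __name__. A self-contained illustration with made-up benchmark names:

import re

def benchmark_small_int(): pass
def benchmark_windowed_value(): pass

benchmarks = [benchmark_small_int, benchmark_windowed_value]
filter_regex = 'INT'  # matching is case-insensitive
selected = [b for b in benchmarks
            if re.search(filter_regex, b.__name__, flags=re.I)]
print([b.__name__ for b in selected])  # ['benchmark_small_int']
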
Example #5
  def setUp(self):
    try:
      utils.check_compiled('apache_beam.coders')
    except RuntimeError:
      self.skipTest('Cython is not installed')
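
The same skip can be expressed declaratively at class level. A sketch using unittest.skipIf and a hypothetical probe helper (the helper and test class names are made up):

import unittest

from apache_beam.tools import utils

def _compiled(module):
  # Hypothetical helper: turn check_compiled's RuntimeError into a boolean.
  try:
    utils.check_compiled(module)
    return True
  except RuntimeError:
    return False

@unittest.skipIf(not _compiled('apache_beam.coders'), 'Cython is not installed')
class FastCodersTest(unittest.TestCase):
  def test_something(self):
    pass
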
Example #6

def run_benchmark(num_runs=50, input_per_source=4000, num_sources=4):
    # Enclosing signature and setup reconstructed from context; the defaults
    # shown here are assumptions.
    times = []
    for i in range(num_runs):
        counter_factory = CounterFactory()
        state_sampler = statesampler.StateSampler('basic', counter_factory)
        state_sampler.start()
        with state_sampler.scoped_state('step1', 'state'):
            si_counter = opcounters.SideInputReadCounter(
                counter_factory, state_sampler, 'step1', 1)
            # The no-op counter replaces the real one, so counter overhead
            # is excluded from the measurement.
            si_counter = opcounters.NoOpTransformIOCounter()
            sources = [
                FakeSource(long_generator(i, input_per_source))
                for i in range(num_sources)
            ]
            iterator_fn = sideinputs.get_iterator_fn_for_sources(
                sources, read_counter=si_counter)
            start = time.time()
            list(iterator_fn())
            time_cost = time.time() - start
            times.append(time_cost)
        state_sampler.stop()

    print("Runtimes:", times)

    # Use true division; `//` would floor the averages.
    avg_runtime = sum(times) / len(times)
    print("Average runtime:", avg_runtime)
    print("Time per element:", avg_runtime / (input_per_source * num_sources))


if __name__ == '__main__':
    utils.check_compiled('apache_beam.runners.worker.opcounters')
    run_benchmark()
Example #7

import time

import apache_beam as beam
from apache_beam.tools import utils
from scipy import stats


def run_benchmark(num_maps=100, num_runs=10, num_elements_step=1000):
  timings = {}
  for run in range(num_runs):
    num_elements = num_elements_step * run + 1
    start = time.time()
    with beam.Pipeline() as p:
      pc = p | beam.Create(list(range(num_elements)))
      for ix in range(num_maps):
        pc = pc | 'Map%d' % ix >> beam.FlatMap(lambda x: (None,))
    timings[num_elements] = time.time() - start
    print("%6d element%s %g sec" % (
        num_elements, " " if num_elements == 1 else "s", timings[num_elements]))

  print()
  # pylint: disable=unused-variable
  gradient, intercept, r_value, p_value, std_err = stats.linregress(
      *list(zip(*list(timings.items()))))
  print("Fixed cost  ", intercept)
  print("Per-element ", gradient / num_maps)
  print("R^2         ", r_value**2)


if __name__ == '__main__':
  utils.check_compiled('apache_beam.runners.common')
  run_benchmark()
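
Why intercept and slope are reported as fixed and per-element cost: linregress fits timing = intercept + gradient * num_elements. A tiny self-contained check on synthetic numbers (the timings are made up):

from scipy import stats

sizes = [1, 1001, 2001, 3001, 4001]
costs = [0.5 + 0.001 * n for n in sizes]  # synthetic: 0.5 s fixed + 1 ms/element

gradient, intercept, r_value, p_value, std_err = stats.linregress(sizes, costs)
print("Fixed cost  ", intercept)   # ~0.5
print("Per-element ", gradient)    # ~0.001
print("R^2         ", r_value**2)  # ~1.0 (exactly linear data)
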
Example #9
import time

import apache_beam as beam
from apache_beam.tools import utils
from scipy import stats


def run_benchmark(num_maps=100, num_runs=10, num_elements_step=1000):
    timings = {}
    for run in range(num_runs):
        num_elements = num_elements_step * run + 1
        start = time.time()
        with beam.Pipeline() as p:
            pc = p | beam.Create(list(range(num_elements)))
            for ix in range(num_maps):
                pc = pc | 'Map%d' % ix >> beam.FlatMap(lambda x: (None,))
        timings[num_elements] = time.time() - start
        print("%6d element%s %g sec" %
              (num_elements, " " if num_elements == 1 else "s",
               timings[num_elements]))

    print()
    # pylint: disable=unused-variable
    gradient, intercept, r_value, p_value, std_err = stats.linregress(
        *list(zip(*list(timings.items()))))
    print("Fixed cost  ", intercept)
    print("Per-element ", gradient / num_maps)
    print("R^2         ", r_value**2)


if __name__ == '__main__':
    utils.check_compiled('apache_beam.runners.common')
    run_benchmark()
Example #10
def generate_input_values(num_input, lower_bound, upper_bound):
    # Function head reconstructed from the call site in run_benchmark below.
    values = []
    for i in range(num_input):
        values.append(random.randint(lower_bound, upper_bound))
    return values


def run_benchmark(num_runs=100, num_input=10000, seed=None):
    if seed is None:
        seed = time.time()  # evaluate per call, not once at import time
    total_time = 0
    random.seed(seed)
    lower_bound = 0
    upper_bound = sys.maxsize
    inputs = generate_input_values(num_input, lower_bound, upper_bound)
    from apache_beam.transforms import DataflowDistributionCounter
    print("Number of runs:", num_runs)
    print("Input size:", num_input)
    print("Input sequence from %d to %d" % (lower_bound, upper_bound))
    print("Random seed:", seed)
    for i in range(num_runs):
        counter = DataflowDistributionCounter()
        start = time.time()
        counter.add_inputs_for_test(inputs)
        time_cost = time.time() - start
        print("Run %d: Total time cost %g sec" % (i + 1, time_cost))
        total_time += time_cost / num_input
    print("Per element update time cost:", total_time / num_runs)


if __name__ == '__main__':
    logging.basicConfig()
    utils.check_compiled(
        'apache_beam.transforms.cy_dataflow_distribution_counter')
    run_benchmark()
Example #11
    def setUp(self):
        try:
            utils.check_compiled('apache_beam.coders')
        except RuntimeError:
            self.skipTest('Cython is not installed')
Example #12
def generate_input_values(num_input, lower_bound, upper_bound):
    # Function head reconstructed from the call site in run_benchmark below.
    # pylint: disable=unused-variable
    values = []
    for i in range(num_input):
        values.append(random.randint(lower_bound, upper_bound))
    return values


def run_benchmark(num_runs=100, num_input=10000, seed=None):
    if seed is None:
        seed = time.time()  # evaluate per call, not once at import time
    total_time = 0
    random.seed(seed)
    lower_bound = 0
    upper_bound = sys.maxsize  # sys.maxint was removed in Python 3
    inputs = generate_input_values(num_input, lower_bound, upper_bound)
    print("Number of runs:", num_runs)
    print("Input size:", num_input)
    print("Input sequence from %d to %d" % (lower_bound, upper_bound))
    print("Random seed:", seed)
    for i in range(num_runs):
        counter = DistributionAccumulator()
        start = time.time()
        for value in inputs:
            counter.add_input(value)
        time_cost = time.time() - start
        print("Run %d: Total time cost %g sec" % (i + 1, time_cost))
        total_time += time_cost / num_input
    print("Per element update time cost:", total_time / num_runs)


if __name__ == '__main__':
    utils.check_compiled('apache_beam.transforms.cy_combiners')
    run_benchmark()
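
Mean-based summaries like the one above are sensitive to outliers (first-run warm-up, GC pauses). Reporting the median alongside the mean is one standard-library option, as in this sketch with made-up run times:

import statistics

times = [0.052, 0.048, 0.251, 0.049, 0.050]  # made-up timings; one outlier
print("Mean run time:  ", statistics.mean(times))    # pulled up by the outlier
print("Median run time:", statistics.median(times))  # robust to it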