def main():
    parser = get_parser()
    args = parser.parse_args(namespace=arguments.SmartNamespace())
    if args.stop_channel is None or args.stop_channel - args.start_channel > 1:
        if '%' not in args.output_file:
            parser.error(
                'More than one channel selected but no %d in output filename')
    configure_logging(args)
    if args.write_profile or args.write_device_profile:
        profiling.Profiler.set_profiler(profiling.FlamegraphProfiler())
    queue = None
    context = None
    if not args.host:
        context = accel.create_some_context(device_filter=lambda x: x.is_cuda)
        queue = context.create_command_queue()
    else:
        context = dummy_context()
        queue = DummyCommandQueue()
    with closing(loader.load(args.input_file, args.input_option,
                             args.start_channel, args.stop_channel)) as dataset:
        frontend.run(args, context, queue, dataset, Writer(args, dataset))
    profiler = profiling.Profiler.get_profiler()
    if args.write_profile:
        with open(args.write_profile, 'w') as f:
            assert isinstance(profiler, profiling.FlamegraphProfiler)
            profiler.write_flamegraph(f)
    if args.write_device_profile:
        with open(args.write_device_profile, 'w') as f:
            assert isinstance(profiler, profiling.FlamegraphProfiler)
            profiler.write_device_flamegraph(f)
def main():
    ctx = create_some_context()
    queue = ctx.create_command_queue()
    op = SumTemplate(ctx).instantiate(queue, 1024)
    op.ensure_all_bound()
    # Fill the source buffer with random integers and run the operation
    src = np.random.randint(1, 100, size=op.buffer('src').shape).astype(np.int32)
    op.buffer('src').set(queue, src)
    op()
    dest = op.buffer('dest').get(queue)
    # Check against a host-side reduction: each output element is the sum of
    # one work group's worth (wgs) of consecutive input elements.
    wgs = op.template.wgs
    expected = src.reshape(-1, wgs).sum(axis=1)
    np.testing.assert_equal(dest, expected)
    print(dest)
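# For reference, a host-only sketch of the reduction the test above verifies.
# Assumption: the kernel sums contiguous groups of `wgs` elements; `wgs` and the
# shapes are taken from the example itself, not from the SumTemplate API.
def host_partial_sums(src, wgs):
    """Sum contiguous groups of `wgs` elements, mirroring the expected output."""
    assert src.size % wgs == 0
    return src.reshape(-1, wgs).sum(axis=1)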
def benchmark_fft(args):
    context = accel.create_some_context()
    queue = context.create_tuning_command_queue()
    allocator = accel.SVMAllocator(context)
    shape = (args.pixels, args.pixels)
    template = fft.FftTemplate(queue, 2, shape, np.complex64, np.complex64, shape, shape)
    fn = template.instantiate(args.mode, allocator=allocator)
    fn.ensure_all_bound()
    # Zero-fill, just to ensure no NaNs etc
    fn.buffer('src').fill(0)
    fn.buffer('dest').fill(0)
    fn()    # Warm-up; also forces data transfer
    queue.start_tuning()
    fn()
    elapsed = queue.stop_tuning()
    print('{pixels}x{pixels} in {elapsed:.6f} seconds'.format(
        pixels=args.pixels, elapsed=elapsed))
    # 8 bytes for complex64, 4 accesses (from source, to/from scratch, to dest)
    mem_rate = args.pixels * args.pixels * 8 * 4 / elapsed
    print('{:.3f} GiB/s'.format(mem_rate / 1024**3))
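# The GiB/s figure above is just (elements * bytes * accesses) / time. A
# standalone restatement of that arithmetic (hypothetical helper, pure Python,
# keeping the same assumption of 4 global-memory accesses per complex64 value):
def fft_effective_rate_gib(pixels, elapsed, bytes_per_value=8, accesses=4):
    """Effective memory rate in GiB/s for an out-of-place 2D complex64 FFT."""
    return pixels * pixels * bytes_per_value * accesses / elapsed / 1024**3

# e.g. an 8192x8192 transform in 25 ms moves 2 GiB, i.e. 80.0 GiB/s:
# fft_effective_rate_gib(8192, 0.025) -> 80.0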
def benchmark_grid_degrid(args):
    n_time = 3600
    add_parameters(args)
    N = n_time * len(args.antennas) * (len(args.antennas) - 1) // 2
    reader = make_compressed_vis(args, n_time)
    context = accel.create_some_context()
    queue = context.create_tuning_command_queue()
    gridder_template = args.template_class(context, args.image_parameters,
                                           args.grid_parameters, tuning=args.tuning)
    gridder = gridder_template.instantiate(queue, args.array_parameters, N)
    gridder.ensure_all_bound()
    elapsed = 0.0
    N_compressed = 0
    uv = gridder.buffer('uv').empty_like()
    w_plane = gridder.buffer('w_plane').empty_like()
    vis = gridder.buffer('vis').empty_like()
    for w_slice in range(reader.num_w_slices(0)):
        gridder.num_vis = reader.len(0, w_slice)
        N_compressed += gridder.num_vis
        if gridder.num_vis > 0:
            gridder.buffer('grid').zero(queue)
            start = 0
            for chunk in reader.iter_slice(0, w_slice):
                rng = slice(start, start + len(chunk))
                uv[rng, 0:2] = chunk['uv']
                uv[rng, 2:4] = chunk['sub_uv']
                w_plane[rng] = chunk['w_plane']
                vis[rng] = chunk['vis']
                start += len(chunk)
            gridder.buffer('uv').set_async(queue, uv)
            gridder.buffer('w_plane').set_async(queue, w_plane)
            gridder.buffer('vis').set_async(queue, vis)
            queue.finish()
            queue.start_tuning()
            gridder()
            elapsed += queue.stop_tuning()
            queue.finish()
    gaps = N_compressed * args.grid_parameters.kernel_width**2 * args.polarizations / elapsed
    print('Processed {} ({}) visibilities in {:.6f}s with kernel size {} and {} polarizations'
          .format(N_compressed, N, elapsed, args.grid_parameters.kernel_width,
                  args.polarizations))
    print('{:.3f} GGAPS uncompressed'.format(gaps * N / N_compressed / 1e9))
    print('{:.3f} GGAPS compressed'.format(gaps / 1e9))
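# For context, the GGAPS numbers above are grid-point additions per second: each
# visibility touches a kernel_width x kernel_width footprint for every
# polarization. A standalone restatement (hypothetical helper and numbers):
def grid_additions_per_second(n_vis, kernel_width, polarizations, elapsed):
    """Grid-point additions per second for a square gridding kernel."""
    return n_vis * kernel_width**2 * polarizations / elapsed

# e.g. 10**8 visibilities, 7x7 kernel, 4 polarizations, 2.5 s:
# grid_additions_per_second(10**8, 7, 4, 2.5) / 1e9 -> 7.84 GGAPS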
def main():
    parser = get_parser()
    args = parser.parse_args(namespace=arguments.SmartNamespace())
    katsdpservices.setup_logging()
    if args.log_level is not None:
        logger.setLevel(args.log_level.upper())
    profiling.Profiler.set_profiler(profiling.FlamegraphProfiler())
    with closing(loader.load(args.input_file, args.input_option,
                             args.start_channel, args.stop_channel)) as dataset:
        writer = Writer(args, dataset)
        context = accel.create_some_context(interactive=False,
                                            device_filter=lambda x: x.is_cuda)
        queue = context.create_command_queue()
        frontend.run(args, context, queue, dataset, writer)
        # frontend.run modifies args.stop_channel in place, so even if it
        # wasn't specified by the user it will now be valid.
        writer.finalize(dataset, args.start_channel, args.stop_channel)
#!/usr/bin/env python
# for nosetest: nosetests katsdpsigproc.test.test_maskedsum

import time

import numpy as np

from katsdpsigproc import accel, maskedsum

context = accel.create_some_context(True)
queue = context.create_command_queue(profile=True)
data = np.random.randn(4000, 5000, 2).astype(np.float32).view(dtype=np.complex64)[..., 0]
mask = np.ones((4000,)).astype(np.float32)
template = maskedsum.MaskedSumTemplate(context)
msum = template.instantiate(queue, data.shape)
msum.ensure_all_bound()
msum.buffer('src').set(queue, data)
msum.buffer('mask').set(queue, mask)
start_event = queue.enqueue_marker()
msum()
end_event = queue.enqueue_marker()
out = msum.buffer('dest').get(queue)
t0 = time.time()
expected = np.sum(data * mask.reshape(data.shape[0], 1), axis=0).astype(np.complex64)
t1 = time.time()
print('gpu:', end_event.time_since(start_event), 'cpu:', t1 - t0)
np.testing.assert_equal(out.reshape(-1), expected)
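# The same host-side reference can be written as a single einsum (pure NumPy,
# independent of katsdpsigproc). `expected_einsum` is a hypothetical name added
# here for illustration; it is not part of the test above.
expected_einsum = np.einsum('ij,i->j', data, mask)
np.testing.assert_allclose(expected_einsum, expected, rtol=1e-4)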
def benchmark1d(args, data):
    if args.width % 2 != 1:
        raise ValueError('Width must be odd')
    if data.shape[0] <= args.width:
        raise ValueError('Number of channels must be greater than the filter width')
    context = None
    if not args.host:
        try:
            context = accel.create_some_context(True)
        except RuntimeError:
            print("No devices available. Executing on the CPU.", file=sys.stderr)
    if context is None:
        background = katsdpsigproc.rfi.host.BackgroundMedianFilterHost(args.width)
        noise_est = katsdpsigproc.rfi.host.NoiseEstMADHost()
        threshold = katsdpsigproc.rfi.host.ThresholdSumHost(args.sigmas)
        flagger = katsdpsigproc.rfi.host.FlaggerHost(background, noise_est, threshold)
        start = time.time()
        flags = flagger(data)
        end = time.time()
        print("CPU time (ms):", (end - start) * 1000.0)
    else:
        command_queue = context.create_command_queue(profile=True)
        background = katsdpsigproc.rfi.device.BackgroundMedianFilterDeviceTemplate(
            context, args.width)
        noise_est = katsdpsigproc.rfi.device.NoiseEstMADTDeviceTemplate(context, 10240)
        threshold = katsdpsigproc.rfi.device.ThresholdSumDeviceTemplate(context)
        template = katsdpsigproc.rfi.device.FlaggerDeviceTemplate(
            background, noise_est, threshold)
        flagger = template.instantiate(
            command_queue, data.shape[0], data.shape[1],
            threshold_args={'n_sigma': args.sigmas})
        flagger.ensure_all_bound()
        data_device = flagger.buffer('vis')
        flags_device = flagger.buffer('flags')
        data_device.set(command_queue, data)
        # Run once for warmup (allocates memory)
        flagger()
        # Run again, timing it
        command_queue.finish()
        start_time = time.time()
        start_event = command_queue.enqueue_marker()
        flagger()
        end_event = command_queue.enqueue_marker()
        command_queue.finish()
        end_time = time.time()
        flags = flags_device.get(command_queue)
        print("Host time (ms):", (end_time - start_time) * 1000.0)
        try:
            device_time = end_event.time_since(start_event) * 1000.0
        except Exception:
            # AMD CPU device doesn't seem to support profiling on marker events
            device_time = 'unknown'
        print("Device time (ms):", device_time)
    return flags
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--vis', type=int, default=10**6)
    parser.add_argument('--sources', type=int, default=10**4)
    args = parser.parse_args()

    image_parameters = parameters.ImageParameters(
        q_fov=1.0,
        image_oversample=None,
        frequency=0.2 * units.m,
        array=None,
        polarizations=polarization.STOKES_IQUV,
        dtype=np.float64,
        pixel_size=0.00001,
        pixels=4096)
    oversample = 8
    w_planes = 100
    grid_parameters = parameters.GridParameters(
        antialias_width=7.0,
        oversample=oversample,
        image_oversample=4,
        w_slices=10,
        w_planes=w_planes,
        max_w=5 * units.m,
        kernel_width=7)
    base_sources = [
        "dummy0, radec, 19:39:25.03, -63:42:45.7, (200.0 12000.0 -11.11 7.777 -1.231 0 0 0 1 0.1 0 0)",     # noqa: E501
        "dummy1, radec, 19:39:20.38, -63:42:09.1, (800.0 8400.0 -3.708 3.807 -0.7202 0 0 0 1 0.2 0.2 0.2)",  # noqa: E501
        "dummy2, radec, 19:39:08.29, -63:42:33.0, (800.0 43200.0 0.956 0.584 -0.1644 0 0 0 1 0.1 0 1)"       # noqa: E501
    ]
    sources = []
    for i in range(args.sources):
        sources.append(str(uuid.uuid4()) + base_sources[i % len(base_sources)])
    model = sky_model.KatpointSkyModel(katpoint.Catalogue(sources))
    phase_centre = katpoint.construct_radec_target(
        '19:39:30', '-63:42:30').astrometric_radec() * units.rad

    rs = RandomState(seed=1)
    uv = rs.random_integers(-2048, 2048, size=(args.vis, 2)).astype(np.int16)
    sub_uv = rs.random_integers(0, grid_parameters.oversample - 1,
                                size=(args.vis, 2)).astype(np.int16)
    w_plane = rs.random_integers(0, grid_parameters.w_planes - 1,
                                 size=args.vis).astype(np.int16)
    weights = rs.uniform(size=(args.vis, len(image_parameters.polarizations))).astype(np.float32)
    vis = rs.complex_normal(size=(args.vis, len(image_parameters.polarizations)))

    context = accel.create_some_context(device_filter=lambda x: x.is_cuda)
    queue = context.create_command_queue()
    allocator = accel.SVMAllocator(context)
    template = predict.PredictTemplate(context, np.float32, len(image_parameters.polarizations))
    fn = template.instantiate(queue, image_parameters, grid_parameters,
                              args.vis, len(model), allocator=allocator)
    fn.ensure_all_bound()
    fn.num_vis = args.vis
    fn.set_coordinates(uv, sub_uv, w_plane)
    fn.set_vis(vis)
    fn.set_weights(weights)
    fn.set_sky_model(model, phase_centre)
    fn.set_w(1.2)
    fn()
    fn()
    queue.finish()