def test_construction_from_url(self):
    view, cbid, sn, _, _ = make_fake_data_source(self.telstate, self.store, (20, 16, 40))
    source_direct = TelstateDataSource(view, cbid, sn, self.store)
    # Save RDB file to e.g. 'tempdir/cb/cb_sdp_l0.rdb', as if 'tempdir' is a real S3 bucket
    rdb_dir = os.path.join(self.tempdir, cbid)
    os.mkdir(rdb_dir)
    rdb_filename = os.path.join(rdb_dir, f'{cbid}_{sn}.rdb')
    # Insert CBID and stream name at the top level, just like metawriter does
    self.telstate['capture_block_id'] = cbid
    self.telstate['stream_name'] = sn
    with RDBWriter(rdb_filename) as rdbw:
        rdbw.save(self.telstate)
    # Check that we can open RDB file and automatically infer the chunk store
    source_from_file = open_data_source(rdb_filename)
    assert_telstate_data_source_equal(source_from_file, source_direct)
    # Check that we can override the capture_block_id and stream name via query parameters
    query = urllib.parse.urlencode({'capture_block_id': cbid, 'stream_name': sn})
    url = urllib.parse.urlunparse(('file', '', rdb_filename, '', query, ''))
    source_from_url = TelstateDataSource.from_url(url, chunk_store=self.store)
    assert_telstate_data_source_equal(source_from_url, source_direct)
    # Check invalid URLs
    with assert_raises(DataSourceNotFound):
        open_data_source('ftp://unsupported')
    with assert_raises(DataSourceNotFound):
        open_data_source(rdb_filename[:-4])

def test_rdb_support(self):
    telstate = katsdptelstate.TelescopeState()
    view, cbid, sn, _, _ = make_fake_data_source(telstate, self.store, (5, 16, 40), PREFIX)
    telstate['capture_block_id'] = cbid
    telstate['stream_name'] = sn
    # Save telstate to temp RDB file since RDBWriter needs a filename and not a handle
    rdb_filename = f'{cbid}_{sn}.rdb'
    temp_filename = os.path.join(self.tempdir, rdb_filename)
    with RDBWriter(temp_filename) as rdbw:
        rdbw.save(telstate)
    # Read the file back in and upload it to S3
    with open(temp_filename, mode='rb') as rdb_file:
        rdb_data = rdb_file.read()
    rdb_url = urllib.parse.urljoin(self.store_url, self.store.join(cbid, rdb_filename))
    self.store.create_array(cbid)
    self.store.complete_request('PUT', rdb_url, data=rdb_data)
    # Check that data source can be constructed from URL (with auto chunk store)
    source_from_url = TelstateDataSource.from_url(rdb_url, **self.store_kwargs)
    source_direct = TelstateDataSource(view, cbid, sn, self.store)
    assert_telstate_data_source_equal(source_from_url, source_direct)

def main():
    args = parse_args()
    dask.config.set(num_workers=args.workers)

    # Lightweight open with no data - just to create telstate and identify the CBID
    ds = TelstateDataSource.from_url(args.source, upgrade_flags=False, chunk_store=None)
    # View the CBID, but not any specific stream
    cbid = ds.capture_block_id
    telstate = ds.telstate.root().view(cbid)
    streams = get_streams(telstate, args.streams)

    # Find all arrays in the selected streams, and also ensure we're not
    # trying to write things back on top of an existing dataset.
    arrays = {}
    for stream_name in streams:
        sts = view_capture_stream(telstate, cbid, stream_name)
        try:
            chunk_info = sts['chunk_info']
        except KeyError as exc:
            raise RuntimeError('Could not get chunk info for {!r}: {}'
                               .format(stream_name, exc))
        for array_name, array_info in chunk_info.items():
            if args.new_prefix is not None:
                array_info['prefix'] = args.new_prefix + '-' + stream_name.replace('_', '-')
            prefix = array_info['prefix']
            path = os.path.join(args.dest, prefix)
            if os.path.exists(path):
                raise RuntimeError('Directory {!r} already exists'.format(path))
            store = get_chunk_store(args.source, sts, array_name)
            # Older files have dtype as an object that can't be encoded in msgpack
            dtype = np.dtype(array_info['dtype'])
            array_info['dtype'] = np.lib.format.dtype_to_descr(dtype)
            arrays[(stream_name, array_name)] = Array(stream_name, array_name, store, array_info)

    # Apply DATA_LOST bits to the flags arrays. This is a less efficient approach than
    # datasources.py, but much simpler.
    for stream_name in streams:
        flags_array = arrays.get((stream_name, 'flags'))
        if not flags_array:
            continue
        sources = [stream_name]
        sts = view_capture_stream(telstate, cbid, stream_name)
        sources += sts['src_streams']
        for src_stream in sources:
            if src_stream not in streams:
                continue
            src_ts = view_capture_stream(telstate, cbid, src_stream)
            for array_name in src_ts['chunk_info']:
                if array_name == 'flags' and src_stream != stream_name:
                    # Upgraded flags completely replace the source stream's
                    # flags, rather than augmenting them. Thus, data lost in
                    # the source stream has no effect.
                    continue
                lost_flags = arrays[(src_stream, array_name)].lost_flags
                lost_flags = lost_flags.rechunk(flags_array.data.chunks[:lost_flags.ndim])
                # weights_channel doesn't have a baseline axis
                while lost_flags.ndim < flags_array.data.ndim:
                    lost_flags = lost_flags[..., np.newaxis]
                lost_flags = da.broadcast_to(lost_flags, flags_array.data.shape,
                                             chunks=flags_array.data.chunks)
                flags_array.data |= lost_flags

    # Apply the rechunking specs
    for spec in args.spec:
        key = (spec.stream, spec.array)
        if key not in arrays:
            raise RuntimeError('{}/{} is not a known array'.format(spec.stream, spec.array))
        arrays[key].data = arrays[key].data.rechunk({0: spec.time, 1: spec.freq})

    # Write out the new data
    dest_store = NpyFileChunkStore(args.dest)
    stores = []
    for array in arrays.values():
        full_name = dest_store.join(array.chunk_info['prefix'], array.array_name)
        dest_store.create_array(full_name)
        stores.append(dest_store.put_dask_array(full_name, array.data))
        array.chunk_info['chunks'] = array.data.chunks
    stores = da.compute(*stores)
    # put_dask_array returns an array with an exception object per chunk
    for result_set in stores:
        for result in result_set.flat:
            if result is not None:
                raise result

    # Fix up chunk_info for new chunking
    for stream_name in streams:
        sts = view_capture_stream(telstate, cbid, stream_name)
        chunk_info = sts['chunk_info']
        for array_name in chunk_info.keys():
            chunk_info[array_name] = arrays[(stream_name, array_name)].chunk_info
        sts.wrapped.delete('chunk_info')
        sts.wrapped['chunk_info'] = chunk_info
        # s3_endpoint_url is for the old version of the data
        sts.wrapped.delete('s3_endpoint_url')
        if args.s3_endpoint_url is not None:
            sts.wrapped['s3_endpoint_url'] = args.s3_endpoint_url

    # Write updated RDB file
    url_parts = urllib.parse.urlparse(args.source, scheme='file')
    dest_file = os.path.join(args.dest, args.new_prefix or cbid,
                             os.path.basename(url_parts.path))
    os.makedirs(os.path.dirname(dest_file), exist_ok=True)
    with RDBWriter(dest_file) as writer:
        writer.save(telstate.backend)
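

# Entry-point sketch: assumes this module is also run directly as a script, with
# parse_args() (defined elsewhere in this file) supplying the command-line options.
if __name__ == '__main__':
    main()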