def convert_ms(infile, outfile=None, ddis=None, ignore=['HISTORY'], compressor=None, chunks=(100, 400, 32, 1), sub_chunks=10000, append=False):
    """
    Convert legacy format MS to xarray Visibility Dataset and zarr storage format

    This function requires CASA6 casatools module. The CASA MSv2 format is converted to the MSv3 schema per the
    specified definition here: https://drive.google.com/file/d/10TZ4dsFw9CconBc-GFxSeb2caT6wkmza/view?usp=sharing

    The MS is partitioned by DDI, which guarantees a fixed data shape per partition. This results in different subdirectories
    under the main vis.zarr folder.  There is no DDI in MSv3, so this simply serves as a partition id in the zarr directory.
    Within each DDI the rows are further split into an auto-correlation partition (named ``xdsa<ddi>``) and a partition
    holding everything else (named ``xds<ddi>``).

    Parameters
    ----------
    infile : str
        Input MS filename
    outfile : str
        Output zarr filename. If None, will use infile name (extension stripped) with .vis.zarr extension
    ddis : list
        List of specific DDIs to convert. DDI's are integer values, or use 'global' string for subtables. Leave as None to convert entire MS
    ignore : list
        List of subtables to ignore (case sensitive and generally all uppercase). This is useful if a particular subtable is causing errors.
        None ignores nothing. Note: default is now temporarily set to ignore the HISTORY table due a CASA6 issue in the table tool
        affecting a small set of test cases (set back to None if HISTORY is needed)
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None the zstd compression algorithm used with compression level 2.
    chunks: 4-D tuple of ints
        Shape of desired chunking in the form of (time, baseline, channel, polarization), use -1 for entire axis in one chunk. Default is (100, 400, 32, 1)
        Note: chunk size is the product of the four numbers, and data is batch processed by time axis, so that will drive memory needed for conversion.
    sub_chunks: int
        Chunking used for subtable conversion (except for POINTING which will use time/baseline dims from chunks parameter). This is a single integer
        used for the row-axis (d0) chunking only, no other dims in the subtables will be chunked.
    append : bool
        Keep destination zarr store intact and add new DDI's to it. Note that duplicate DDI's will still be overwritten. Default False deletes and replaces
        entire directory.

    Returns
    -------
    xarray.core.dataset.Dataset
      Master xarray dataset of datasets for this visibility set
    """
    import itertools
    import os
    import shutil
    import xarray
    import dask.array as da
    import numpy as np
    import time
    import cngi._utils._table_conversion as tblconv
    import cngi._utils._io as xdsio
    import warnings
    import importlib_metadata
    warnings.filterwarnings('ignore', category=FutureWarning)

    # parse filename to use
    infile = os.path.expanduser(infile)
    if outfile is None:
        # strip a trailing separator first so 'foo.ms/' still yields 'foo.vis.zarr';
        # splitext only looks at the last path component and tolerates names without an extension
        outfile = os.path.splitext(infile.rstrip(os.sep))[0] + '.vis.zarr'
    outfile = os.path.expanduser(outfile)

    # need to manually remove existing zarr file (if any)
    # done through the filesystem API rather than a shell string so that paths with spaces or
    # shell metacharacters are handled correctly
    if not append:
        if os.path.isdir(outfile) and not os.path.islink(outfile):
            shutil.rmtree(outfile, ignore_errors=True)
        elif os.path.lexists(outfile):
            os.remove(outfile)
    os.makedirs(outfile, exist_ok=True)

    # as part of MSv3 conversion, these columns in the main table are no longer needed
    ignorecols = ['FLAG_CATEGORY', 'FLAG_ROW', 'DATA_DESC_ID']
    if ignore is None:
        ignore = []

    # we need to assume an explicit ordering of dims
    dimorder = ['time', 'baseline', 'chan', 'pol']

    # we need the spectral window, polarization, and data description tables for processing the main table
    spw_xds = tblconv.convert_simple_table(infile, outfile='', subtable='SPECTRAL_WINDOW', ignore=ignorecols, nofile=True, add_row_id=True)
    pol_xds = tblconv.convert_simple_table(infile, outfile='', subtable='POLARIZATION', ignore=ignorecols, nofile=True)
    ddi_xds = tblconv.convert_simple_table(infile, outfile='', subtable='DATA_DESCRIPTION', ignore=ignorecols, nofile=True)

    # let's assume that each DATA_DESC_ID (ddi) is a fixed shape that may differ from others
    # form a list of ddis to process, each will be placed it in its own xarray dataset and partition
    if ddis is None:
        ddis = list(ddi_xds['d0'].values) + ['global']
    else:
        ddis = np.atleast_1d(ddis)
    xds_list = []

    # extra data selection to split autocorr and crosscorr into separate xds
    # extrasels[0] is for autocorrelation
    # extrasels[1] is for others (crosscorrelations, correlations between feeds)
    extrasels = [
        'ANTENNA1 == ANTENNA2 && FEED1 == FEED2',
        'ANTENNA1 != ANTENNA2 || FEED1 != FEED2'
    ]
    # partition name prefix paired with each selection string above
    xds_prefixes = ('xdsa', 'xds')

    # shared arguments of every main-table conversion for one (ddi, selection) pair
    main_keys = {'TIME': 'time', ('ANTENNA1', 'ANTENNA2'): 'baseline'}
    tmp_path = os.path.join(outfile, 'tmp')

    ####################################################################
    # process each selected DDI from the input MS, assume a fixed shape within the ddi (should always be true)
    # each DDI is written to its own subdirectory under the parent folder
    for (xds_prefix, extrasel), ddi in itertools.product(zip(xds_prefixes, extrasels), ddis):
        if ddi == 'global':
            continue  # handled afterwards

        xds_name = f'{xds_prefix}{ddi}'
        xds_path = os.path.join(outfile, xds_name)

        ddi = int(ddi)
        print('Processing ddi', ddi, f'xds name is {xds_name}', end='\r')
        start_ddi = time.time()

        # these columns are different / absent in MSv3 or need to be handled as special cases
        msv2 = ['WEIGHT', 'WEIGHT_SPECTRUM', 'SIGMA', 'SIGMA_SPECTRUM', 'ANTENNA1', 'ANTENNA2', 'UVW']

        # convert columns that are common to MSv2 and MSv3
        xds = tblconv.convert_expanded_table(infile, xds_path,
                                             keys=main_keys,
                                             subsel={'DATA_DESC_ID': ddi},
                                             timecols=['time'],
                                             dimnames={'d2': 'chan', 'd3': 'pol'},
                                             ignore=ignorecols + msv2,
                                             compressor=compressor,
                                             chunks=chunks,
                                             nofile=False,
                                             extraselstr=extrasel)
        if len(xds.dims) == 0:
            continue  # nothing matched this ddi / selection combination

        # convert and append UVW separately so we can handle its special dimension
        uvw_chunks = (chunks[0], chunks[1], 3)  # No chunking over uvw_index
        uvw_xds = tblconv.convert_expanded_table(infile, tmp_path,
                                                 keys=main_keys,
                                                 subsel={'DATA_DESC_ID': ddi},
                                                 timecols=['time'],
                                                 dimnames={'d2': 'uvw_index'},
                                                 ignore=ignorecols + list(xds.data_vars) + msv2[:-1],
                                                 compressor=compressor,
                                                 chunks=uvw_chunks,
                                                 nofile=False,
                                                 extraselstr=extrasel)
        uvw_xds.to_zarr(xds_path, mode='a', compute=True, consolidated=True)

        # convert and append the ANTENNA1 and ANTENNA2 columns separately so we can squash the unnecessary time dimension
        ant_xds = tblconv.convert_expanded_table(infile, tmp_path,
                                                 keys=main_keys,
                                                 subsel={'DATA_DESC_ID': ddi},
                                                 timecols=['time'],
                                                 ignore=ignorecols + list(xds.data_vars) + msv2[:4] + ['UVW'],
                                                 compressor=compressor,
                                                 chunks=chunks[:2],
                                                 nofile=False,
                                                 extraselstr=extrasel)
        ant_xds = ant_xds.assign({
            'ANTENNA1': ant_xds.ANTENNA1.max(axis=0),
            'ANTENNA2': ant_xds.ANTENNA2.max(axis=0)
        }).drop_dims('time')
        ant_xds.to_zarr(xds_path, mode='a', compute=True, consolidated=True)

        # now convert just the WEIGHT/SIGMA columns and their *_SPECTRUM variants (if present)
        # WEIGHT needs to be expanded to full dimensionality (time, baseline, chan, pol)
        wt_xds = tblconv.convert_expanded_table(infile, tmp_path,
                                                keys=main_keys,
                                                subsel={'DATA_DESC_ID': ddi},
                                                timecols=['time'],
                                                dimnames={},
                                                ignore=ignorecols + list(xds.data_vars) + msv2[-3:],
                                                compressor=compressor,
                                                chunks=chunks,
                                                nofile=False,
                                                extraselstr=extrasel)

        # MSv3 changes to weight/sigma column handling
        # 1. DATA_WEIGHT = 1/SIGMA**2
        # 2. CORRECTED_DATA_WEIGHT = WEIGHT
        # 3. if SIGMA_SPECTRUM or WEIGHT_SPECTRUM present, use them instead of SIGMA and WEIGHT
        # 4. discard SIGMA, WEIGHT, SIGMA_SPECTRUM and WEIGHT_SPECTRUM from converted ms
        # 5. set shape of DATA_WEIGHT / CORRECTED_DATA_WEIGHT to (time, baseline, chan, pol) padding as necessary
        if 'DATA' in xds.data_vars:
            if 'SIGMA_SPECTRUM' in wt_xds.data_vars:
                # rename first, then derive the weight from the renamed dataset so the new
                # variable carries the (time, baseline, chan, pol) dims rather than the old names
                wt_xds = wt_xds.rename(dict(zip(wt_xds.SIGMA_SPECTRUM.dims, dimorder)))
                wt_xds = wt_xds.assign({'DATA_WEIGHT': 1 / wt_xds.SIGMA_SPECTRUM**2})
            elif 'SIGMA' in wt_xds.data_vars:
                # insert a length-1 channel axis and tile it out to the full channel count
                wts = wt_xds.SIGMA.shape[:2] + (1, ) + (wt_xds.SIGMA.shape[-1], )
                wt_da = da.tile(da.reshape(wt_xds.SIGMA.data, wts), (1, 1, len(xds.chan), 1)).rechunk(chunks)
                wt_xds = wt_xds.assign({'DATA_WEIGHT': xarray.DataArray(1 / wt_da**2, dims=dimorder)})
        if 'CORRECTED_DATA' in xds.data_vars:
            if 'WEIGHT_SPECTRUM' in wt_xds.data_vars:
                # same rename-then-assign ordering as for SIGMA_SPECTRUM above
                wt_xds = wt_xds.rename(dict(zip(wt_xds.WEIGHT_SPECTRUM.dims, dimorder)))
                wt_xds = wt_xds.assign({'CORRECTED_DATA_WEIGHT': wt_xds.WEIGHT_SPECTRUM})
            elif 'WEIGHT' in wt_xds.data_vars:
                wts = wt_xds.WEIGHT.shape[:2] + (1, ) + (wt_xds.WEIGHT.shape[-1], )
                wt_da = da.tile(da.reshape(wt_xds.WEIGHT.data, wts), (1, 1, len(xds.chan), 1)).rechunk(chunks)
                wt_xds = wt_xds.assign({'CORRECTED_DATA_WEIGHT': xarray.DataArray(wt_da, dims=dimorder)})

        # drop_vars replaces the deprecated Dataset.drop for removing variables
        wt_xds = wt_xds.drop_vars([cc for cc in msv2 if cc in wt_xds.data_vars])
        wt_xds.to_zarr(xds_path, mode='a', compute=True, consolidated=True)

        # add in relevant data grouping, spw and polarization attributes
        attrs = {'data_groups': [{}]}
        groups = attrs['data_groups'][0]
        if ('DATA' in xds.data_vars) and ('DATA_WEIGHT' in wt_xds.data_vars):
            gid = str(len(groups))
            groups[gid] = {'id': gid, 'data': 'DATA', 'uvw': 'UVW', 'flag': 'FLAG', 'weight': 'DATA_WEIGHT'}
        if ('CORRECTED_DATA' in xds.data_vars) and ('CORRECTED_DATA_WEIGHT' in wt_xds.data_vars):
            gid = str(len(groups))
            groups[gid] = {'id': gid, 'data': 'CORRECTED_DATA', 'uvw': 'UVW', 'flag': 'FLAG', 'weight': 'CORRECTED_DATA_WEIGHT'}

        spw_id = ddi_xds['spectral_window_id'].values[ddi]
        pol_id = ddi_xds['polarization_id'].values[ddi]
        for dv in spw_xds.data_vars:
            val = spw_xds[dv].values[spw_id]
            attrs[dv.lower()] = int(val) if isinstance(val, np.bool_) else val  # convert bools
        for dv in pol_xds.data_vars:
            val = pol_xds[dv].values[pol_id]
            attrs[dv.lower()] = int(val) if isinstance(val, np.bool_) else val  # convert bools

        # grab the channel frequency values from the spw table data and pol idxs from the polarization table, add spw and pol ids
        nchan = len(xds.chan)
        chan = attrs.pop('chan_freq')[:nchan]
        pol = attrs.pop('corr_type')[:len(xds.pol)]

        # truncate per-chan values to the actual number of channels and move to coordinates
        chan_width = xarray.DataArray(da.from_array(attrs.pop('chan_width')[:nchan], chunks=chunks[2]), dims=['chan'])
        effective_bw = xarray.DataArray(da.from_array(attrs.pop('effective_bw')[:nchan], chunks=chunks[2]), dims=['chan'])
        resolution = xarray.DataArray(da.from_array(attrs.pop('resolution')[:nchan], chunks=chunks[2]), dims=['chan'])

        coords = {
            'chan': chan,
            'pol': pol,
            'spw_id': [spw_id],
            'pol_id': [pol_id],
            'chan_width': chan_width,
            'effective_bw': effective_bw,
            'resolution': resolution
        }
        aux_xds = xarray.Dataset(coords=coords, attrs=attrs)

        aux_xds.to_zarr(xds_path, mode='a', compute=True, consolidated=True)
        # reopen from disk so the returned partition reflects everything appended above
        xds = xarray.open_zarr(xds_path)

        xds_list += [(xds_name, xds)]
        print('Completed ddi %i process time {:0.2f} s'.format(time.time() - start_ddi) % ddi)

    # clean up the tmp directory created by the weight conversion to MSv3
    shutil.rmtree(tmp_path, ignore_errors=True)

    # convert other subtables to their own partitions, denoted by 'global_' prefix
    skip_tables = ['DATA_DESCRIPTION', 'SORTED_TABLE'] + ignore
    subtables = sorted([tt for tt in os.listdir(infile) if os.path.isdir(os.path.join(infile, tt)) and tt not in skip_tables])
    if 'global' in ddis:
        start_ddi = time.time()
        global_path = os.path.join(outfile, 'global')
        for ii, subtable in enumerate(subtables):
            print('processing subtable %i of %i : %s' % (ii, len(subtables), subtable), end='\r')
            if subtable == 'POINTING':  # expand the dimensions of the pointing table
                sub_xds = tblconv.convert_expanded_table(infile, global_path,
                                                         subtable=subtable,
                                                         keys={'TIME': 'time', 'ANTENNA_ID': 'antenna_id'},
                                                         timecols=['time'],
                                                         chunks=chunks)
            else:
                add_row_id = (subtable in ['ANTENNA', 'FIELD', 'OBSERVATION', 'SCAN', 'SPECTRAL_WINDOW', 'STATE'])
                sub_xds = tblconv.convert_simple_table(infile, global_path, subtable,
                                                       timecols=['TIME'],
                                                       ignore=ignorecols,
                                                       compressor=compressor,
                                                       nofile=False,
                                                       chunks=(sub_chunks, -1),
                                                       add_row_id=add_row_id)

            # empty subtables are left out of the returned dataset
            if len(sub_xds.dims) != 0:
                xds_list += [(subtable, sub_xds)]

        print('Completed subtables process time {:0.2f} s'.format(time.time() - start_ddi))

    # write sw version that did this conversion to zarr directory
    try:
        version = importlib_metadata.version('cngi-prototype')
    except Exception:
        # package metadata unavailable (e.g. running from a source checkout); use a placeholder
        version = '0.0.0'

    # NOTE(review): 'cngi-protoype' looks like a typo but is kept byte-for-byte in case readers match on it
    with open(outfile + '/.version', 'w') as fid:
        fid.write('cngi-protoype ' + version + '\n')

    # build the master xds to return
    mxds = xdsio.vis_xds_packager(xds_list)

    # blank out the last carriage-return progress line
    print(' ' * 50)

    return mxds
def convert_ms(infile, outfile=None, ddis=None, ignore=['HISTORY'], compressor=None, chunk_shape=(100, 400, 32, 1), append=False):
    """
    Convert legacy format MS to xarray Visibility Dataset and zarr storage format

    This function requires CASA6 casatools module. The CASA MSv2 format is converted to the MSv3 schema per the
    specified definition here: https://drive.google.com/file/d/10TZ4dsFw9CconBc-GFxSeb2caT6wkmza/view?usp=sharing

    The MS is partitioned by DDI, which guarantees a fixed data shape per partition. This results in different subdirectories
    under the main vis.zarr folder.  There is no DDI in MSv3, so this simply serves as a partition id in the zarr directory.

    Parameters
    ----------
    infile : str
        Input MS filename
    outfile : str
        Output zarr filename. If None, will use infile name (extension stripped) with .vis.zarr extension
    ddis : list
        List of specific DDIs to convert. DDI's are integer values, or use 'global' string for subtables. Leave as None to convert entire MS
    ignore : list
        List of subtables to ignore (case sensitive and generally all uppercase). This is useful if a particular subtable is causing errors.
        None ignores nothing. Note: default is now temporarily set to ignore the HISTORY table due a CASA6 issue in the table tool
        affecting a small set of test cases (set back to None if HISTORY is needed)
    compressor : numcodecs.blosc.Blosc
        The blosc compressor to use when saving the converted data to disk using zarr.
        If None the zstd compression algorithm used with compression level 2.
    chunk_shape: 4-D tuple of ints
        Shape of desired chunking in the form of (time, baseline, channel, polarization), use -1 for entire axis in one chunk. Default is (100, 400, 32, 1)
        Note: chunk size is the product of the four numbers, and data is batch processed by time axis, so that will drive memory needed for conversion.
    append : bool
        Keep destination zarr store intact and add new DDI's to it. Note that duplicate DDI's will still be overwritten. Default False deletes and replaces
        entire directory.

    Returns
    -------
    xarray.core.dataset.Dataset
      Master xarray dataset of datasets for this visibility set
    """
    import os
    import shutil
    import xarray
    import dask.array as da
    import numpy as np
    import time
    import cngi._utils._table_conversion as tblconv
    import cngi._utils._io as xdsio
    import warnings
    import importlib_metadata
    warnings.filterwarnings('ignore', category=FutureWarning)

    # parse filename to use
    infile = os.path.expanduser(infile)
    if outfile is None:
        # strip a trailing separator first so 'foo.ms/' still yields 'foo.vis.zarr';
        # splitext only looks at the last path component and tolerates names without an extension
        outfile = os.path.splitext(infile.rstrip(os.sep))[0] + '.vis.zarr'
    outfile = os.path.expanduser(outfile)

    # need to manually remove existing zarr file (if any)
    # done through the filesystem API rather than a shell string so that paths with spaces or
    # shell metacharacters are handled correctly
    if not append:
        if os.path.isdir(outfile) and not os.path.islink(outfile):
            shutil.rmtree(outfile, ignore_errors=True)
        elif os.path.lexists(outfile):
            os.remove(outfile)
    os.makedirs(outfile, exist_ok=True)

    # as part of MSv3 conversion, these columns in the main table are no longer needed
    ignorecols = ['FLAG_CATEGORY', 'FLAG_ROW', 'DATA_DESC_ID']
    if ignore is None:
        ignore = []

    # explicit ordering of dims used for the expanded weight column
    dimorder = ['time', 'baseline', 'chan', 'pol']

    # we need the spectral window, polarization, and data description tables for processing the main table
    spw_xds = tblconv.convert_simple_table(infile, outfile='', subtable='SPECTRAL_WINDOW', ignore=ignorecols, nofile=True)
    pol_xds = tblconv.convert_simple_table(infile, outfile='', subtable='POLARIZATION', ignore=ignorecols, nofile=True)
    ddi_xds = tblconv.convert_simple_table(infile, outfile='', subtable='DATA_DESCRIPTION', ignore=ignorecols, nofile=True)

    # let's assume that each DATA_DESC_ID (ddi) is a fixed shape that may differ from others
    # form a list of ddis to process, each will be placed it in its own xarray dataset and partition
    if ddis is None:
        ddis = list(ddi_xds['d0'].values) + ['global']
    else:
        ddis = np.atleast_1d(ddis)
    xds_list = []

    # shared arguments of every main-table conversion
    main_keys = {'TIME': 'time', ('ANTENNA1', 'ANTENNA2'): 'baseline'}
    tmp_path = os.path.join(outfile, 'tmp')

    ####################################################################
    # process each selected DDI from the input MS, assume a fixed shape within the ddi (should always be true)
    # each DDI is written to its own subdirectory under the parent folder
    for ddi in ddis:
        if ddi == 'global':
            continue  # handled afterwards
        ddi = int(ddi)
        xds_name = 'xds' + str(ddi)
        xds_path = os.path.join(outfile, xds_name)
        print('Processing ddi', ddi, end='\r')
        start_ddi = time.time()

        # these columns are different / absent in MSv3 or need to be handled as special cases
        msv2 = ['WEIGHT', 'WEIGHT_SPECTRUM', 'SIGMA', 'SIGMA_SPECTRUM', 'UVW']

        # convert columns that are common to MSv2 and MSv3
        xds = tblconv.convert_expanded_table(infile, xds_path,
                                             keys=main_keys,
                                             subsel={'DATA_DESC_ID': ddi},
                                             timecols=['time'],
                                             dimnames={'d2': 'chan', 'd3': 'pol'},
                                             ignore=ignorecols + msv2,
                                             compressor=compressor,
                                             chunk_shape=chunk_shape,
                                             nofile=False)

        # convert and append UVW separately so we can handle its special dimension
        uvw_xds = tblconv.convert_expanded_table(infile, tmp_path,
                                                 keys=main_keys,
                                                 subsel={'DATA_DESC_ID': ddi},
                                                 timecols=['time'],
                                                 dimnames={'d2': 'uvw_index'},
                                                 ignore=ignorecols + list(xds.data_vars) + msv2[:-1],
                                                 compressor=compressor,
                                                 chunk_shape=chunk_shape,
                                                 nofile=False)
        uvw_xds.to_zarr(xds_path, mode='a', compute=True, consolidated=True)

        # now convert just the WEIGHT and WEIGHT_SPECTRUM (if present)
        # WEIGHT needs to be expanded to full dimensionality (time, baseline, chan, pol)
        wt_xds = tblconv.convert_expanded_table(infile, tmp_path,
                                                keys=main_keys,
                                                subsel={'DATA_DESC_ID': ddi},
                                                timecols=['time'],
                                                dimnames={},
                                                ignore=ignorecols + list(xds.data_vars) + msv2[2:],
                                                compressor=compressor,
                                                chunk_shape=chunk_shape,
                                                nofile=False)

        # if WEIGHT_SPECTRUM is present, drop WEIGHT and append WEIGHT_SPECTRUM to the main xds with named dims
        # (NOTE(review): the variable keeps the name WEIGHT_SPECTRUM, it is not renamed to WEIGHT)
        # otherwise expand the dimensionality of WEIGHT and add it to the xds
        if 'WEIGHT_SPECTRUM' in wt_xds.data_vars:
            wt_xds = wt_xds.drop_vars('WEIGHT').rename(dict(zip(wt_xds.WEIGHT_SPECTRUM.dims, dimorder)))
        else:
            # insert a length-1 channel axis and tile it out to the full channel count
            wts = wt_xds.WEIGHT.shape[:2] + (1, ) + (wt_xds.WEIGHT.shape[-1], )
            wt_da = da.tile(da.reshape(wt_xds.WEIGHT.data, wts), (1, 1, len(xds.chan), 1)).rechunk(chunk_shape)
            wt_xds = wt_xds.drop_vars('WEIGHT').assign({'WEIGHT': xarray.DataArray(wt_da, dims=dimorder)})
        wt_xds.to_zarr(xds_path, mode='a', compute=True, consolidated=True)

        # add in relevant spw and polarization attributes
        attrs = {}
        spw_id = ddi_xds['spectral_window_id'].values[ddi]
        pol_id = ddi_xds['polarization_id'].values[ddi]
        for dv in spw_xds.data_vars:
            val = spw_xds[dv].values[spw_id]
            attrs[dv.lower()] = int(val) if isinstance(val, np.bool_) else val  # convert bools
        for dv in pol_xds.data_vars:
            val = pol_xds[dv].values[pol_id]
            attrs[dv.lower()] = int(val) if isinstance(val, np.bool_) else val  # convert bools

        # grab the channel frequency values from the spw table data and pol idxs from the polarization table, add spw and pol ids
        nchan = len(xds.chan)
        chan = attrs.pop('chan_freq')[:nchan]
        pol = attrs.pop('corr_type')[:len(xds.pol)]

        # truncate per-chan values to the actual number of channels and move to coordinates
        chan_width = xarray.DataArray(attrs.pop('chan_width')[:nchan], dims=['chan'])
        effective_bw = xarray.DataArray(attrs.pop('effective_bw')[:nchan], dims=['chan'])
        resolution = xarray.DataArray(attrs.pop('resolution')[:nchan], dims=['chan'])

        coords = {
            'chan': chan,
            'pol': pol,
            'spw_id': [spw_id],
            'pol_id': [pol_id],
            'chan_width': chan_width,
            'effective_bw': effective_bw,
            'resolution': resolution
        }
        aux_xds = xarray.Dataset(coords=coords, attrs=attrs)

        aux_xds.to_zarr(xds_path, mode='a', compute=True, consolidated=True)
        # reopen from disk so the returned partition reflects everything appended above
        xds = xarray.open_zarr(xds_path)

        xds_list += [(xds_name, xds)]
        print('Completed ddi %i process time {:0.2f} s'.format(time.time() - start_ddi) % ddi)

    # clean up the tmp directory created by the weight conversion to MSv3
    shutil.rmtree(tmp_path, ignore_errors=True)

    # convert other subtables to their own partitions, denoted by 'global_' prefix
    skip_tables = ['DATA_DESCRIPTION', 'SORTED_TABLE'] + ignore
    subtables = sorted([tt for tt in os.listdir(infile) if os.path.isdir(os.path.join(infile, tt)) and tt not in skip_tables])
    if 'global' in ddis:
        start_ddi = time.time()
        global_path = os.path.join(outfile, 'global')
        for ii, subtable in enumerate(subtables):
            print('processing subtable %i of %i : %s' % (ii, len(subtables), subtable), end='\r')
            if subtable == 'POINTING':  # expand the dimensions of the pointing table
                sub_xds = tblconv.convert_expanded_table(infile, global_path,
                                                         subtable=subtable,
                                                         keys={'TIME': 'time', 'ANTENNA_ID': 'antenna_id'},
                                                         timecols=['time'],
                                                         chunk_shape=chunk_shape)
            else:
                sub_xds = tblconv.convert_simple_table(infile, global_path, subtable,
                                                       timecols=['TIME'],
                                                       ignore=ignorecols,
                                                       compressor=compressor,
                                                       nofile=False)

            # empty subtables are left out of the returned dataset
            if len(sub_xds.dims) != 0:
                # to conform to MSv3, we need to add explicit ID fields to certain tables
                if subtable in ['ANTENNA', 'FIELD', 'OBSERVATION', 'SCAN', 'SPECTRAL_WINDOW', 'STATE']:
                    sub_path = os.path.join(outfile, 'global/' + subtable)
                    aux_xds = xarray.Dataset(coords={subtable.lower() + '_id': xarray.DataArray(sub_xds.d0.values, dims=['d0'])})
                    aux_xds.to_zarr(sub_path, mode='a', compute=True, consolidated=True)
                    # reopen so the returned dataset includes the id coordinate just written
                    sub_xds = xarray.open_zarr(sub_path)

                xds_list += [(subtable, sub_xds)]

        print('Completed subtables process time {:0.2f} s'.format(time.time() - start_ddi))

    # write sw version that did this conversion to zarr directory
    try:
        version = importlib_metadata.version('cngi-prototype')
    except Exception:
        # package metadata unavailable (e.g. running from a source checkout); don't lose a
        # finished conversion over the version stamp
        version = '0.0.0'

    # NOTE(review): 'cngi-protoype' looks like a typo but is kept byte-for-byte in case readers match on it
    with open(outfile + '/.version', 'w') as fid:
        fid.write('cngi-protoype ' + version + '\n')

    # build the master xds to return
    mxds = xdsio.vis_xds_packager(xds_list)

    # blank out the last carriage-return progress line
    print(' ' * 50)

    return mxds
def read_vis(infile, partition=None, chunks=None, consolidated=True, overwrite_encoded_chunks=True):
    """
    Read zarr format Visibility data from disk to xarray Dataset

    Parameters
    ----------
    infile : str
        input Visibility filename
    partition : string or list
        name of partition(s) to read as returned by describe_vis. Multiple partitions in list form will return a master dataset of datasets.
        Use 'global' for global metadata. Default None returns everything
    chunks : dict
        sets specified chunk size per dimension. Dict is in the form of 'dim':chunk_size, for example {'time':100, 'baseline':400, 'chan':32, 'pol':1}.
        Default None uses the original zarr chunking.
    consolidated : bool
        use zarr consolidated metadata capability. Only works for stores that have already been consolidated. Default True works with datasets
        produced by convert_ms which automatically consolidates metadata.
    overwrite_encoded_chunks : bool
        drop the zarr chunks encoded for each variable when a dataset is loaded with specified chunk sizes.  Default True, only applies when chunks
        is not None.

    Returns
    -------
    xarray.core.dataset.Dataset
        New xarray Dataset of Visibility data contents
    """
    import os
    import numpy as np
    import cngi._utils._io as xdsio
    from xarray import open_zarr

    infile = os.path.expanduser(infile)

    # no explicit selection means every entry found in the store directory
    parts = np.atleast_1d(os.listdir(infile) if partition is None else partition)

    # without user chunking, keep the on-disk chunking and its encoding
    if chunks is None:
        chunks, overwrite_encoded_chunks = 'auto', False

    def _open(name):
        # open a single partition directory relative to the store root
        return open_zarr(os.path.join(infile, str(name)),
                         chunks=chunks,
                         consolidated=consolidated,
                         overwrite_encoded_chunks=overwrite_encoded_chunks)

    # swap the 'global' placeholder for the individual entries stored beneath it
    global_root = os.path.join(infile, 'global')
    if ('global' in parts) and os.path.isdir(global_root):
        expanded = sorted('global/' + entry for entry in os.listdir(global_root))
        remaining = np.delete(parts, np.where(parts == 'global'))
        parts = np.hstack((remaining, expanded))

    # a single partition is returned directly rather than wrapped in a master dataset
    if len(parts) == 1:
        return _open(parts[0])

    # otherwise collect every partition that exists as a directory and package them together
    named = [(part.replace('global/', ''), _open(part))
             for part in parts
             if os.path.isdir(os.path.join(infile, str(part)))]

    # build the master xds to return
    return xdsio.vis_xds_packager(named)
def read_ms(infile, ddis=None, ignore=None, chunks=(400, 400, 64, 2)):
    """
    Read legacy format MS to xarray Visibility Dataset

    The CASA MSv2 format is converted to the MSv3 schema per the
    specified definition here: https://drive.google.com/file/d/10TZ4dsFw9CconBc-GFxSeb2caT6wkmza/view?usp=sharing

    The MS is partitioned by DDI, which guarantees a fixed data shape per partition. This results in separate xarray
    dataset (xds) partitions contained within a main xds (mxds).  There is no DDI in MSv3, so this simply serves as
    a partition id for each xds.

    Parameters
    ----------
    infile : str
        Input MS filename
    ddis : list
        List of specific DDIs to read. DDI's are integer values, or use 'global' string for subtables. Leave as None to read entire MS
    ignore : list
        List of subtables to ignore (case sensitive and generally all uppercase). This is useful if a particular subtable is causing errors
        or is very large and slowing down reads. Default is None
    chunks: 4-D tuple of ints
        Shape of desired chunking in the form of (time, baseline, channel, polarization). Larger values reduce the number of chunks and
        speed up the reads at the cost of more memory. Chunk size is the product of the four numbers. Default is (400, 400, 64, 2)

    Returns
    -------
    xarray.core.dataset.Dataset
      Main xarray dataset of datasets for this visibility set
    """
    import os
    import xarray
    import dask.array as da
    import numpy as np
    import cngi._utils._table_conversion2 as tblconv
    import cngi._utils._io as xdsio
    import warnings
    warnings.filterwarnings('ignore', category=FutureWarning)

    # parse filename to use
    infile = os.path.expanduser(infile)

    # as part of MSv3 conversion, these columns in the main table are no longer needed
    ignorecols = ['FLAG_CATEGORY', 'FLAG_ROW', 'DATA_DESC_ID']
    if ignore is None:
        ignore = []

    # we need to assume an explicit ordering of dims
    dimorder = ['time', 'baseline', 'chan', 'pol']

    # we need the spectral window, polarization, and data description tables for processing the main table
    spw_xds = tblconv.read_simple_table(infile, subtable='SPECTRAL_WINDOW', ignore=ignorecols, add_row_id=True)
    pol_xds = tblconv.read_simple_table(infile, subtable='POLARIZATION', ignore=ignorecols)
    ddi_xds = tblconv.read_simple_table(infile, subtable='DATA_DESCRIPTION', ignore=ignorecols)

    # let's assume that each DATA_DESC_ID (ddi) is a fixed shape that may differ from others
    # form a list of ddis to process, each will be placed it in its own xarray dataset and partition
    if ddis is None:
        ddis = list(ddi_xds['d0'].values) + ['global']
    else:
        ddis = np.atleast_1d(ddis)
    xds_list = []

    ####################################################################
    # process each selected DDI from the input MS, assume a fixed shape within the ddi (should always be true)
    # each DDI becomes its own partition in the returned main dataset
    for ddi in ddis:
        if ddi == 'global':
            continue  # handled afterwards
        ddi = int(ddi)

        # convert columns that are common to MSv2 and MSv3
        xds = tblconv.read_main_table(infile, subsel=ddi, ignore=ignorecols, chunks=chunks)
        if len(xds.dims) == 0:
            continue  # nothing to read for this ddi

        # reduce the ANTENNA1 and ANTENNA2 columns over axis 0 so we can squash the unnecessary time dimension
        xds = xds.assign({
            'ANTENNA1': xds.ANTENNA1.max(axis=0),
            'ANTENNA2': xds.ANTENNA2.max(axis=0)
        })

        # MSv3 changes to weight/sigma column handling
        # 1. DATA_WEIGHT = 1/SIGMA**2
        # 2. CORRECTED_DATA_WEIGHT = WEIGHT
        # 3. if SIGMA_SPECTRUM or WEIGHT_SPECTRUM present, use them instead of SIGMA and WEIGHT
        # 4. discard SIGMA, WEIGHT, SIGMA_SPECTRUM and WEIGHT_SPECTRUM from converted ms
        # 5. set shape of DATA_WEIGHT / CORRECTED_DATA_WEIGHT to (time, baseline, chan, pol) padding as necessary
        if 'DATA' in xds.data_vars:
            if 'SIGMA_SPECTRUM' in xds.data_vars:
                # drop_vars replaces the deprecated Dataset.drop for removing variables
                xds = xds.assign({'DATA_WEIGHT': 1 / xds.SIGMA_SPECTRUM**2}).drop_vars('SIGMA_SPECTRUM')
            elif 'SIGMA' in xds.data_vars:
                # insert a length-1 channel axis and tile it out to the full channel count
                wts = xds.SIGMA.shape[:2] + (1, ) + (xds.SIGMA.shape[-1], )
                wt_da = da.tile(da.reshape(xds.SIGMA.data, wts), (1, 1, len(xds.chan), 1)).rechunk(chunks)
                xds = xds.assign({'DATA_WEIGHT': xarray.DataArray(1 / wt_da**2, dims=dimorder)})
        if 'CORRECTED_DATA' in xds.data_vars:
            if 'WEIGHT_SPECTRUM' in xds.data_vars:
                xds = xds.rename({'WEIGHT_SPECTRUM': 'CORRECTED_DATA_WEIGHT'})
            elif 'WEIGHT' in xds.data_vars:
                wts = xds.WEIGHT.shape[:2] + (1, ) + (xds.WEIGHT.shape[-1], )
                wt_da = da.tile(da.reshape(xds.WEIGHT.data, wts), (1, 1, len(xds.chan), 1)).rechunk(chunks)
                xds = xds.assign({'CORRECTED_DATA_WEIGHT': xarray.DataArray(wt_da, dims=dimorder)}).drop_vars('WEIGHT')

        # remove whichever MSv2 weight/sigma columns are still present
        xds = xds.drop_vars(['WEIGHT', 'SIGMA', 'SIGMA_SPECTRUM', 'WEIGHT_SPECTRUM'], errors='ignore')

        # add in relevant data grouping, spw and polarization attributes
        attrs = {'data_groups': [{}]}
        groups = attrs['data_groups'][0]
        if ('DATA' in xds.data_vars) and ('DATA_WEIGHT' in xds.data_vars):
            gid = str(len(groups))
            groups[gid] = {'id': gid, 'data': 'DATA', 'uvw': 'UVW', 'flag': 'FLAG', 'weight': 'DATA_WEIGHT'}
        if ('CORRECTED_DATA' in xds.data_vars) and ('CORRECTED_DATA_WEIGHT' in xds.data_vars):
            gid = str(len(groups))
            groups[gid] = {'id': gid, 'data': 'CORRECTED_DATA', 'uvw': 'UVW', 'flag': 'FLAG', 'weight': 'CORRECTED_DATA_WEIGHT'}

        spw_id = ddi_xds['spectral_window_id'].values[ddi]
        pol_id = ddi_xds['polarization_id'].values[ddi]
        for dv in spw_xds.data_vars:
            val = spw_xds[dv].values[spw_id]
            attrs[dv.lower()] = int(val) if isinstance(val, np.bool_) else val  # convert bools
        for dv in pol_xds.data_vars:
            val = pol_xds[dv].values[pol_id]
            attrs[dv.lower()] = int(val) if isinstance(val, np.bool_) else val  # convert bools

        # grab the channel frequency values from the spw table data and pol idxs from the polarization table, add spw and pol ids
        nchan = len(xds.chan)
        chan = attrs.pop('chan_freq')[:nchan]
        pol = attrs.pop('corr_type')[:len(xds.pol)]

        # truncate per-chan values to the actual number of channels and move to coordinates
        chan_width = xarray.DataArray(da.from_array(attrs.pop('chan_width')[:nchan], chunks=chunks[2]), dims=['chan'])
        effective_bw = xarray.DataArray(da.from_array(attrs.pop('effective_bw')[:nchan], chunks=chunks[2]), dims=['chan'])
        resolution = xarray.DataArray(da.from_array(attrs.pop('resolution')[:nchan], chunks=chunks[2]), dims=['chan'])

        coords = {
            'chan': chan,
            'pol': pol,
            'spw_id': [spw_id],
            'pol_id': [pol_id],
            'chan_width': chan_width,
            'effective_bw': effective_bw,
            'resolution': resolution
        }
        xds = xds.assign_coords(coords).assign_attrs(attrs)
        xds_list += [('xds' + str(ddi), xds)]

    # read other subtables
    skip_tables = ['DATA_DESCRIPTION', 'SORTED_TABLE'] + ignore
    subtables = sorted([tt for tt in os.listdir(infile) if os.path.isdir(os.path.join(infile, tt)) and tt not in skip_tables])
    if 'global' in ddis:
        for subtable in subtables:
            if subtable == 'POINTING':  # expand the dimensions of the pointing table
                # tuple() lets callers pass chunks as a list without breaking the concatenation
                sxds = tblconv.read_pointing_table(os.path.join(infile, subtable), chunks=tuple(chunks[:2]) + (20, 20))
            else:
                add_row_id = (subtable in ['ANTENNA', 'FIELD', 'OBSERVATION', 'SCAN', 'SPECTRAL_WINDOW', 'STATE'])
                sxds = tblconv.read_simple_table(infile, subtable=subtable, timecols=['TIME'], ignore=ignorecols, add_row_id=add_row_id)

            # empty subtables are left out of the returned dataset
            if len(sxds.dims) != 0:
                xds_list += [(subtable, sxds)]

    # build the master xds to return
    mxds = xdsio.vis_xds_packager(xds_list)

    return mxds
def read_vis(
    infile,
    partition=None,
    chunks=None,
    consolidated=True,
    overwrite_encoded_chunks=True,
    **kwargs,
):
    """
    Read zarr format Visibility data from disk to xarray Dataset

    Parameters
    ----------
    infile : str
        input Visibility filename, either a local path or an S3 URL of the form ``s3://bucket/path/to/name.vis.zarr``
    partition : string or list
        name of partition(s) to read as returned by describe_vis. Multiple partitions in list form will return a master dataset of datasets.
        Use 'global' for global metadata. Default None returns everything. The caller's list is never modified.
    chunks : dict
        sets specified chunk size per dimension. Dict is in the form of 'dim':chunk_size, for example {'time':100, 'baseline':400, 'chan':32, 'pol':1}.
        Default None uses the original zarr chunking.
    consolidated : bool
        use zarr consolidated metadata capability. Only works for stores that have already been consolidated. Default True works with datasets
        produced by convert_ms which automatically consolidates metadata.
    overwrite_encoded_chunks : bool
        drop the zarr chunks encoded for each variable when a dataset is loaded with specified chunk sizes.  Default True, only applies when
        chunks is not None.
    s3_key : string, optional
        optional support for explicit authentication if infile is provided as S3 URL. Passed through ``**kwargs``. Both s3_key and s3_secret
        must be given for authenticated access; otherwise only publicly-available, read-only buckets are accessible
        (so output dataset will be read-only).
    s3_secret : string, optional
        optional support for explicit authentication if infile is provided as S3 URL. Passed through ``**kwargs``. Both s3_key and s3_secret
        must be given for authenticated access; otherwise only publicly-available, read-only buckets are accessible
        (so output dataset will be read-only).

    Returns
    -------
    xarray.core.dataset.Dataset
        New xarray Dataset of Visibility data contents

    Raises
    ------
    FileNotFoundError
        If infile is an S3 URL that does not resolve to a directory.
    """
    import os
    import numpy as np
    import cngi._utils._io as xdsio
    from xarray import open_zarr

    if chunks is None:
        chunks = "auto"
        overwrite_encoded_chunks = False

    def _open(store):
        # single place for the open_zarr options shared by the S3 and local code paths
        return open_zarr(
            store,
            chunks=chunks,
            consolidated=consolidated,
            overwrite_encoded_chunks=overwrite_encoded_chunks,
        )

    # Normalize the request to a fresh list of plain strings (or None for "everything") so that
    # str / list / ndarray inputs all follow the same path and the caller's object is never mutated.
    if partition is None:
        requested = None
    else:
        requested = [str(part) for part in np.atleast_1d(partition)]

    # list of (name, dataset) pairs, the form vis_xds_packager is given everywhere else in this file
    xds_list = []

    # Require the URL scheme separator so that local paths that merely begin with "s3" stay local.
    if infile.lower().startswith(("s3://", "s3a://")):
        # for treating AWS object storage as a "file system"
        import s3fs

        # both credentials must be present; testing only one of them leads to a KeyError below
        if "s3_key" in kwargs and "s3_secret" in kwargs:
            # plaintext authentication is a security hazard that must be patched ASAP
            # boto3 can be used instead, see https://s3fs.readthedocs.io/en/latest/#credentials
            # if we instead choose to extend the current solution, might want to sanitize inputs
            s3 = s3fs.S3FileSystem(
                anon=False,
                requester_pays=False,
                key=kwargs["s3_key"],
                secret=kwargs["s3_secret"],
            )
        else:
            # only publicly-available, read-only buckets will work
            s3 = s3fs.S3FileSystem(anon=True, requester_pays=False)

        # expect a path style URI to file link, e.g.,
        # 's3://cngi-prototype-test-data/2017.1.00271.S/member.uid___A001_X1273_X2e3_split_cal_concat_target_regrid.vis.zarr/xds0/'
        # (strip the scheme and any trailing slashes)
        s3_url = infile.split(sep="//", maxsplit=1)[1].rstrip("/")

        if not s3.isdir(s3_url):
            # nothing can be read; fail with a clear message instead of an unbound-variable error later
            raise FileNotFoundError(
                f"S3 path {s3_url} is not a directory or is not accessible")

        object_names = [
            entry["name"].split("/")[-1] for entry in s3.listdir(s3_url)
        ]
        if all(dim in object_names for dim in ("time", "baseline", "chan", "pol")):
            # looks like the input URI was one level too deep (it points at a single partition),
            # so read that partition from the parent directory
            leaf = s3_url.split("/")[-1]
            if requested is None:
                requested = [leaf]
            elif requested != [leaf]:
                print(
                    "Input to partition keyword argument does not match provided S3 URI"
                )
                requested = [leaf]
                print(f"Assigning partition = {leaf}")
            s3_url = "/".join(s3_url.split("/")[:-1])

        # at this point, s3_url should be compatible and reference top level of a mxds
        if requested is None:
            # hidden objects (.version, .zattrs, ...) are filtered by name rather than by position
            requested = [
                name for name in (entry["name"].split("/")[-1]
                                  for entry in s3.listdir(s3_url))
                if not name.startswith(".")
            ]

        if "global" in requested:
            # attempt to replicate behavior of os.listdir (i.e., ignore .zattrs etc.)
            global_dirs = sorted(
                "global/" + entry["name"].split("/")[-1]
                for entry in s3.listdir("/".join([s3_url, "global"]))
                if entry["StorageClass"] == "DIRECTORY")
            requested = [part for part in requested if part != "global"] + global_dirs

        # now ready to read
        for part in requested:
            uri = "/".join([s3_url, part])
            if s3.isdir(uri):
                store = s3fs.S3Map(root=uri, s3=s3, check=False)
                # global subtables are stored under their bare name (without the "global/" prefix)
                xds_list.append((part.replace("global/", ""), _open(store)))
            else:
                print(f"Requested partition {part} not found in dataset")

    else:  # the non-s3 case, access data via local filesystem
        infile = os.path.expanduser(infile)
        if requested is None:
            requested = os.listdir(infile)

        global_path = os.path.join(infile, "global")
        if "global" in requested and os.path.isdir(global_path):
            requested = requested + sorted(
                "global/" + name for name in os.listdir(global_path))

        for part in requested:
            if part == "global":
                # the global group itself holds no data, only its subdirectories do
                continue
            part_path = os.path.join(infile, part)
            # non-directories (.zattrs, .zmetadata, ...) are silently skipped
            if not os.path.isdir(part_path):
                continue
            try:
                xds_list.append((part.replace("global/", ""), _open(part_path)))
            except Exception:
                # best effort: report the unreadable partition and carry on with the rest
                print('Can not open ', part)

    # build the master xds to return
    xds = xdsio.vis_xds_packager(xds_list)
    return xds