def nca_equivalence(ncfile1, ncfile2, variable='ta'):
    """Do these two files describe the same content?"""
    # Let's start by comparing a few important things
    x = s3Dataset(ncfile1)
    y = s3Dataset(ncfile2)
    # First let's just check a data record
    xx = x.variables[variable]
    yy = y.variables[variable]
    assert xx.shape == yy.shape, "CFA data arrays are not the same shape"
    assert len(xx.shape) == 4, "Unexpected variable shape for comparison"
    xx = xx[:, 0, 0, 0].flatten()
    yy = yy[:, 0, 0, 0].flatten()
    # We don't compare all of the data because it would take a long time
    assert (xx == yy).all(), "Data in arrays does not match"
    x.close()
    y.close()
    # now check the file headers
    raise NotImplementedError(
        "This doesn't mean the test has failed, just that the test code is "
        "not finished"
    )
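# A minimal usage sketch for nca_equivalence, assuming a CFA master-array
# file "test.nca" and a plain netCDF copy "test.nc" that both hold a 4D "ta"
# variable (the file names here are hypothetical).
def nca_equivalence_example():
    try:
        nca_equivalence("test.nca", "test.nc", variable="ta")
    except NotImplementedError:
        # the header comparison is not written yet, so reaching this point
        # means the data comparison above has already passed
        print("Data records match; header check not yet implemented")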
def create_partitions_from_files(out_dataset, files, axis, cfa_version,
                                 common_date):
    """Create the CFA partitions from a list of files."""
    # loop over the files and open each one as a regular netCDF4 Dataset
    for fname in files:
        in_dataset = s3Dataset(fname, "r")
        # get the global metadata
        in_dataset_attrs = {
            x: in_dataset.getncattr(x) for x in in_dataset.ncattrs()
        }
        # add the attributes to the s3Dataset by updating the dictionary
        out_dataset._cfa_dataset.metadata.update(in_dataset_attrs)
        # loop over the groups
        for grp in in_dataset.groups:
            in_group = in_dataset[grp]
            # create a group if one with this name does not exist
            if grp not in out_dataset.groups:
                out_group = out_dataset.createGroup(grp)
            else:
                out_group = out_dataset.groups[grp]
            # update the metadata
            in_group_attrs = {
                x: in_group.getncattr(x) for x in in_group.ncattrs()
            }
            out_group._cfa_grp.metadata.update(in_group_attrs)
            add_var_dims(in_group, out_group, axis, fname, common_date)
        # add the variables in the root group
        add_var_dims(in_dataset, out_dataset, axis, fname, common_date)
        in_dataset.close()
def test_s3Dataset_read(path_stub, format="NETCDF4", cfa_version=None):
    """Test reading in an s3Dataset, for one of the various permutations of:
        1. file format (netCDF3 or netCDF4)
        2. whether it is a S3-netCDF / CFA file or a plain netCDF file
        3. the CFA version (0.4 or 0.5)
    """
    file_name = get_file_path(path_stub, format, cfa_version)
    if DEBUG:
        print("Test reading {}".format(file_name))
    # open the dataset
    dr = s3Dataset(file_name, mode='r')
    if DEBUG:
        print(dr.groups)
    if format == "NETCDF4" or format == "CFA4":
        grp = dr.groups["test_group"]
    else:
        grp = dr
    if DEBUG:
        print(grp.variables["tmp"])
        print(dr.variables["scl"])
    tmp_var = grp.variables["tmp"]
    x = tmp_var[:, 0, 0, 0]
    dr.close()
    return True
def read_s3nc_serial(self):
    """S3netCDF serial read method."""
    fp = os.path.join(self.config['path'], self.config['file'])
    total_bytes = 0
    for ir in range(self.config['repeats']):
        nc = s3Dataset(fp, 'r')
        var = nc.variables[self.config['var']]
        for i in range(var.shape[0]):
            logging.debug('Index: {}'.format(i))
            data = var[i, :, :, :]
            total_bytes += data.nbytes
        nc.close()
    return total_bytes
def read_s3nc_map(self):
    """S3netCDF map read method."""
    fp = os.path.join(self.config['path'], self.config['file'])
    total_bytes = 0
    for ir in range(self.config['repeats']):
        nc = s3Dataset(fp, 'r')
        var = nc.variables[self.config['var']]
        rx = randint(0, var.shape[0] - 1)
        ry = randint(0, var.shape[1] - 1)
        logging.debug('Index: [{},{},:,:]'.format(rx, ry))
        data = var[rx, ry, :, :]
        total_bytes += data.nbytes
        nc.close()
    return total_bytes
def read_s3nc_timeseries(self):
    """S3netCDF time-series read method."""
    fp = os.path.join(self.config['path'], self.config['file'])
    total_bytes = 0
    for ir in range(self.config['repeats']):
        nc = s3Dataset(fp, 'r')
        var = nc.variables[self.config['var']]
        rx = randint(0, var.shape[1] - 1)
        ry = randint(0, var.shape[2] - 1)
        logging.debug('Index: [:,{},{},:]'.format(rx, ry))
        data = var[:, rx, ry, 0]
        total_bytes += data.nbytes
        logging.debug('shape of results = {}'.format(data.shape))
        nc.close()
    return total_bytes
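# A minimal sketch of the config dictionary that the three read methods above
# expect; the keys ('path', 'file', 'var', 'repeats') come from the code, the
# values are hypothetical.
example_config = {
    'path': '/data/benchmarks',   # directory (or URL prefix) holding the file
    'file': 'ta_benchmark.nca',   # CFA master-array file to read from
    'var': 'ta',                  # name of the variable to read
    'repeats': 3,                 # how many times to repeat each read
}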
def test_s3Dataset_write(path_stub, format="NETCDF4", cfa_version="0.4",
                         resolution_degrees=1.5):
    """Test writing out an s3Dataset, for one of the various permutations of:
        1. file format (netCDF3 or netCDF4)
        2. whether it is a S3-netCDF / CFA file or a plain netCDF file
        3. the CFA version (0.4 or 0.5)
    """
    # build a file name from the path stub, the format and the cfa_version
    # don't use os.path.join as it doesn't handle URLs and paths
    file_name = get_file_path(path_stub, format, cfa_version)
    if DEBUG:
        print("Test writing {}".format(file_name))
    # open the dataset
    ds = s3Dataset(file_name, format=format, mode='w',
                   cfa_version=cfa_version, diskless=False, persist=False)
    # construct the shape
    shape = [365, 1,
             180.0 / resolution_degrees + 1,
             360.0 / resolution_degrees]
    # create the data inside the dataset
    create_test_dataset(ds, format, cfa_version, shape)
    if DEBUG:
        print(ds.groups["test_group"].variables["tmp"])
        print(ds.variables["scl"])
    if format == "CFA4" or format == "NETCDF4":
        tmp_var = ds.groups["test_group"].variables["tmp"]
    else:
        tmp_var = ds.variables["tmp"]
    tmp_var[:, :, :, :] = 250.0
    vel_var = ds.variables["velocity"]
    vel_var[0] = 10.0
    ds.close()
    return True
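# A sketch of driving test_s3Dataset_write over a few of the permutations its
# docstring describes; "/tmp/s3nc_test" is a hypothetical local path stub and
# the format / CFA-version pairs shown are assumed to be valid combinations.
def run_write_permutations_example(path_stub="/tmp/s3nc_test"):
    for fmt, cfa in [("NETCDF4", "0.4"),
                     ("CFA4", "0.4"),
                     ("CFA4", "0.5")]:
        assert test_s3Dataset_write(path_stub, format=fmt, cfa_version=cfa)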
def aggregate_into_CFA(output_master_array, path, axis, cfa_version,
                       common_date=None):
    """Aggregate the netCDF files in a directory into a CFA master-array
    file."""
    # get the list of files first of all
    files = get_file_list(path)
    # create the output master array file as an s3Dataset
    out_dataset = s3Dataset(
        output_master_array,
        mode='w',
        clobber=True,
        diskless=False,
        cfa_version=cfa_version
    )
    # create the partitions from the list - these will be created in the
    # order that the files are read in
    create_partitions_from_files(out_dataset, files, axis, cfa_version,
                                 common_date)
    # we need to sort the partition matrices for each variable - i.e. there
    # is one matrix per variable
    sort_partition_matrices(out_dataset, axis)
    # close the dataset to write / upload it
    out_dataset.close()
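# A usage sketch for aggregate_into_CFA, assuming a directory of per-year
# netCDF files that should be joined along their "time" axis; the paths and
# the axis name are hypothetical.
def aggregate_example():
    aggregate_into_CFA(
        "/data/aggregated/ta_all_years.nca",  # output CFA master-array file
        "/data/yearly/",                      # directory of input files
        "time",                               # axis to aggregate along
        "0.5"                                 # CFA version to write
    )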
def split_into_CFA(output_path,
                   input_path,
                   subarray_path="",
                   subarray_shape=[],
                   subarray_size=50 * 1024 * 1024,
                   cfa_version="0.5"):
    """Split a netCDF file into a number of subarray files and write the CFA
    master array file."""
    # if the subarray path is empty then derive it from the output_path
    if subarray_path == "":
        if ".nca" in output_path:
            subarray_path = output_path[:-4]
        elif ".nc" in output_path:
            subarray_path = output_path[:-3]
        else:
            subarray_path = output_path
            output_path += ".nca"
    # open the input file
    nc_ds = Dataset(input_path, 'r')
    # get the output format for the new Dataset:
    # if the input is netCDF4 then the output is CFA4
    # if the input is netCDF3 then the output is CFA3
    if nc_ds.file_format in ['NETCDF4', 'NETCDF4_CLASSIC']:
        s3_file_format = "CFA4"
    elif nc_ds.file_format == "NETCDF3_CLASSIC":
        s3_file_format = "CFA3"
    else:
        raise CFAError("Cannot split file with format: {}".format(
            nc_ds.file_format))
    # open the output file - copy the input from the input file to the output
    # file(s), whilst using the subarray settings to chunk the data
    s3_ds = s3Dataset(output_path, 'w', format=s3_file_format,
                      cfa_version=cfa_version)
    # we now want to copy the information from the original dataset
    # netCDF files have:
    #   global metadata
    #   global dimensions
    #   global variables
    #     each variable has metadata and field data
    #   global groups
    #     each group has metadata, dimensions and variables
    #       each variable has metadata and field data

    # global metadata
    nc_md_keys = nc_ds.ncattrs()
    for k in nc_md_keys:
        s3_ds.setncattr(k, nc_ds.getncattr(k))
    # global dimensions
    copy_dims(nc_ds, s3_ds)
    # global variables
    copy_vars(nc_ds, s3_ds, subarray_size, subarray_shape)
    # now do the groups
    for grp in nc_ds.groups:
        nc_grp = nc_ds.groups[grp]
        # create the corresponding group in the s3 dataset
        s3_grp = s3_ds.createGroup(nc_grp.name)
        # copy group metadata
        nc_md_keys = nc_grp.ncattrs()
        for k in nc_md_keys:
            s3_grp.setncattr(k, nc_grp.getncattr(k))
        # copy group dimensions
        copy_dims(nc_grp, s3_grp)
        # copy group variables
        copy_vars(nc_grp, s3_grp, subarray_size, subarray_shape)
    # close the s3Dataset - super important as everything gets written on
    # close
    s3_ds.close()
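# A usage sketch for split_into_CFA, assuming an existing netCDF4 input file;
# the paths and the 4 MiB subarray size are hypothetical choices.
def split_example():
    split_into_CFA(
        "/data/split/ta_master.nca",    # CFA master-array file to write
        "/data/original/ta.nc",         # netCDF file to split into subarrays
        subarray_size=4 * 1024 * 1024,  # target size of each subarray file
        cfa_version="0.5"
    )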
if args.variable:
    variable = args.variable
else:
    variable = "all"

if args.partition:
    # convert the partition string to a numpy array
    partition = args.partition
else:
    partition = "none"

if args.metadata:
    metadata = True
else:
    metadata = False

if input_file:
    # Get the input file.
    path = os.path.expanduser(input_file)
    input_dataset = s3Dataset(path, mode='r')
    # Print the global dataset information
    print_dataset_info(
        input_dataset, group, variable, partition, metadata
    )
#else:
    metavar="<variable>",
    help=("Name of a variable to change the file prefix for, or change "
          "all variables. --variable=all|<variable_name>"))
parser.add_argument(
    "--partition",
    action="store",
    default="all",
    metavar="<partition>",
    help=("Choose the partition to change the file location prefix for. "
          "--partition=all|<partition_index>"))
parser.add_argument(
    "--prefix",
    action="store",
    default="none",
    required=True,
    metavar="<prefix>",
    help=("New file location prefix"))
args = parser.parse_args()

# get the input file
input_path = os.path.expanduser(args.input)

# open the input dataset in append mode
input_dataset = s3Dataset(input_path, mode='a')

# update the prefix in the partitions
update_file_in_partitions(input_dataset, args.prefix, args.group,
                          args.variable, args.partition)

# close the file to save the changes
input_dataset.close()