def aggregate_netcdf_group(job_dir, output_dir, files, group_name, request_id=None):
    """Aggregate a group of netCDF files into one or more output files.

    Files are processed in sorted order and batched by on-disk size; whenever
    the running byte total would exceed MAX_AGGREGATION_SIZE, the batch
    collected so far is written out via concatenate_and_write and a new batch
    is started with the current file.

    :param job_dir: directory containing the input netCDF files
    :param output_dir: directory to write aggregated output into
    :param files: iterable of file names (relative to job_dir) to aggregate
    :param group_name: name used for the aggregated output group
    :param request_id: optional identifier propagated to helpers for logging
    """
    # Scan every input once up front so each dataset can be reshaped to a
    # common set of variable shapes/dtypes before concatenation.
    parameters = analyze_datasets(job_dir, files, request_id=request_id)

    batch = []
    batch_bytes = 0
    for fname in sorted(files):
        path = os.path.join(job_dir, fname)
        file_bytes = os.stat(path).st_size
        batch_bytes += file_bytes
        if batch_bytes > MAX_AGGREGATION_SIZE:
            # Flush the previously collected batch (which does NOT yet include
            # the current file), then start a fresh batch counting this file.
            concatenate_and_write(batch, output_dir, group_name, request_id=request_id)
            batch = []
            batch_bytes = file_bytes
        with xr.open_dataset(path, decode_times=False, mask_and_scale=False,
                             decode_coords=False) as ds:
            # Pull data fully into memory so the dataset stays usable after
            # the underlying file handle is closed by the context manager.
            ds.load()
            shape_up(ds, parameters, request_id=request_id)
            batch.append(ds)

    # Write out whatever remains after the loop.
    if batch:
        concatenate_and_write(batch, output_dir, group_name, request_id=request_id)
def analyze_datasets(dir_path, files, request_id=None):
    """Collect per-variable metadata across a set of netCDF files.

    For every data variable found in any of the files, record its shape,
    dtype, dimension names (all with the leading obs dimension dropped) and
    fill value. When a variable appears in more than one file, shapes, dtypes
    and dimension names are merged via the max_* helpers; the fill value is
    taken from the most recently visited file.

    :param dir_path: directory containing the input files
    :param files: iterable of file names (relative to dir_path)
    :param request_id: accepted for interface symmetry; not used here
    :return: dict mapping variable name -> {'shape', 'dtype', 'dims', 'fill'}
    """
    merged = {}
    for fname in files:
        full_path = os.path.join(dir_path, fname)
        with xr.open_dataset(full_path, decode_times=False, mask_and_scale=False,
                             decode_coords=False) as ds:
            for name in ds.data_vars:
                var = ds[name]
                # Drop the leading obs dimension from both shape and dims.
                shape = var.shape[1:]
                dims = var.dims[1:]
                dtype = var.dtype
                # Byte-string variables fall back to the generic string fill;
                # everything else looks up its fill by dtype name.
                if dtype.kind == 'S':
                    default_fill = FILL_VALUES.get('string')
                else:
                    default_fill = FILL_VALUES.get(dtype.name)
                fill = var.attrs.get('_FillValue', default_fill)

                prior = merged.get(name)
                if prior is None:
                    merged[name] = {
                        'shape': shape,
                        'dtype': dtype,
                        'dims': dims,
                        'fill': fill
                    }
                else:
                    merged[name] = {
                        'shape': max_shape(shape, prior['shape']),
                        'dtype': max_dtype(dtype, prior['dtype']),
                        'dims': max_dimension_names(dims, prior['dims']),
                        'fill': fill
                    }
    return merged
def get_nc_info(file_name):
    """Return summary metadata for a single netCDF file.

    :param file_name: path to the netCDF file to inspect
    :return: dict with:
        - 'size': number of observations (length of the obs dimension)
        - one entry per attribute in ATTRIBUTE_CARRYOVER_MAP that is present
          in the file's global attributes
        - 'file_start_time': the first value of the time coordinate
    """
    with xr.open_dataset(file_name, decode_times=False, mask_and_scale=False,
                         decode_coords=False) as ds:
        ret_val = {
            'size': ds.obs.size,
        }
        for attr in ATTRIBUTE_CARRYOVER_MAP:
            if attr in ds.attrs:
                ret_val[attr] = ds.attrs[attr]
        # BUG FIX: this previously read ds.time.values[-1], i.e. the LAST
        # timestamp in the file, despite the key being 'file_start_time'.
        # The file's start time is the first time value.
        ret_val['file_start_time'] = ds.time.values[0]
    return ret_val