def test_parse_size():
    """Check that parse_size converts size strings to exact byte counts."""
    cases = {
        "1000000.0b": 1000000,
        "1MiB": 1048576,
        "1.0MB": 1000000,
        "0.001Gb": 1000000,
        "500Mb": 500000000,
    }
    for text, expected in cases.items():
        assert parse_size(text) == expected
def test_parse_size():
    """Verify parse_size on decimal (MB/Gb) and binary (MiB) unit suffixes."""
    assert parse_size("1000000.0b") == 1000000
    assert parse_size("1MiB") == 1048576
    assert parse_size("1.0MB") == 1000000
    assert parse_size("0.001Gb") == 1000000
    assert parse_size("500Mb") == 500000000
def get_chunk_length(da):
    """
    Calculate the chunk length (number of time steps per chunk) to use when
    chunking an xarray object along its time axis.

    The number of chunks is derived from the in-memory size of ``da`` and the
    memory limit held in the module-level ``chunk_memory_limit`` setting.

    :param da: xarray DataArray with a ``time`` coordinate.
    :return: int number of time steps per chunk (at least 1 for non-empty data).
    """
    size = da.nbytes
    n_times = len(da.time.values)
    # chunk_memory_limit is a "<number><units>" string, e.g. "250MiB";
    # parse_size converts it to a byte count.
    mem_limit = parse_size(chunk_memory_limit)
    if size > 0:
        n_chunks = math.ceil(size / mem_limit)
    else:
        # Zero-size array: fall back to a single chunk to avoid a zero divisor.
        n_chunks = 1
    chunk_length = math.ceil(n_times / n_chunks)
    return chunk_length
def get_chunk_length(da):
    """
    Calculate the chunk length to use when chunking xarray datasets.

    Based on the memory limit provided in config and the size of the dataset.

    :param da: xarray DataArray with a ``time`` coordinate.
    :return: int number of time steps per chunk (at least 1 for non-empty data).
    """
    size = da.nbytes
    n_times = len(da.time.values)
    # chunk_memory_limit is a "<number><units>" string; parse_size converts
    # it to a byte count.
    mem_limit = parse_size(chunk_memory_limit)
    if size > 0:
        n_chunks = math.ceil(size / mem_limit)
    else:
        # Zero-size array: fall back to a single chunk to avoid a zero divisor.
        n_chunks = 1
    chunk_length = math.ceil(n_times / n_chunks)
    return chunk_length
def get_time_slices(ds, split_method, start=None, end=None, file_size_limit=None):
    """
    Take an xarray Dataset or DataArray, assume it can be split on the time
    axis into a sequence of slices.

    Optionally, take a start and end date to specify a sub-slice of the main
    time axis.

    Use the prescribed file size limit to generate a list of
    ("YYYY-MM-DD", "YYYY-MM-DD") slices so that the output files do not
    (significantly) exceed the file size limit.

    :param ds: xarray Dataset or DataArray
    :param split_method: split strategy; only "time:auto" is supported
    :param start: optional start date bounding the time axis
    :param end: optional end date bounding the time axis
    :param file_size_limit: a string specifying "<number><units>"
    :return: list of tuples of date strings.
    """
    if split_method != "time:auto":
        # Fixed typo in the message: "implemeted" -> "implemented".
        raise NotImplementedError(
            f"The split method {split_method} is not implemented.")

    # Use default file size limit if not provided
    if not file_size_limit:
        file_size_limit = parse_size(
            CONFIG["clisops:write"]["file_size_limit"])

    da = get_da(ds)
    times = filter_times_within(da.time.values, start=start, end=end)
    n_times = len(times)

    if n_times == 0:
        # BUG FIX: the original message lacked the f-prefix, so the literal
        # text "{start}" / "{end}" was shown instead of the actual bounds.
        raise Exception(f"Zero time steps found between {start} and {end}.")

    # How many output files are needed so each stays under the size limit;
    # slice_length is the number of time steps that fit in one file.
    n_slices = da.nbytes / file_size_limit
    slice_length = int(n_times // n_slices)

    if slice_length == 0:
        raise Exception(
            "Unable to calculate slice length for splitting output files.")

    slices = []
    indx = 0
    final_indx = n_times - 1

    # Step through the time axis slice_length entries at a time, clamping the
    # final slice to the last available index.
    while indx <= final_indx:
        start_indx = indx
        indx += slice_length
        end_indx = indx - 1
        if end_indx > final_indx:
            end_indx = final_indx
        slices.append((f"{_format_time(times[start_indx])}",
                       f"{_format_time(times[end_indx])}"))

    return slices
def get_time_slices(
    ds: Union[xr.Dataset, xr.DataArray],
    split_method,
    start=None,
    end=None,
    file_size_limit: Union[str, None] = None,
) -> List[Tuple[str, str]]:
    """
    Take an xarray Dataset or DataArray, assume it can be split on the time
    axis into a sequence of slices. Optionally, take a start and end date to
    specify a sub-slice of the main time axis.

    Use the prescribed file size limit to generate a list of
    ("YYYY-MM-DD", "YYYY-MM-DD") slices so that the output files do not
    (significantly) exceed the file size limit.

    Parameters
    ----------
    ds: Union[xr.Dataset, xr.DataArray]
    split_method
        Split strategy; must be one of ``SUPPORTED_SPLIT_METHODS``.
    start
        Optional start date bounding the time axis.
    end
        Optional end date bounding the time axis.
    file_size_limit: str
        a string specifying "<number><units>". Defaults to the
        "clisops:write" config value when not given.

    Returns
    -------
    List[Tuple[str, str]]
        Pairs of formatted (start, end) time strings; ``[None]`` when ``ds``
        has no accessible ``time`` attribute.
    """
    if split_method not in SUPPORTED_SPLIT_METHODS:
        raise NotImplementedError(
            f"The split method {split_method} is not implemented.")

    # Use default file size limit if not provided
    if not file_size_limit:
        file_size_limit = parse_size(
            CONFIG["clisops:write"]["file_size_limit"])

    da = get_da(ds)
    slices = []

    try:
        times = filter_times_within(da.time.values, start=start, end=end)
    # catch where "time" attribute cannot be accessed in ds
    except AttributeError:
        slices.append(None)
        return slices

    n_times = len(times)

    if n_times == 0:
        raise Exception(f"Zero time steps found between {start} and {end}.")

    # How many output files are needed so each stays under the size limit;
    # slice_length is the number of time steps that fit in one file.
    n_slices = da.nbytes / file_size_limit
    slice_length = int(n_times // n_slices)

    if slice_length == 0:
        raise Exception(
            "Unable to calculate slice length for splitting output files.")

    indx = 0
    final_indx = n_times - 1

    # Step through the time axis slice_length entries at a time, clamping the
    # final slice to the last available index.
    while indx <= final_indx:
        start_indx = indx
        indx += slice_length
        end_indx = indx - 1
        if end_indx > final_indx:
            end_indx = final_indx
        slices.append((f"{_format_time(times[start_indx])}",
                       f"{_format_time(times[end_indx])}"))

    return slices