def get_month(path):
    pattern = r"\d{4}-\d{2}"
    s = re.search(pattern, path)
    if not s:
        con_message("error", f"Unable to find month string for {path}")
        sys.exit(1)
    # the match is "YYYY-MM", so the month is the last two characters of the matched span
    return int(path[s.start() + 5:s.start() + 7])
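# con_message() is called throughout these excerpts but is not defined here.
# A minimal sketch of one possible implementation, assuming it simply routes a
# severity name ("debug", "info", "warning", "error") to the standard logging
# module; the real helper used by these scripts may differ.
import logging

logging.basicConfig(format="%(asctime)s [%(levelname)s] %(message)s",
                    level=logging.DEBUG)

def con_message(level: str, message: str):
    """Log a console message at the named severity level."""
    getattr(logging, level, logging.info)(message)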
def main():
    parsed_args = parse_args()
    con_message("info", "Begin fix_mapfile_paths")
    mapfile_path = Path(parsed_args.mapfile_path)
    ware_base = parsed_args.warehouse_base
    pub_base = parsed_args.pub_base
    ware_version = "/" + parsed_args.warehouse_version + "/"
    pub_version = "/" + parsed_args.pub_version + "/"

    tempfile = NamedTemporaryFile(mode="w", delete=False, dir=str(mapfile_path.parent))
    with open(mapfile_path, "r") as instream:
        for line in instream.readlines():
            items = line.split("|")
            items[1] = items[1].replace(ware_base, pub_base, 1)
            items[1] = items[1].replace(ware_version, pub_version, 1)
            line = "|".join(items)
            tempfile.write(line)
    # close the temp file so buffered lines are flushed before the rename
    tempfile.close()

    mapfile_temp = str(mapfile_path.resolve())
    mapfile_path.unlink()
    move(tempfile.name, mapfile_temp)
    mapfile_temp = Path(mapfile_temp)
    mapfile_temp.chmod(0o664)

    con_message("info", f"Completed fix_mapfile_paths, mapfile={mapfile_temp}")
    return 0
def check_file(file, freq, idx, time_name="time"):
    """
    Step through the file checking that each step in time is exactly how long
    it should be, and that the time index is monotonically increasing
    """
    prevtime = None
    first, last = None, None
    with xr.open_dataset(file, decode_times=False) as ds:
        if len(ds[time_name]) == 0:
            return None, None, idx
        for step in ds[time_name]:
            time = step.values.item()
            if prevtime is None:  # compare against None so a 0.0 time value isn't treated as "unset"
                prevtime = time
                first = time
                continue
            delta = time - prevtime
            if delta == 0:
                # monthly data
                return time, time, idx
            elif delta != freq:
                con_message(
                    "warning",
                    f"time discontinuity in {file} at {time}, delta was {delta} when it should have been {freq}",
                )
            prevtime = time
            last = time
    return first, last, idx
def main():
    parsed_args = parse_args()
    input_path = Path(parsed_args.input)
    if not input_path.exists() or not input_path.is_dir():
        con_message("error", "Input directory does not exist or is not a directory")
        return 1

    futures = []
    pool = ProcessPoolExecutor(max_workers=parsed_args.processes)
    for path in input_path.glob("*.nc"):
        realpath = str(path.resolve())
        futures.append(pool.submit(check_file, realpath))

    error = False
    try:
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            if result == 1:
                error = True
    except KeyboardInterrupt:
        for future in futures:
            future.cancel()

    if error:
        return 1
    return 0
def fix_units(inpath, outpath, time_units, offset):
    import xarray as xr

    with xr.open_dataset(inpath, decode_times=False) as ds:
        if ds.get("time") is None:
            con_message("error", f"{os.path.basename(inpath)} has no 'time' axis")
            sys.exit(1)
        bnds_name = "time_bnds" if ds.get("time_bnds") is not None else "time_bounds"
        if ds["time"].attrs.get("units") != time_units:
            ds = ds.assign_coords(time=ds["time"] + offset)
            if bnds_name == "time_bnds":
                ds = ds.assign_coords(time_bnds=ds[bnds_name] + offset)
            else:
                ds = ds.assign_coords(time_bounds=ds[bnds_name] + offset)
            if ds[bnds_name].values[0][0] == ds[bnds_name].values[0][1]:
                # the first bound has zero width; rebuild it from the width of the second bound
                freq = ds[bnds_name].values[1][1] - ds[bnds_name].values[1][0]
                ds[bnds_name].values[0][0] -= freq
            ds["time"].attrs = {
                "long_name": "time",
                "units": time_units,
                "calendar": "noleap",
                "bounds": bnds_name,
            }
            # use bnds_name here as well, so files whose bounds variable is named
            # "time_bounds" don't raise a KeyError
            ds[bnds_name].attrs = {"long_name": "time interval endpoints"}
            ds.to_netcdf(outpath, unlimited_dims=["time"])
        else:
            os.symlink(inpath, outpath)
def filter_files(file_info):
    to_remove = []
    for combo in combinations(file_info, 2):
        if (
            combo[0]["start"] == combo[1]["start"]
            and combo[0]["end"] == combo[1]["end"]
        ):
            con_message("debug", f"{combo[0]['name']} == {combo[1]['name']}")
            _, n1 = os.path.split(combo[0]["name"])
            _, n2 = os.path.split(combo[1]["name"])
            # identical coverage: keep the file whose name has the larger leading date stamp
            if int(n1[:8]) < int(n2[:8]):
                to_remove.append(combo[0])
            else:
                to_remove.append(combo[1])
        elif combo[0]["start"] == combo[1]["start"]:
            # same start: keep the file that covers more time
            if combo[0]["end"] < combo[1]["end"]:
                to_remove.append(combo[0])
            else:
                to_remove.append(combo[1])

    for i1 in to_remove:
        for idx, i2 in enumerate(file_info):
            if i1 == i2:
                f = file_info.pop(idx)
                con_message("debug", f"removing {f['name']} from file list")
                break
def validate_args(args):
    """
    Ensure the src mapfile path exists
    """
    src_path = Path(args.src_path)
    if not src_path.exists():
        con_message("error", "Source mapfile does not exist")
        return False
    return True
def loadFileLines(filepath: Path):
    retlist = []
    if not filepath.exists():
        con_message(
            "error", f"Cannot load lines from file {filepath} as it does not exist")
        sys.exit(1)
    with open(filepath.resolve(), "r") as instream:
        # keep only the file name from the second pipe-delimited field of each mapfile line
        retlist = [Path(x.split("|")[1]).name for x in instream.readlines()]
    return retlist
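# For reference, each mapfile line has the pipe-delimited form written by the
# mapfile generator later in this section; loadFileLines keeps only the file
# name from the second field. Illustrative example only: every value below is made up.
from pathlib import Path

example_line = ("E3SM.1_0.historical.atmos#20210101 | "
                "/data/warehouse/E3SM/file_1850-01.nc | 123456 | "
                "mod_time=1609459200.0 | checksum=abc123 | checksum_type=SHA256\n")
fields = [f.strip() for f in example_line.split("|")]
print(Path(fields[1]).name)   # file_1850-01.nc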
def check_file(path):
    # import ipdb;ipdb.set_trace()
    cmd = f"ncdump -h {path}".split()
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    out, err = proc.communicate()
    out = out.decode("utf-8")
    err = err.decode("utf-8")
    if not out or "NetCDF: HDF error" in err:
        con_message("error", f"Error loading {path}")
        return 1
    return 0
def conduct_move(args, move_method="none"):
    if move_method == "none":
        con_message(
            "error", "Move_to_Publication: Must set move_method to 'move' or to 'link'")
        return 1
    con_message("info", f"conduct_move: move_method = {move_method}")
    src_path = Path(args.src)
    dst_path = Path(args.dst)

    # move the mapfile first; if that fails, don't bother moving the files.
    # NOTE: This section should be removed once mapfiles are only generated in the final publication location.
    mapfile = next(src_path.parent.glob("*.map"))
    with open(mapfile, "r") as instream:
        # just the first line, to obtain the dataset_id
        dataset_id = instream.readline().split("|")[0].strip().split("#")[0]
    dst = Path(dst_path.parent, f"{dataset_id}.map")
    con_message("info", f"Moving the mapfile to {dst}")
    mapfile.replace(dst)

    message = f"mapfile_path={dst},pub_name={dst_path.name},ware_name={src_path.name}"
    if messages_path := os.environ.get("message_file"):
        with open(messages_path, "w") as outstream:
            outstream.write(message)
    con_message("info", f"{message}")
def main():
    parsed_args = parse_args()
    success = validate_mapfile(Path(parsed_args.mapfile),
                               Path(parsed_args.datapath),
                               parsed_args.quiet)
    if success:
        if not parsed_args.quiet:
            con_message("info", "Mapfile includes all files")
        return 0
    else:
        if not parsed_args.quiet:
            con_message("error", "Mapfile is missing one or more files")
        return 1
def main():
    parsed_args = parse_args()
    input_path = Path(parsed_args.input)
    dataset_id = parsed_args.dataset_id
    version_nm = parsed_args.version_number
    numberproc = parsed_args.processes
    quiet = parsed_args.quiet
    if not input_path.exists() or not input_path.is_dir():
        con_message("error", "Input directory does not exist or is not a directory")
        sys.exit(1)
    if outpath := parsed_args.outpath:
        outpath = Path(outpath)
def main():
    parsed_args = parse_args()
    dataset = Dataset(dataset_id=parsed_args.dataset_id, no_status_file=True)
    status = dataset.get_esgf_status()
    ''' Elided until timing issue is resolved '''
    '''
    if status not in [DatasetStatus.SUCCESS.value, DatasetStatus.PUBLISHED.value]:
        con_message("error", f"ESGF validation failed, dataset in state {status}")
        if missing := dataset.missing:
            pprint(missing)
        return 1
    else:
        con_message("info", "ESGF validation success")
        return 0
    '''
    con_message("info", "ESGF validation success (elision)")
    ''' elision '''
    return 0
def monotonic_check(path, idx, bndsname):
    _, name = os.path.split(path)
    with xr.open_dataset(path, decode_times=False) as ds:
        try:
            start_bound = ds[bndsname][0].values[0]
            end_bound = ds[bndsname][-1].values[-1]
        except IndexError:
            con_message("info", "printing index error")  # only to escape progress-bar prepend
            con_message("error", f"{name} doesn't have the expected time_bnds variable shape")
            return None, None, idx
        # start at -1.0 so that the 0th time step doesn't trigger the check
        l1, l2 = -1.0, -1.0
        for bounds in ds[bndsname]:
            b1, b2 = bounds.values
            if (l1 == -1.0 and b1 == 0.0) or (b1 == b2):
                # the daily files have a 0-width time step at the start
                continue
            if b1 > l1 and b2 > l2:
                l1 = b1
                l2 = b2
            else:
                con_message(
                    "error",
                    f"{name} has failed the monotonically-increasing time bounds check, {(b1, b2)} isn't greater than {(l1, l2)}",
                )
                return None, None, idx
    return start_bound, end_bound, idx
def main():
    parsed_args = parse_args()
    if not validate_args(parsed_args):
        sys.exit(1)
    src_path = Path(parsed_args.src)
    dst_path = Path(parsed_args.dst)
    if src_path == dst_path:
        con_message("info", "move_to_publication: move elided; src is dst")
        sys.exit(0)

    src_parent, _ = os.path.split(src_path)
    dst_parent, _ = os.path.split(dst_path)
    move_method = "move"
    if src_parent == dst_parent:
        move_method = "link"

    message = f"mapfile_path={next(src_path.parent.glob('*.map'))},pub_name={dst_path.name},ware_name={src_path.name}"
    if messages_path := os.environ.get("message_file"):
        with open(messages_path, "w") as outstream:
            outstream.write(message)
        con_message("info", message)
    else:
        con_message(
            "warning",
            f"cannot obtain message_file (from message_path) from environment for message {message}"
        )
def validate_mapfile(mapfile: Path, srcdir: Path, quiet: bool):
    """
    At this point, the srcdir should contain the datafiles (*.nc) and the
    parent dir/dsid.map, so we can do a name-by-name comparison.
    MUST test for each srcdir datafile in the mapfile listing.

    Params:
        mapfile (Path): the path to the mapfile
        srcdir (Path): a Path object pointing to the directory containing the data files
    Returns:
        True if the mapfile is valid, False otherwise
    """
    dataset_files = sorted([x.name for x in srcdir.glob("*.nc")])
    mapfile_lines = sorted(loadFileLines(mapfile))
    if not len(dataset_files) == len(mapfile_lines):
        con_message(
            "error", "Number of files does not match number of entries in the mapfile")
        sys.exit(1)
    # MUST assume both lists sort identically - the zipped O(n) comparison beats an O(n^2) search
    pairlist = list(zip(dataset_files, mapfile_lines))
    error = []
    # import ipdb; ipdb.set_trace()
    for file, mapentry in tqdm(pairlist, disable=quiet):
        if file not in mapentry:
            error.append(file)
    if error:
        for e in error:
            con_message("error", e)
        return False
    return True
def main():
    parsed_args = parse_args()
    import xarray as xr

    file_one = Path(parsed_args.file_one)
    file_two = Path(parsed_args.file_two)
    vars_to_check = parsed_args.var_list
    if not file_one.exists() or not file_two.exists():
        con_message("error", "One or more input files does not exist")
        return 1

    data1 = xr.open_dataset(str(file_one.resolve()), decode_times=False)
    data2 = xr.open_dataset(str(file_two.resolve()), decode_times=False)

    all_match = True
    dont_match = []
    for variable in tqdm(data1.data_vars):
        if (
            ("all" not in vars_to_check and variable not in vars_to_check)
            or "bnds" in variable
            or variable in parsed_args.exclude
        ):
            continue
        a = data1[variable].load().values
        b = data2[variable].load().values
        if not allclose(a, b, equal_nan=True):
            dont_match.append(f"Values do not match for {variable}")
            all_match = False

    data1.close()
    data2.close()

    if all_match:
        con_message("info", "All variables match")
        return 0

    con_message("warning", "Some variables do not match")
    for m in dont_match:
        con_message("debug", m)
    return 1
def validate_args(args):
    """
    Ensure the src path exists and is not empty.
    Ensure the dst path exists (creating it if needed) and is empty.
    """
    src_path = Path(args.src)
    dst_path = Path(args.dst)
    if not src_path.exists() or not src_path.is_dir():
        con_message("error", "Source directory does not exist or is not a directory")
        return False
    if not any(src_path.iterdir()):
        con_message("error", "Source directory is empty")
        return False
    if not dst_path.exists() or not dst_path.is_dir():
        dst_path.mkdir(parents=True, exist_ok=True)
    if any(dst_path.iterdir()):
        con_message("error", "Destination directory is not empty")
        return False
    return True
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f",
        "--facets",
        required=True,
        nargs="*",
        help="Space separated key/value pairs for facets to get updated, in the form key=value",
    )
    parser.add_argument("-c", "--cert", required=True, help="Path to ESGF cert")
    parser.add_argument(
        "-s",
        "--search",
        required=True,
        help="Search criteria, for example master_id=<dataset-id>, or experiment=1950-Control&model_version=1_0",
    )
    parser.add_argument(
        "--index-node",
        default=DEFAULT_INDEX_NODE,
        help=f"default={DEFAULT_INDEX_NODE}",
    )
    parser.add_argument(
        "--data-node",
        default=DEFAULT_DATA__NODE,
        help=f"default={DEFAULT_DATA__NODE}",
    )
    parser.add_argument(
        "-y", "--yes", action="store_true", help="skip the manual verification"
    )
    parser.add_argument(
        "--verbose", action="store_true", help="Print more verbose status messages"
    )
    args = parser.parse_args()

    cert_path = args.cert
    search = args.search
    index_node = args.index_node
    data_node = args.data_node
    verbose = args.verbose
    facets = {}
    for item in args.facets:
        key, value = item.split("=")
        facets[key] = value

    # url = f"https://{index_node}/esg-search/search/?offset=0&limit=10000&type=Dataset&format=application%2Fsolr%2Bjson&latest=true&{search}"
    url = f"https://{index_node}/esg-search/search/?offset=0&limit=10000&type=Dataset&format=application%2Fsolr%2Bjson&{search}"
    if verbose:
        con_message("info", url)
    res = requests.get(url)
    if not res.status_code == 200:
        con_message("error", f"query return code: {res.status_code}")
        return 1

    res = json.loads(res.text)
    docs = res["response"]["docs"]
    if len(docs) == 0:
        con_message("warning", f"Unable to find records matching search {search}")
        return 1

    con_message("info", "Found the following datasets:")
    for doc in docs:
        con_message("info", f"\t{doc['id']}")

    if not args.yes:
        response = input(
            f"Found {len(docs)} datasets, would you like to update them all? y/[n]"
        )
        if response.lower() != "y":
            con_message("warning", "User failed to answer 'y', exiting")
            return 1

    import warnings
    warnings.filterwarnings("ignore")

    client = publisherClient(cert_path, index_node)
    for doc in tqdm(docs):
        dataset_id = doc["id"]
        update_record = gen_xml(dataset_id, "datasets", facets)
        if verbose:
            con_message("info", update_record)
        client.update(update_record)

        update_record = gen_xml(dataset_id, "files", facets)
        if verbose:
            con_message("info", update_record)
        client.update(update_record)
    return 0
if not input_path.exists() or not input_path.is_dir():
    con_message("error", "Input directory does not exist or is not a directory")
    sys.exit(1)
if outpath := parsed_args.outpath:
    outpath = Path(outpath)
else:
    outpath = Path(f"{dataset_id}.map")
if not outpath.exists():
    outpath.touch(0o664)
else:
    outpath.chmod(0o664)

con_message("info", f"Generate_Mapfile: ({numberproc} processes) to {outpath}")

futures = []
pool = ProcessPoolExecutor(max_workers=numberproc)
for path in input_path.glob("*.nc"):
    futures.append(pool.submit(hash_file, path))

with open(outpath, "w") as outstream:
    try:
        for future in tqdm(as_completed(futures), total=len(futures), disable=quiet):
            filehash, pathstr = future.result()
            filestat = Path(pathstr).stat()
            line = f"{dataset_id}#{version_nm} | {pathstr} | {filestat.st_size} | mod_time={filestat.st_mtime} | checksum={filehash} | checksum_type=SHA256\n"
            outstream.write(line)
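# hash_file() is submitted to the process pool above but is not defined in this
# excerpt. A minimal sketch of what it might look like, assuming it returns a
# SHA-256 hex digest plus the resolved path string used to build the mapfile
# line; the original implementation may differ.
import hashlib
from pathlib import Path

def hash_file(path: Path, blocksize: int = 1024 * 1024):
    """Return (sha256_hexdigest, resolved_path_string) for the given file."""
    sha = hashlib.sha256()
    with open(path, "rb") as instream:
        for block in iter(lambda: instream.read(blocksize), b""):
            sha.update(block)
    return sha.hexdigest(), str(path.resolve())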
def main():
    parser = argparse.ArgumentParser(
        description="Check a directory of raw E3SM time-slice files for discontinuities in the time index"
    )
    parser.add_argument("input", help="Directory path containing dataset")
    parser.add_argument(
        "-j",
        "--jobs",
        default=8,
        type=int,
        help="the number of processes, default is 8",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        default=False,
        help="Disable progress-bar for batch/background processing",
    )
    args = parser.parse_args()
    inpath = args.input
    con_message("info", f"Running timechecker:dataset={inpath}")

    # collect all the files and sort them by their date stamp
    names = [
        os.path.join(os.path.abspath(inpath), x)
        for x in os.listdir(inpath)
        if x.endswith(".nc")
    ]
    pattern = r"\d{4}-\d{2}"
    fileinfo = []
    for name in names:
        # check the match object itself; a match at offset 0 is still a valid match
        match = re.search(pattern, name)
        if match is None:
            con_message(
                "error",
                f"The year stamp search pattern {pattern} didn't find what it was expecting",
            )
            sys.exit(1)
        start = match.start()
        fileinfo.append({
            "prefix": name[:start],
            "suffix": name[start:],
            "name": name
        })
    files = [x["name"] for x in sorted(fileinfo, key=lambda i: i["suffix"])]
    del fileinfo

    time_units, time_name = get_time_units(files[0])
    monthly = False
    freq = None
    # find the time frequency by checking the delta from the 0th to the 1st step
    # import ipdb; ipdb.set_trace()
    with xr.open_dataset(files[0], decode_times=False) as ds:
        freq = ds.attrs.get("time_period_freq")
        if freq == "month_1":
            monthly = True
            con_message("info", "Found monthly data")
            calendar = ds[time_name].attrs["calendar"]
            if calendar not in calendars:
                con_message("error", f"Unsupported calendar type {calendar}")
                sys.exit(1)
        elif freq is None:
            if ds.attrs.get("title") == "CLM History file information":
                monthly = True
                calendar = ds[time_name].attrs["calendar"]
                if calendar not in calendars:
                    con_message("error", f"Unsupported calendar type {calendar}")
                    sys.exit(1)
            else:
                con_message("info", "Found sub-monthly data")
                freq = ds[time_name][1].values.item() - ds[time_name][0].values.item()
                con_message("info", f"Detected frequency: {freq}, with units: {time_units}")

    # iterate over each of the files and get the first and last index from each file
    issues = list()
    indices = [None for _ in range(len(files))]
    with ProcessPoolExecutor(max_workers=args.jobs) as pool:
        futures = [
            pool.submit(check_file, file, freq, idx)
            for idx, file in enumerate(files)
        ]
        for future in tqdm(
            as_completed(futures),
            total=len(futures),
            desc="Checking time indices",
            disable=args.quiet,
        ):
            first, last, idx = future.result()
            indices[idx] = (first, last, idx)

    prev = None
    for first, last, idx in indices:
        if prev is None:  # explicit None check so a 0.0 end index from the first file isn't skipped
            prev = last
            continue
        if monthly:
            month = get_month(files[idx])
            target = prev + calendars[calendar][month]
        else:
            target = prev + freq
        if not first or not last:
            # this file had an empty index, move on and start checking the next one as though this one was there
            msg = f"Empty time index found in {os.path.basename(files[idx])}"
            issues.append(msg)
            prev = target
            continue
        if first != target:
            msg = f"index issue file: {os.path.basename(files[idx])} has index {(first, last)} should be {(target, last)}, the start index is off by ({first - target}) {time_units.split(' ')[0]}."
            issues.append(msg)
        prev = last

    if issues:
        issues.append(f"Result=Fail:dataset={inpath}")
        for msg in issues:
            con_message("warning", msg)
        return 1

    con_message("info", "No time index issues found.")
    con_message("info", f"Result=Pass:dataset={inpath}")
    return 0
def publish_dataset(args):
    """
    Checks that a dataset isn't already available on ESGF, and if it's not,
    uses the esgpublish utility to publish it.
    Returns 0 if successful, 1 otherwise
    """
    src_path = Path(args.src_path)
    optional_facets = None
    if args.optional_facets:
        optional_facets = {}
        for item in args.optional_facets:
            key, value = item.split("=")
            optional_facets[key] = value
    log_path = Path(args.log_path)
    if not log_path.exists():
        log_path.mkdir(parents=True, exist_ok=True)

    # get the dataset_id from the mapfile
    with open(src_path, "r") as instream:
        line = instream.readline()
    dataset_id = line.split("|")[0].replace("#", ".v").strip()

    # check that this dataset doesn't already exist
    if "CMIP6" in dataset_id:
        project = "CMIP6"
    else:
        project = "e3sm"
    facets = {"instance_id": dataset_id, "type": "Dataset"}
    docs = search_esgf(project, facets)
    if docs and int(docs[0]["number_of_files"]) != 0:
        msg = f"Dataset {dataset_id} has already been published to ESGF and is marked as the latest version"
        con_message("error", msg)
        return 1

    with open(src_path, "r") as instream:
        items = instream.readline().split("|")
    if "E3SM" in items[0].split(".")[0]:
        project = "e3sm"
    else:
        project = "cmip6"

    with TemporaryDirectory() as tmpdir:
        cmd = f"esgpublish --project {project} --map {src_path}"
        if project == "e3sm":
            if optional_facets is not None and optional_facets:
                project_metadata_path = os.path.join(tmpdir, f"{dataset_id}.json")
                with open(project_metadata_path, "w") as outstream:
                    json.dump(optional_facets, outstream)
                cmd += f" --json {project_metadata_path}"
        con_message("info", f"Running: {cmd}")

        log = Path(log_path, f"{dataset_id}.log")
        con_message("info", f"Writing publication log to {log}")
        with open(log, "w") as logstream:
            # FOR TESTING ONLY
            # cmd = cmd + " --help"
            proc = Popen(cmd.split(), stdout=logstream, stderr=logstream,
                         universal_newlines=True)
            proc.wait()
    # con_message("info", f"Return code {str(proc.returncode)} on cmd: {cmd}")
    # generated weird error in 3_Publish/slurm_scripts-20211013_185431_334581
    return proc.returncode
mapfile = next(src_path.parent.glob("*.map"))
with open(mapfile, "r") as instream:
    # just the first line, to obtain the dataset_id
    dataset_id = instream.readline().split("|")[0].strip().split("#")[0]
dst = Path(dst_path.parent, f"{dataset_id}.map")
con_message("info", f"Moving the mapfile to {dst}")
mapfile.replace(dst)

message = f"mapfile_path={dst},pub_name={dst_path.name},ware_name={src_path.name}"
if messages_path := os.environ.get("message_file"):
    with open(messages_path, "w") as outstream:
        outstream.write(message)
    con_message("info", f"{message}")
else:
    con_message("info", f"{message}")

# DEBUG: return 1 so that files are NOT moved
# return 1

# NOW move the files
file_count = 0
for sfile in src_path.glob("*.nc"):  # all .nc files
    destination = dst_path / sfile.name
    if destination.exists():
        con_message(
            "error",
            f"Trying to move file {sfile} to {destination}, but the destination already exists",
        )
        sys.exit(1)
def collect_segments(inpath, num_jobs, timename, bndsname):
    con_message("info", "starting segment collection")
    # collect all the files and sort them by their date stamp
    paths = [os.path.join(inpath, x) for x in os.listdir(inpath) if x.endswith(".nc")]
    # drop zero-byte files; build a new list instead of popping from the list being iterated
    nonempty_paths = []
    for path in paths:
        if not os.path.getsize(path):
            _, n = os.path.split(path)
            con_message("warning", f"File {n} is zero bytes, skipping it")
            continue
        nonempty_paths.append(path)
    paths = nonempty_paths

    with ProcessPoolExecutor(max_workers=num_jobs) as pool:
        futures = [
            pool.submit(monotonic_check, path, idx, bndsname)
            for idx, path in enumerate(paths)
        ]
        file_info = []
        for future in tqdm(
            as_completed(futures),
            desc="Checking files for monotonically increasing time indices",
            total=len(futures)
        ):
            b1, b2, idx = future.result()
            # if the first value is None, then the file failed its check
            # and the third value is the index of the file that failed
            if not b1:
                # we can simply not add the entry to the file_info list
                pass
            else:
                file_info.append({"name": paths[idx], "start": b1, "end": b2})

    file_info.sort(key=lambda i: i["start"])
    filter_files(file_info)

    # prime the segments by adding the first file
    f1 = file_info.pop(0)
    segments = {(f1["start"], f1["end"]): [f1["name"]]}
    while len(file_info) > 0:
        file = file_info.pop(0)
        joined = False
        for segstart, segend in segments.keys():
            # the start of the file aligns with the end of the segment
            if segend == file["start"]:
                segments[(segstart, file["end"])] = segments.pop((segstart, segend)) + [
                    file["name"]
                ]
                joined = True
                break
            # the end of the file aligns with the start of the segment
            elif segstart == file["end"]:
                segments[(file["start"], segend)] = [file["name"]] + segments.pop(
                    (segstart, segend)
                )
                joined = True
                break
        if not joined:
            if file["start"] == 0.0:
                con_message(
                    "error", f"the file {file['name']} has a start index of 0.0"
                )
                sys.exit(1)
            if segments.get((file["start"], file["end"])):
                con_message(
                    "error",
                    f"the file {file['name']} has perfectly matching time indices with the previous segment {segments.get((file['start'], file['end']))}",
                )
                sys.exit(1)
            segments[(file["start"], file["end"])] = [file["name"]]

    num_segments = len(segments)
    if num_segments > 10:
        con_message(
            "warning",
            f"There were {num_segments} segments found, this is high. Probably something is wrong with the dataset",
        )
    con_message("info", f"Found {num_segments} segments:")
    for seg in segments.keys():
        con_message("info", f"Segment {seg} has length {len(segments[seg])}")

    # filter out segments that are completely contained by others
    combos = list(combinations(segments, 2))
    for combo in combos:
        # a segment may already have been removed by an earlier combination, so check before popping
        if combo[0] not in segments or combo[1] not in segments:
            continue
        if combo[0][0] > combo[1][0] and combo[0][1] < combo[1][1]:
            segments.pop(combo[0])
        elif combo[1][0] > combo[0][0] and combo[1][1] < combo[0][1]:
            segments.pop(combo[1])
    return segments
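# To make the segment-joining rule above concrete, a small illustrative walk-through
# with made-up time bounds (the values are not from any real dataset): segments are
# keyed by their (start, end) bounds, and a file is appended or prepended whenever one
# of its endpoints lines up exactly with an existing segment's endpoint.
toy_segments = {(0.0, 31.0): ["file_0001-01.nc"]}
new_file = {"name": "file_0001-02.nc", "start": 31.0, "end": 59.0}
for segstart, segend in list(toy_segments):
    if segend == new_file["start"]:
        # the file continues this segment, so extend the segment to the file's end bound
        toy_segments[(segstart, new_file["end"])] = (
            toy_segments.pop((segstart, segend)) + [new_file["name"]]
        )
        break
else:
    # no endpoint matched: the file starts a new, disjoint segment
    toy_segments[(new_file["start"], new_file["end"])] = [new_file["name"]]
print(toy_segments)  # {(0.0, 59.0): ['file_0001-01.nc', 'file_0001-02.nc']}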
def main():
    parsed_args = parse_args()
    # populate and sort the list of FileItems
    # the list will be sorted based on its name
    futures = []
    files = sorted(
        [
            FileItem(units=None, path=str(x.resolve()))
            for x in Path(parsed_args.input).glob("*.nc")
        ],
        key=operator.attrgetter("path"),
    )

    # set up a process pool, then iterate over all the files:
    # for each file submit a job, and get a future, for its units;
    # once the future returns, stick its output into its
    # corresponding position in the output array
    with ProcessPoolExecutor(max_workers=parsed_args.processes) as pool:
        for idx, item in enumerate(files):
            futures.append(
                pool.submit(check_file, idx, item.path, parsed_args.time_name))
        pbar = tqdm(disable=parsed_args.quiet, total=len(files))
        for future in as_completed(futures):
            pbar.update(1)
            idx, units = future.result()
            files[idx].units = units
        pbar.close()

    # walk through the files in order;
    # the first file we find that doesn't match the expected units
    # is the first one in the bad batch.
    # the offset output should bring the time of the first bad file
    # in line with the expected value (previous file end + freq)
    expected_units = files[0].units
    for idx, info in enumerate(files):
        if expected_units != files[idx].units:
            # we load the values of the previous file
            with xr.open_dataset(files[idx - 1].path, decode_times=False) as ds:
                freq = ds["time_bnds"].values[0][1] - ds["time_bnds"].values[0][0]
                prev_segment_end = ds["time"].values[-1] + freq
            # now we load the current file to get its first time value
            with xr.open_dataset(files[idx].path, decode_times=False) as ds:
                cur_segment_start = ds["time"].values[0]
            # we assume that the second file is always going to have a LOWER time value
            offset = prev_segment_end - cur_segment_start

            message = f"correct_units={expected_units},offset={offset}"
            if messages_path := os.environ.get("message_file"):
                with open(messages_path, "w") as outstream:
                    outstream.write(message.replace(":", "^"))
            else:
                con_message("error", "could not obtain message_path from environment")
            con_message("error", message)  # no idea if this should be info, warning or error
            return 1
def main(): desc = """This tool will search through a directory full of raw E3SM model time-slice output files, and find/fix any issues with the time index. If overlapping time segments are found, it will find the last file of the preceding segment and truncate it to match the index from the first file from the second segment.""" parser = argparse.ArgumentParser(description=desc) parser.add_argument( "input", help="The directory to check for time index issues, should only contain a single time-frequency from a single case" ) parser.add_argument( "--output", default="output", required=False, help=f"output directory for rectified dataset, default is {os.environ['PWD']}/output" ) parser.add_argument( "--move", action="store_true", required=False, help="move the files from the input directory into the output directory instead of symlinks" ) parser.add_argument( "--copy", action="store_true", required=False, help="copy the files from the input directory into the output directory instead of symlinks" ) parser.add_argument( "-j", "--jobs", default=8, type=int, help="the number of processes, default is 8" ) parser.add_argument( "--dryrun", action="store_true", help="Collect the time segments, but dont produce the truncated files or move anything" ) parser.add_argument( "--no-gaps", action="store_true", help="Exit if a time gap is discovered" ) parser.add_argument( "-q", "--quiet", action="store_true", help="Suppress progress bars" ) args = parser.parse_args() inpath = args.input outpath = args.output num_jobs = args.jobs dryrun = args.dryrun quiet = args.quiet if args.copy and args.move: con_message("error", "Both copy and move flags are set, please only pick one") return 1 if os.path.exists(outpath) and len(os.listdir(outpath)): con_message( "error", f"Output directory {outpath} already exists and contains files") return 1 else: os.makedirs(outpath, exist_ok=True) timename, bndsname = get_time_names(next(Path(inpath).glob("*")).as_posix()) segments = collect_segments(inpath, num_jobs, timename, bndsname) if len(segments) == 1: con_message("info", "No overlapping segments found") if dryrun: con_message("info", "not moving files") else: desc = "Placing files into output directory" _, files = segments.popitem() for src in tqdm(files, desc=desc, disable=quiet): _, name = os.path.split(src) dst = os.path.join(outpath, name) if os.path.exists(dst): continue if args.move: move_file(src, dst) elif args.copy: copyfile(src, dst) else: os.symlink(src, dst) return 0 ordered_segments = [] for start, end in segments.keys(): ordered_segments.append( {"start": start, "end": end, "files": segments[(start, end)]} ) ordered_segments.sort(key=lambda i: i["start"]) for s1, s2 in zip(ordered_segments[:-1], ordered_segments[1:]): if s2["start"] > s1["end"]: msg = f"There's a time gap between the end of {os.path.basename(s1['files'][-1])} and the start of {os.path.basename(s2['files'][0])} of {s2['start'] - s1['end']} " if args.no_gaps == True: outpath = Path(outpath) if not any(outpath.iterdir()): outpath.rmdir() con_message("error", msg) sys.exit(1) con_message("warning", msg) if not args.dryrun: con_message("info", "Moving files from the previous segment") desc = "Placing files into output directory" for src in tqdm(s1["files"], desc=desc, disable=quiet): _, name = os.path.split(src) dst = os.path.join(outpath, name) if os.path.exists(dst): continue if args.move: move_file(src, dst) elif args.copy: copyfile(src, dst) else: os.symlink(src, dst) if ordered_segments.index(s2) == len(ordered_segments) - 1: 
                    con_message("info", "Moving files from the last segment")
                    desc = "Placing files into output directory"
                    for src in tqdm(s2["files"], desc=desc, disable=quiet):
                        _, name = os.path.split(src)
                        dst = os.path.join(outpath, name)
                        if os.path.exists(dst):
                            continue
                        if args.move:
                            move_file(src, dst)
                        elif args.copy:
                            copyfile(src, dst)
                        else:
                            os.symlink(src, dst)
            continue

        to_truncate = None  # the file that needs to be truncated
        # the index in the file list of segment 1
        truncate_index = len(s1["files"])
        for file in tqdm(s1["files"][::-1], disable=quiet,
                         desc="Stepping backwards to find truncation point"):
            with xr.open_dataset(file, decode_times=False) as ds:
                if ds[bndsname][-1].values[1] > s2["start"]:
                    truncate_index -= 1
                    continue
                else:
                    break
        con_message(
            "info",
            f"removing {len(s1['files']) - truncate_index} files from ({s1['start']}, {s1['end']})",
        )

        new_ds = xr.Dataset()
        to_truncate = s1["files"][truncate_index]
        with xr.open_dataset(to_truncate, decode_times=False) as ds:
            target_index = 0
            for i in range(0, len(ds[bndsname])):
                if ds[bndsname][i].values[1] == s2["start"]:
                    target_index += 1
                    break
                target_index += 1
            con_message(
                "info",
                f"truncating {to_truncate} by removing {len(ds[bndsname]) - target_index} time steps",
            )
            new_ds.attrs = ds.attrs
            for variable in ds.data_vars:
                if "time" not in ds[variable].coords and timename != "Time":
                    new_ds[variable] = ds[variable]
                    new_ds[variable].attrs = ds[variable].attrs
                    continue
                if timename == "time":
                    new_ds[variable] = ds[variable].isel(time=slice(0, target_index))
                    new_ds[variable].attrs = ds[variable].attrs
                else:
                    new_ds[variable] = ds[variable].isel(Time=slice(0, target_index))
                    new_ds[variable].attrs = ds[variable].attrs
                ds[variable].encoding['_FillValue'] = False

        _, to_truncate_name = os.path.split(to_truncate)
        outfile_path = os.path.join(outpath, f"{to_truncate_name[:-3]}.trunc.nc")
        if dryrun:
            con_message("info", f"dryrun, not writing out file {outfile_path}")
        else:
            con_message("info", f"writing out {outfile_path}")
            new_ds.to_netcdf(outfile_path, unlimited_dims=[timename])

        if dryrun:
            con_message("info", "dryrun, not moving files")
        else:
            desc = "Placing files into output directory"
            con_message("info", f"Moving the first {truncate_index} files")
            for src in tqdm(s1["files"][:truncate_index], desc=desc, disable=quiet):
                _, name = os.path.split(src)
                dst = os.path.join(outpath, name)
                if os.path.exists(dst):
                    continue
                if args.move:
                    move_file(src, dst)
                elif args.copy:
                    copyfile(src, dst)
                else:
                    os.symlink(src, dst)

    if dryrun:
        con_message("info", "dryrun, not moving files")
    else:
        con_message("info", "Moving files from the last segment")
        desc = "Placing files into output directory"
        for src in tqdm(ordered_segments[-1]["files"], desc=desc, disable=quiet):
            _, name = os.path.split(src)
            dst = os.path.join(outpath, name)
            if os.path.exists(dst):
                continue
            if args.move:
                move_file(src, dst)
            elif args.copy:
                copyfile(src, dst)
            else:
                os.symlink(src, dst)
    return 0