Example #1
def get_month(path):
    pattern = r"\d{4}-\d{2}"
    s = re.search(pattern, path)
    if not s:
        con_message("error", f"Unable to find month string for {path}")
        sys.exit(1)
    return int(path[s.start() + 5:s.start() + 7])
def main():
    parsed_args = parse_args()

    con_message("info", "Begin fix_mapfile_paths")
    mapfile_path = Path(parsed_args.mapfile_path)
    ware_base = parsed_args.warehouse_base
    pub_base = parsed_args.pub_base
    ware_version = "/" + parsed_args.warehouse_version + "/"
    pub_version = "/" + parsed_args.pub_version + "/"

    tempfile = NamedTemporaryFile(mode="w",
                                  delete=False,
                                  dir=str(mapfile_path.parent))
    with open(mapfile_path, "r") as instream:
        for line in instream.readlines():
            items = line.split("|")
            items[1] = items[1].replace(ware_base, pub_base, 1)
            items[1] = items[1].replace(ware_version, pub_version, 1)
            line = "|".join(items)
            tempfile.write(line)
    # flush and close the temp file before moving it into place
    tempfile.close()

    mapfile_temp = str(mapfile_path.resolve())
    mapfile_path.unlink()
    move(tempfile.name, mapfile_temp)
    mapfile_temp = Path(mapfile_temp)
    mapfile_temp.chmod(0o664)
    con_message("info", f"Completed fix_mapfile_paths, mapfile={mapfile_temp}")

    return 0
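Most of the examples on this page call a con_message(level, message) logging helper and a parse_args() function that are defined elsewhere in the project and not shown here. A minimal sketch of what the console-message helper might look like, offered purely as an assumption to make the snippets readable; the real implementation may route messages differently:

import sys

def con_message(level: str, message: str):
    # Hypothetical stand-in: print a severity-tagged message to stderr.
    print(f"{level.upper()}: {message}", file=sys.stderr)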
Example #3
def check_file(file, freq, idx, time_name="time"):
    """
    Step through the file checking that each step in time is exactly how long it should be
    and that the time index is monotonically increasing
    """
    prevtime = None
    first, last = None, None
    with xr.open_dataset(file, decode_times=False) as ds:
        if len(ds[time_name]) == 0:
            return None, None, idx
        for step in ds[time_name]:
            time = step.values.item()
            if prevtime is None:
                prevtime = time
                first = time
                continue
            delta = time - prevtime
            if delta == 0:
                # monthly data
                return time, time, idx
            elif delta != freq:
                con_message(
                    "warning",
                    f"time discontinuity in {file} at {time}, delta was {delta} when it should have been {freq}",
                )
            prevtime = time
        last = time
    return first, last, idx
def main():
    parsed_args = parse_args()

    input_path = Path(parsed_args.input)

    if not input_path.exists() or not input_path.is_dir():
        con_message("error",
                    f"Input directory does not exist or is not a directory")
        return 1
    futures = []
    pool = ProcessPoolExecutor(max_workers=parsed_args.processes)
    for path in input_path.glob("*.nc"):
        realpath = str(path.resolve())
        futures.append(pool.submit(check_file, realpath))

    error = False
    try:
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            if result == 1:
                error = True
    except KeyboardInterrupt:
        for future in futures:
            future.cancel()

    if error:
        return 1
    return 0
Example #5
def fix_units(inpath, outpath, time_units, offset):
    import xarray as xr

    with xr.open_dataset(inpath, decode_times=False) as ds:
        if ds.get("time") is None:
            con_message("error",
                        f"{os.path.basename(inpath)} has no 'time' axis")
            exit(1)
        bnds_name = "time_bnds" if ds.get(
            "time_bnds") is not None else "time_bounds"
        if ds["time"].attrs.get("units") != time_units:
            ds = ds.assign_coords(time=ds["time"] + offset)
            if bnds_name == "time_bnds":
                ds = ds.assign_coords(time_bnds=ds[bnds_name] + offset)
            else:
                ds = ds.assign_coords(time_bounds=ds[bnds_name] + offset)
            if ds[bnds_name].values[0][0] == ds[bnds_name].values[0][1]:
                freq = ds[bnds_name].values[1][1] - ds[bnds_name].values[1][0]
                ds[bnds_name].values[0][0] -= freq
            ds["time"].attrs = {
                "long_name": "time",
                "units": time_units,
                "calendar": "noleap",
                "bounds": bnds_name,
            }
            ds["time_bnds"].attrs = {"long_name": "time interval endpoints"}
            ds.to_netcdf(outpath, unlimited_dims=["time"])
        else:
            os.symlink(inpath, outpath)
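A hedged usage sketch for fix_units above; the paths, target units string, and offset value are illustrative only (in the pipeline, the correct units and offset come from the unit-check step shown later on this page):

# Hypothetical call: rewrite a file whose time axis does not already use the target units.
fix_units(
    inpath="/staging/case/file_0002-01.nc",        # illustrative input path
    outpath="/staging/fixed/file_0002-01.nc",      # illustrative output path
    time_units="days since 0001-01-01 00:00:00",   # illustrative target units
    offset=365.0,                                   # illustrative offset, in the same units
)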
Example #6
def filter_files(file_info):
    to_remove = []
    for combo in combinations(file_info, 2):
        if (
            combo[0]["start"] == combo[1]["start"]
            and combo[0]["end"] == combo[1]["end"]
        ):
            con_message("debug", f"{combo[0]['name']} == {combo[1]['name']}")
            _, n1 = os.path.split(combo[0]["name"])
            _, n2 = os.path.split(combo[1]["name"])
            if int(n1[:8]) < int(n2[:8]):
                to_remove.append(combo[0])
            else:
                to_remove.append(combo[1])
        elif combo[0]["start"] == combo[1]["start"]:
            if combo[0]["end"] < combo[1]["end"]:
                to_remove.append(combo[0])
            else:
                to_remove.append(combo[1])

    for i1 in to_remove:
        for idx, i2 in enumerate(file_info):
            if i1 == i2:
                f = file_info.pop(idx)
                con_message("debug", f"removing {f['name']} from file list")
                break
def validate_args(args):
    """
    Ensure the src path (the source mapfile) exists.
    """
    src_path = Path(args.src_path)
    if not src_path.exists():
        con_message("error", "Source mapfile does not exist")
        return False

    return True
def loadFileLines(filepath: Path):
    retlist = []
    if not filepath.exists():
        con_message(
            "error",
            f"Cannot load lines from file {filepath} as it does not exist")
        sys.exit(1)

    with open(filepath.resolve(), "r") as instream:
        retlist = [Path(x.split("|")[1]).name for x in instream.readlines()]
    return retlist
def check_file(path):
    # import ipdb;ipdb.set_trace()
    cmd = f"ncdump -h {path}".split()
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    out, err = proc.communicate()
    out = out.decode("utf-8")
    err = err.decode("utf-8")

    if not out or "NetCDF: HDF error" in err:
        con_message("error", f"Error loading {path}")
        return 1
    return 0
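A small sketch of how the ncdump-based check_file above might be driven across a directory, mirroring the process-pool pattern used in the other examples; the directory path and the check_directory name are assumptions:

from pathlib import Path

def check_directory(input_dir: str) -> int:
    # Return 1 if any .nc file fails the ncdump header check, else 0.
    failures = 0
    for path in Path(input_dir).glob("*.nc"):
        failures += check_file(str(path.resolve()))
    return 1 if failures else 0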
Example #10
def conduct_move(args, move_method="none"):
    if move_method == "none":
        con_message(
            "error",
            "Move_to_Publication: Must set move_method to 'move' or to 'link'")
        return 1

    con_message("info", f"conduct_move: move_method  = {move_method}")

    src_path = Path(args.src)
    dst_path = Path(args.dst)

    # move mapfile first. If fails, don't bother moving the files.
    # NOTE:  This section should be removed once mapfile are only generated in final publication location.

    mapfile = next(src_path.parent.glob("*.map"))
    with open(mapfile, "r") as instream:
        # read just the first line to obtain the dataset_id
        dataset_id = instream.readline().split("|")[0].strip().split("#")[0]
    dst = Path(dst_path.parent, f"{dataset_id}.map")
    con_message("info", f"Moving the mapfile to {dst}")
    mapfile.replace(dst)

    message = f"mapfile_path={dst},pub_name={dst_path.name},ware_name={src_path.name}"
    if messages_path := os.environ.get("message_file"):
        with open(messages_path, "w") as outstream:
            outstream.write(message)
            con_message("info", f"{message}")
def main():

    parsed_args = parse_args()

    success = validate_mapfile(Path(parsed_args.mapfile),
                               Path(parsed_args.datapath), parsed_args.quiet)
    if success:
        if not parsed_args.quiet:
            con_message("info", "Mapfile includes all files")

        return 0
    else:
        if not parsed_args.quiet:
            con_message("error", "Mapfile is missing one or more files")
        return 1
def main():
    parsed_args = parse_args()

    input_path = Path(parsed_args.input)
    dataset_id = parsed_args.dataset_id
    version_nm = parsed_args.version_number
    numberproc = parsed_args.processes
    quiet = parsed_args.quiet

    if not input_path.exists() or not input_path.is_dir():
        con_message("error",
                    "Input directory does not exist or is not a directory")
        sys.exit(1)

    if outpath := parsed_args.outpath:
        outpath = Path(outpath)
Example #13
def main():
    parsed_args = parse_args()
    dataset = Dataset(
        dataset_id=parsed_args.dataset_id,
        no_status_file=True)
    status = dataset.get_esgf_status()
    ''' Elided until timing issue is resolved '''
    '''
    if status not in [DatasetStatus.SUCCESS.value, DatasetStatus.PUBLISHED.value]:
        con_message("error", f"ESGF validation failed, dataset in state {status}")
        if missing := dataset.missing:
            pprint(missing)
        return 1
    else:
        con_message("info", f"ESGF validation success")
        return 0
    '''
    con_message("info", f"ESGF validation success (elision)")
    ''' elision '''
    return 0
Example #14
def monotonic_check(path, idx, bndsname):
    _, name = os.path.split(path)
    with xr.open_dataset(path, decode_times=False) as ds:
        # l1 and l2 start at -1 so the 0th time step doesn't trigger the check
        try:
            start_bound = ds[bndsname][0].values[0]
            end_bound = ds[bndsname][-1].values[-1]
        except IndexError as e:
            con_message(
                "info", "printing index error"
            )  # only to escape progress-bar prepend
            con_message("error", f"{name} doesnt have expect time_bnds variable shape")
            return None, None, idx
        l1, l2 = -1.0, -1.0
        for bounds in ds[bndsname]:
            b1, b2 = bounds.values
            if (l1 == -1.0 and b1 == 0.0) or (b1 == b2):
                # the daily files have a 0 width time step at the start
                continue
            if b1 > l1 and b2 > l2:
                l1 = b1
                l2 = b2
            else:
                con_message(
                    "error",
                    f"{name} has failed the monotonically-increasing time bounds check, {(b1, b2)} isn't greater than {(l1, l2)}",
                )
                return None, None, idx

        return start_bound, end_bound, idx
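The check above assumes a bounds variable of shape (N, 2), where each row holds the start and end of one time interval and the rows increase strictly from one step to the next. A synthetic illustration of the layout it accepts (the numbers are invented):

# Hypothetical, monotonically increasing time bounds, in days:
#   [[  0.0,  31.0],
#    [ 31.0,  59.0],
#    [ 59.0,  90.0]]
# A zero-width row such as [59.0, 59.0] is skipped by the b1 == b2 branch, and any
# row that does not increase past the previous one triggers the error branch above.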
Example #15
def main():
    parsed_args = parse_args()

    if not validate_args(parsed_args):
        sys.exit(1)

    src_path = Path(parsed_args.src)
    dst_path = Path(parsed_args.dst)

    if src_path == dst_path:
        con_message("info", "move_to_publication: move elided; src is dst")
        sys.exit(0)

    src_parent, _ = os.path.split(src_path)
    dst_parent, _ = os.path.split(dst_path)

    move_method = "move"
    if src_parent == dst_parent:
        move_method = "link"
        message = f"mapfile_path={next(src_path.parent.glob('*.map'))},pub_name={dst_path.name},ware_name={src_path.name}"
        if messages_path := os.environ.get("message_file"):
            with open(messages_path, "w") as outstream:
                outstream.write(message)
                con_message("info", message)
        else:
            con_message(
                "warning",
                f"cannot obtain the message_file path from the environment for message {message}"
            )
def validate_mapfile(mapfile: str, srcdir: Path, quiet: bool):
    """
    at this point, the srcdir should contain the datafiles (*.nc)
    and the parent dir/dsid.map, so we can do a name-by-name comparison.
    MUST test for each srcdir datafile in mapfile listing.

    Params:
        mapfile (str): the string path to the mapfile
        srcdir (Path): a Path object pointing to the directory containing the data files
    Returns:
        True if the mapfile is valid, False otherwise
    """

    dataset_files = sorted([x.name for x in srcdir.glob("*.nc")])
    mapfile_lines = sorted(loadFileLines(mapfile))

    if not len(dataset_files) == len(mapfile_lines):
        con_message(
            "error",
            "Number of files does not match number of entries in the mapfile")
        sys.exit(1)

    # MUST assume both lists sort identically - gives O(n) instead of O(n^2)
    pairlist = list(zip(dataset_files, mapfile_lines))
    error = []
    # import ipdb; ipdb.set_trace()
    for file, mapentry in tqdm(pairlist, disable=quiet):
        if file not in mapentry:
            error.append(file)

    if error:
        for e in error:
            con_message("error", e)
        return False

    return True
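The name-by-name comparison above relies on each mapfile line being pipe-delimited with the file path in the second field, which is what loadFileLines assumes. A hedged illustration of that layout (the values are invented, but the field order matches the mapfile-writing example later on this page):

from pathlib import Path

# Hypothetical mapfile line:
#   dataset_id#version | /full/path/to/file.nc | size | mod_time=... | checksum=... | checksum_type=SHA256
sample = "proj.model.exp#v1 | /warehouse/case/file_0001-01.nc | 12345 | mod_time=1.0 | checksum=abc | checksum_type=SHA256"
name = Path(sample.split("|")[1].strip()).name  # -> "file_0001-01.nc", the value compared against the data directory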
Example #17
def main():
    parsed_args = parse_args()
    import xarray as xr

    file_one = Path(parsed_args.file_one)
    file_two = Path(parsed_args.file_two)
    vars_to_check = parsed_args.var_list

    if not file_one.exists() or not file_two.exists():
        con_message("error", "One of more input files does not exist")
        return 1

    data1 = xr.open_dataset(str(file_one.resolve()), decode_times=False)
    data2 = xr.open_dataset(str(file_two.resolve()), decode_times=False)

    all_match = True
    dont_match = []
    for variable in tqdm(data1.data_vars):

        if (
            ("all" not in vars_to_check and variable not in vars_to_check)
            or "bnds" in variable
            or variable in parsed_args.exclude
        ):
            continue

        a = data1[variable].load().values
        b = data2[variable].load().values
        if not allclose(a, b, equal_nan=True):
            dont_match.append(f"Values do not match for {variable}")
            all_match = False

    data1.close()
    data2.close()

    if all_match:
        con_message("info", "All variables match")
        return 0

    con_message("warning", "Some variables do not match")

    for m in dont_match:
        con_message("debug", m)
    return 1
Example #18
def validate_args(args):
    """
    Ensure the src path exists and is not empty.
    Ensure the dst path exists and is empty.
    """
    src_path = Path(args.src)
    dst_path = Path(args.dst)
    if not src_path.exists() or not src_path.is_dir():
        con_message("error",
                    "Source directory does not exist or is not a directory")
        return False
    if not any(src_path.iterdir()):
        con_message("error", "Source directory is empty")
        return False

    if not dst_path.exists() or not dst_path.is_dir():
        dst_path.mkdir(parents=True, exist_ok=True)
    if any(dst_path.iterdir()):
        con_message("error", "Destination directory is not empty")
        return False

    return True
Example #19
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f",
        "--facets",
        required=True,
        nargs="*",
        help="Space separated key/value pairs for facets to get updated, in the form key=value",
    )
    parser.add_argument("-c", "--cert", required=True, help="Path to ESGF cert")
    parser.add_argument(
        "-s",
        "--search",
        required=True,
        help="Search criteria, for example master_id=<dataset-id>, or experiment=1950-Control&model_version=1_0",
    )
    parser.add_argument(
        "--index-node",
        default=DEFAULT_INDEX_NODE,
        help=f"default={DEFAULT_INDEX_NODE}",
    )
    parser.add_argument(
        "--data-node",
        default=DEFAULT_DATA__NODE,
        help=f"default={DEFAULT_DATA__NODE}",
    )
    parser.add_argument(
        "-y", "--yes", action="store_true", help="skip the manual verification"
    )
    parser.add_argument(
        "--verbose", action="store_true", help="Print more verbose status messages"
    )

    args = parser.parse_args()

    cert_path = args.cert
    search = args.search
    index_node = args.index_node
    data_node = args.data_node
    verbose = args.verbose

    facets = {}
    for item in args.facets:
        key, value = item.split("=")
        facets[key] = value

    # url = f"https://{index_node}/esg-search/search/?offset=0&limit=10000&type=Dataset&format=application%2Fsolr%2Bjson&latest=true&{search}"
    url = f"https://{index_node}/esg-search/search/?offset=0&limit=10000&type=Dataset&format=application%2Fsolr%2Bjson&{search}"
    if verbose:
        con_message("info", url)
    res = requests.get(url)

    if res.status_code != 200:
        con_message("error", f"query return code: {res.status_code}")
        return 1

    res = json.loads(res.text)

    docs = res["response"]["docs"]
    if len(docs) == 0:
        con_message("warning", f"Unable to find records matching search {search}")
        return 1

    con_message("info", "Found the following datasets:")
    for doc in docs:
        con_message("info", f"\t{doc['id']}")

    if not args.yes:
        response = input(
            f"Found {len(docs)} datasets, would you like to update them all? y/[n]"
        )
        if response.lower() != "y":
            con_message("warning", "User failed to answer 'y', exiting")
            return 1

    import warnings

    warnings.filterwarnings("ignore")

    client = publisherClient(cert_path, index_node)
    for doc in tqdm(docs):
        dataset_id = doc["id"]
        update_record = gen_xml(dataset_id, "datasets", facets)
        if verbose:
            con_message("info", update_record)
        client.update(update_record)
        update_record = gen_xml(dataset_id, "files", facets)
        if verbose:
            con_message("info", update_record)
        client.update(update_record)

    return 0
    if not input_path.exists() or not input_path.is_dir():
        con_message("error",
                    "Input directory does not exist or is not a directory")
        sys.exit(1)

    if outpath := parsed_args.outpath:
        outpath = Path(outpath)
    else:
        outpath = Path(f"{dataset_id}.map")

    if not outpath.exists():
        outpath.touch(0o664)
    else:
        outpath.chmod(0o664)

    con_message("info",
                f"Generate_Mapfile: ({numberproc} processes) to {outpath}")

    futures = []
    pool = ProcessPoolExecutor(max_workers=numberproc)
    for path in input_path.glob("*.nc"):
        futures.append(pool.submit(hash_file, path))

    with open(outpath, "w") as outstream:
        try:
            for future in tqdm(as_completed(futures),
                               total=len(futures),
                               disable=quiet):
                filehash, pathstr = future.result()
                filestat = Path(pathstr).stat()
                line = f"{dataset_id}#{version_nm} | {pathstr} | {filestat.st_size} | mod_time={filestat.st_mtime} | checksum={filehash} | checksum_type=SHA256\n"
                outstream.write(line)
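The pool above submits a hash_file helper that is not shown; its result is unpacked as (filehash, pathstr) and written out as a SHA256 checksum. A minimal sketch of what such a helper might look like, offered as an assumption rather than the project's actual implementation:

import hashlib

def hash_file(path):
    # Hypothetical stand-in: return (sha256_hex_digest, path_string) for one file.
    sha256 = hashlib.sha256()
    with open(path, "rb") as instream:
        for chunk in iter(lambda: instream.read(1 << 20), b""):
            sha256.update(chunk)
    return sha256.hexdigest(), str(path)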
Example #21
def main():
    parser = argparse.ArgumentParser(
        description=
        "Check a directory of raw E3SM time-slice files for discontinuities in the time index"
    )
    parser.add_argument("input", help="Directory path containing dataset")
    parser.add_argument(
        "-j",
        "--jobs",
        default=8,
        type=int,
        help="the number of processes, default is 8",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        default=False,
        help="Disable progress-bar for batch/background processing",
    )
    args = parser.parse_args()
    inpath = args.input

    con_message("info", f"Running timechecker:dataset={inpath}")

    # collect all the files and sort them by their date stamp
    names = [
        os.path.join(os.path.abspath(inpath), x) for x in os.listdir(inpath)
        if x.endswith(".nc")
    ]
    pattern = r"\d{4}-\d{2}"
    fileinfo = []
    for name in names:
        match = re.search(pattern, name)
        if not match:
            con_message(
                "error",
                f"The year stamp search pattern {pattern} didn't find what it was expecting",
            )
            sys.exit(1)
        start = match.start()
        fileinfo.append({
            "prefix": name[:start],
            "suffix": name[start:],
            "name": name
        })
    files = [x["name"] for x in sorted(fileinfo, key=lambda i: i["suffix"])]
    del fileinfo

    time_units, time_name = get_time_units(files[0])

    monthly = False
    freq = None
    # find the time frequency by checking the delta from the 0th to the 1st step
    # import ipdb; ipdb.set_trace()
    with xr.open_dataset(files[0], decode_times=False) as ds:
        freq = ds.attrs.get("time_period_freq")

        if freq == "month_1":
            monthly = True
            con_message("info", "Found monthly data")
            calendar = ds[time_name].attrs["calendar"]
            if calendar not in calendars:
                con_message("error", f"Unsupported calendar type {calendar}")
                sys.exit(1)
        elif freq is None:
            if ds.attrs.get("title") == "CLM History file information":
                monthly = True
            calendar = ds[time_name].attrs["calendar"]
            if calendar not in calendars:
                con_message("error", f"Unsupported calendar type {calendar}")
                sys.exit(1)
        else:
            con_message("info", "Found sub-monthly data")
            freq = ds[time_name][1].values.item() - ds[time_name][0].values.item()
            con_message(
                "info",
                f"Detected frequency: {freq}, with units: {time_units}")

    # iterate over each of the files and get the first and last index from each file
    issues = list()
    prevtime = None
    indices = [None for _ in range(len(files))]
    with ProcessPoolExecutor(max_workers=args.jobs) as pool:
        futures = [
            pool.submit(check_file, file, freq, idx)
            for idx, file in enumerate(files)
        ]

        for future in tqdm(
                as_completed(futures),
                total=len(futures),
                desc="Checking time indices",
                disable=args.quiet,
        ):
            first, last, idx = future.result()
            indices[idx] = (first, last, idx)

    prev = None
    for first, last, idx in indices:
        if not prev:
            prev = last
            continue
        if monthly:
            month = get_month(files[idx])
            target = prev + calendars[calendar][month]
        else:
            target = prev + freq
        if not first or not last:
            # this file had an empty index, move on and start checking the next one as though this one was there
            msg = f"Empty time index found in {os.path.basename(files[idx])}"
            issues.append(msg)
            prev = target
            continue
        if first != target:
            msg = f"index issue file: {os.path.basename(files[idx])} has index {(first, last)} should be ({target, last}), the start index is off by ({first - target}) {time_units.split(' ')[0]}. "
            issues.append(msg)
        prev = last

    if issues:
        issues.append(f"Result=Fail:dataset={inpath}")
        for msg in issues:
            con_message("warning", msg)
        return 1

    con_message("info", "No time index issues found.")
    con_message("info", f"Result=Pass:dataset={inpath}")
    return 0
def publish_dataset(args):
    """
    Checks that a dataset isn't already available on ESGF, and if it's not,
    uses the esgpublish utility to publish it

    Returns 0 if successful, 1 otherwise
    """
    src_path = Path(args.src_path)
    optional_facets = None
    if args.optional_facets:
        optional_facets = {}
        for item in args.optional_facets:
            key, value = item.split("=")
            optional_facets[key] = value
    log_path = Path(args.log_path)
    if not log_path.exists():
        log_path.mkdir(parents=True, exist_ok=True)

    # get the dataset_id from the mapfile
    with open(src_path, "r") as instream:
        line = instream.readline()
    dataset_id = line.split("|")[0].replace("#", ".v").strip()

    # check that this dataset doesn't already exist
    if "CMIP6" in dataset_id:
        project = "CMIP6"
    else:
        project = "e3sm"
    facets = {"instance_id": dataset_id, "type": "Dataset"}
    docs = search_esgf(project, facets)

    if docs and int(docs[0]["number_of_files"]) != 0:
        msg = f"Dataset {dataset_id} has already been published to ESGF and is marked as the latest version"
        con_message("error", msg)
        return 1

    with open(src_path, "r") as instream:
        items = instream.readline().split("|")
        if "E3SM" in items[0].split(".")[0]:
            project = "e3sm"
        else:
            project = "cmip6"

    with TemporaryDirectory() as tmpdir:
        cmd = f"esgpublish --project {project} --map {src_path}"
        if project == "e3sm":
            if optional_facets is not None and optional_facets:
                project_metadata_path = os.path.join(tmpdir,
                                                     f"{dataset_id}.json")
                with open(project_metadata_path, "w") as outstream:
                    json.dump(optional_facets, outstream)
                cmd += f" --json {project_metadata_path}"

        con_message("info", f"Running: {cmd}")
        log = Path(log_path, f"{dataset_id}.log")
        con_message("info", f"Writing publication log to {log}")

        with open(log, "w") as logstream:
            # FOR TESTING ONLY
            # cmd = cmd + " --help"
            proc = Popen(cmd.split(),
                         stdout=logstream,
                         stderr=logstream,
                         universal_newlines=True)
            proc.wait()

        # con_message("info", f"Return code {str(proc.returncode)} on cmd: {cmd}")      # generated weird error in 3_Publish/slurm_scripts-20211013_185431_334581

        return proc.returncode
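publish_dataset above calls a search_esgf(project, facets) helper that is not shown. A hedged sketch of what it might do, modeled on the esg-search URL used in the facet-update example earlier on this page; the index node, parameter handling, and response unpacking are assumptions:

import requests

def search_esgf(project, facets, index_node="esgf-node.llnl.gov"):
    # Hypothetical stand-in: return the Solr "docs" list for the given facets.
    query = "&".join(f"{key}={value}" for key, value in facets.items())
    url = (f"https://{index_node}/esg-search/search/"
           f"?offset=0&limit=10000&format=application%2Fsolr%2Bjson"
           f"&project={project}&{query}")
    res = requests.get(url)
    if res.status_code != 200:
        return []
    return res.json()["response"]["docs"]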
Example #23
    mapfile = next(src_path.parent.glob("*.map"))
    with open(mapfile, "r") as instream:
        # read just the first line to obtain the dataset_id
        dataset_id = instream.readline().split("|")[0].strip().split("#")[0]
    dst = Path(dst_path.parent, f"{dataset_id}.map")
    con_message("info", f"Moving the mapfile to {dst}")
    mapfile.replace(dst)

    message = f"mapfile_path={dst},pub_name={dst_path.name},ware_name={src_path.name}"
    if messages_path := os.environ.get("message_file"):
        with open(messages_path, "w") as outstream:
            outstream.write(message)
            con_message("info", f"{message}")
    else:
        con_message("info", f"{message}")

    # DEBUG:  return 1 so that files are NOT moved
    # return 1

    # NOW move the files

    file_count = 0
    for sfile in src_path.glob("*.nc"):  # all .nc files
        destination = dst_path / sfile.name
        if destination.exists():
            con_message(
                "error",
                f"Trying to move file {sfile} to {destination}, but the destination already exists",
            )
            sys.exit(1)
Example #24
def collect_segments(inpath, num_jobs, timename, bndsname):

    con_message("info", "starting segment collection")
    # collect all the files and sort them by their date stamp
    paths = [os.path.join(inpath, x) for x in os.listdir(inpath) if x.endswith(".nc")]
    # drop zero-byte files up front; popping from the list while iterating over it would skip entries
    nonempty = []
    for path in paths:
        if not os.path.getsize(path):
            _, n = os.path.split(path)
            con_message("warning", f"File {n} is zero bytes, skipping it")
        else:
            nonempty.append(path)
    paths = nonempty

    with ProcessPoolExecutor(max_workers=num_jobs) as pool:
        futures = [
            pool.submit(monotonic_check, path, idx, bndsname)
            for idx, path in enumerate(paths)
        ]
        file_info = []
        for future in tqdm(
            as_completed(futures),
            desc="Checking files for monotonically increasing time indices",
            total=len(futures)
        ):
            b1, b2, idx = future.result()
            # if the first value is None, the file failed its check and is
            # simply not added to the file_info list
            if b1 is not None:
                file_info.append({"name": paths[idx], "start": b1, "end": b2})

    file_info.sort(key=lambda i: i["start"])

    filter_files(file_info)

    # prime the segments by adding the first file
    f1 = file_info.pop(0)
    segments = {(f1["start"], f1["end"]): [f1["name"]]}

    while len(file_info) > 0:

        file = file_info.pop(0)
        joined = False
        for segstart, segend in segments.keys():
            # the start of the file aligns with the end of the segment
            if segend == file["start"]:
                segments[(segstart, file["end"])] = segments.pop((segstart, segend)) + [
                    file["name"]
                ]
                joined = True
                break
            # the end of the file aligns with the start of the segment
            elif segstart == file["end"]:
                segments[(file["start"], segend)] = [file["name"]] + segments.pop(
                    (segstart, segend)
                )
                joined = True
                break
        if not joined:
            if file["start"] == 0.0:
                con_message(
                    "error", f"the file {file['name']} has a start index of 0.0"
                )
                sys.exit(1)
            if segments.get((file["start"], file["end"])):
                con_message(
                    "error",
                    f"the file {file['name']} has perfectly matching time indices with the previous segment {segments.get((file['start'], file['end']))}",
                )
                sys.exit(1)
            segments[(file["start"], file["end"])] = [file["name"]]

    num_segments = len(segments)
    if num_segments > 10:
        con_message(
            "warning",
            f"There were {num_segments} segments found, which is unusually high; there is probably something wrong with the dataset",
        )

    con_message("info", f"Found {num_segments} segments:")
    for seg in segments.keys():
        con_message("info", f"Segment {seg} has length {len(segments[seg])}")

    # filter out segments that are completely contained by others
    combos = list(combinations(segments, 2))
    for combo in combos:
        if combo[0][0] > combo[1][0] and combo[0][1] < combo[1][1]:
            segments.pop(combo[0])
        elif combo[1][0] > combo[0][0] and combo[1][1] < combo[0][1]:
            segments.pop(combo[1])
    return segments
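A small synthetic walk-through of the segment-joining rule above: a file whose bounds line up end-to-start (or start-to-end) with an existing segment is appended to it and the segment is re-keyed by its new (start, end); anything that does not line up starts a new segment. The numbers below are invented:

# Hypothetical file_info after the monotonic check, already sorted by "start":
#   {"name": "a.nc", "start":  0.0, "end":  31.0}   -> primes the first segment (0.0, 31.0)
#   {"name": "b.nc", "start": 31.0, "end":  59.0}   -> joins it, segment becomes (0.0, 59.0)
#   {"name": "c.nc", "start": 90.0, "end": 120.0}   -> no bound matches, new segment (90.0, 120.0)
# Resulting segments: {(0.0, 59.0): ["a.nc", "b.nc"], (90.0, 120.0): ["c.nc"]}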
Example #25
def main():
    parsed_args = parse_args()

    # populate and sort the list of FileItems
    # the list will be sorted based on its name
    futures = []
    files = sorted(
        [
            FileItem(units=None, path=str(x.resolve()))
            for x in Path(parsed_args.input).glob("*.nc")
        ],
        key=operator.attrgetter("path"),
    )

    # set up a process pool, then iterate over all the files;
    # for each file submit a job, and get a future, for its units

    # once the future returns, stick its output into its
    # corresponding position in the output array
    with ProcessPoolExecutor(max_workers=parsed_args.processes) as pool:
        for idx, item in enumerate(files):
            futures.append(
                pool.submit(check_file, idx, item.path, parsed_args.time_name))

        pbar = tqdm(disable=parsed_args.quiet, total=len(files))
        for future in as_completed(futures):
            pbar.update(1)
            idx, units = future.result()
            files[idx].units = units
        pbar.close()

    # walk through the files in order; the first file that doesn't match the
    # expected units is the first one in the bad batch. The offset output should
    # be the difference between the expected first value (previous file end + freq)
    # and the actual first time value of that bad file
    expected_units = files[0].units
    for idx, info in enumerate(files):
        if expected_units != files[idx].units:
            # we load the values of the previous file
            with xr.open_dataset(files[idx - 1].path,
                                 decode_times=False) as ds:
                freq = ds["time_bnds"].values[0][1] - ds["time_bnds"].values[
                    0][0]
                prev_segment_end = ds["time"].values[-1] + freq
            # now we load the current file to get its first time value
            with xr.open_dataset(files[idx].path, decode_times=False) as ds:
                cur_segment_start = ds["time"].values[0]

            # we assume that the second file is always going to have a LOWER time value
            offset = prev_segment_end - cur_segment_start
            message = f"correct_units={expected_units},offset={offset}"
            if messages_path := os.environ.get("message_file"):
                with open(messages_path, "w") as outstream:
                    outstream.write(message.replace(":", "^"))
            else:
                con_message("error",
                            "could not obtain message_path from environment")
                con_message(
                    "error", message
                )  # no idea if this should be info, warning or error
            return 1
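A worked numeric illustration of the offset computed above (all values invented): if the last file with the expected units ends at time 730.0 and its bounds are 30.0 units wide, and the first mismatched file starts back at 0.0, then the offset written to the message file is 760.0, the amount that the unit-fixing step must add to the second segment's time axis:

prev_segment_end = 730.0 + 30.0                # last time value of the previous file plus its bounds width
cur_segment_start = 0.0                        # first time value of the mismatched file
offset = prev_segment_end - cur_segment_start  # -> 760.0, paired with correct_units in the message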
Example #26
def main():
    desc = """This tool will search through a directory full of raw E3SM model time-slice output files, and find/fix any issues with the time index.
    If overlapping time segments are found, it will find the last file of the preceding segment and truncate it to match the index from the first file from the
    second segment."""
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument(
        "input",
        help="The directory to check for time index issues, should only contain a single time-frequency from a single case"
    )
    parser.add_argument(
        "--output",
        default="output",
        required=False,
        help=f"output directory for rectified dataset, default is {os.environ['PWD']}/output"
    )
    parser.add_argument(
        "--move",
        action="store_true",
        required=False,
        help="move the files from the input directory into the output directory instead of symlinks"
    )
    parser.add_argument(
        "--copy",
        action="store_true",
        required=False,
        help="copy the files from the input directory into the output directory instead of symlinks"
    )
    parser.add_argument(
        "-j",
        "--jobs",
        default=8,
        type=int,
        help="the number of processes, default is 8"
    )
    parser.add_argument(
        "--dryrun",
        action="store_true",
        help="Collect the time segments, but dont produce the truncated files or move anything"
    )
    parser.add_argument(
        "--no-gaps", action="store_true", help="Exit if a time gap is discovered"
    )
    parser.add_argument(
        "-q", "--quiet", action="store_true", help="Suppress progress bars"
    )

    args = parser.parse_args()
    inpath = args.input
    outpath = args.output
    num_jobs = args.jobs
    dryrun = args.dryrun
    quiet = args.quiet

    if args.copy and args.move:
        con_message("error", "Both copy and move flags are set, please only pick one")
        return 1

    if os.path.exists(outpath) and len(os.listdir(outpath)):
        con_message(
            "error", f"Output directory {outpath} already exists and contains files")
        return 1
    else:
        os.makedirs(outpath, exist_ok=True)

    timename, bndsname = get_time_names(next(Path(inpath).glob("*")).as_posix())

    segments = collect_segments(inpath, num_jobs, timename, bndsname)

    if len(segments) == 1:
        con_message("info", "No overlapping segments found")
        if dryrun:
            con_message("info", "not moving files")
        else:
            desc = "Placing files into output directory"
            _, files = segments.popitem()
            for src in tqdm(files, desc=desc, disable=quiet):
                _, name = os.path.split(src)
                dst = os.path.join(outpath, name)
                if os.path.exists(dst):
                    continue
                if args.move:
                    move_file(src, dst)
                elif args.copy:
                    copyfile(src, dst)
                else:
                    os.symlink(src, dst)
        return 0

    ordered_segments = []
    for start, end in segments.keys():
        ordered_segments.append(
            {"start": start, "end": end, "files": segments[(start, end)]}
        )

    ordered_segments.sort(key=lambda i: i["start"])

    for s1, s2 in zip(ordered_segments[:-1], ordered_segments[1:]):
        if s2["start"] > s1["end"]:
            msg = f"There's a time gap between the end of {os.path.basename(s1['files'][-1])} and the start of {os.path.basename(s2['files'][0])} of {s2['start'] - s1['end']} "
            if args.no_gaps:
                outpath = Path(outpath)
                if not any(outpath.iterdir()):
                    outpath.rmdir()
                con_message("error", msg)
                sys.exit(1)
            con_message("warning", msg)
            if not args.dryrun:
                con_message("info", "Moving files from the previous segment")
                desc = "Placing files into output directory"
                for src in tqdm(s1["files"], desc=desc, disable=quiet):
                    _, name = os.path.split(src)
                    dst = os.path.join(outpath, name)
                    if os.path.exists(dst):
                        continue
                    if args.move:
                        move_file(src, dst)
                    elif args.copy:
                        copyfile(src, dst)
                    else:
                        os.symlink(src, dst)
                if ordered_segments.index(s2) == len(ordered_segments) - 1:
                    con_message("info", "Moving files from the last segment")
                    desc = "Placing files into output directory"
                    for src in tqdm(s2["files"], desc=desc, disable=quiet):
                        _, name = os.path.split(src)
                        dst = os.path.join(outpath, name)
                        if os.path.exists(dst):
                            continue
                        if args.move:
                            move_file(src, dst)
                        elif args.copy:
                            copyfile(src, dst)
                        else:
                            os.symlink(src, dst)
            continue

        to_truncate = None  # the file that needs to be truncated
        # the index in the file list of segment 1
        truncate_index = len(s1["files"])
        for file in tqdm(s1["files"][::-1], disable=quiet, desc="Stepping backwards to find truncation point"):
            with xr.open_dataset(file, decode_times=False) as ds:
                if ds[bndsname][-1].values[1] > s2["start"]:
                    truncate_index -= 1
                    continue
                else:
                    break

        con_message(
            "info",
            f"removing {len(s1['files']) - truncate_index} files from ({s1['start']}, {s1['end']})",
        )

        new_ds = xr.Dataset()
        to_truncate = s1["files"][truncate_index]
        with xr.open_dataset(to_truncate, decode_times=False) as ds:
            target_index = 0
            for i in range(0, len(ds[bndsname])):
                if ds[bndsname][i].values[1] == s2["start"]:
                    target_index += 1
                    break
                target_index += 1

            con_message(
                "info",
                f"truncating {to_truncate} by removing {len(ds[bndsname]) - target_index} time steps",
            )

            new_ds.attrs = ds.attrs
            for variable in ds.data_vars:
                if "time" not in ds[variable].coords and timename != "Time":
                    new_ds[variable] = ds[variable]
                    new_ds[variable].attrs = ds[variable].attrs
                    continue
                if timename == "time":
                    new_ds[variable] = ds[variable].isel(time=slice(0, target_index))
                    new_ds[variable].attrs = ds[variable].attrs
                else:
                    new_ds[variable] = ds[variable].isel(Time=slice(0, target_index))
                    new_ds[variable].attrs = ds[variable].attrs
                ds[variable].encoding['_FillValue'] = False

        _, to_truncate_name = os.path.split(to_truncate)
        outfile_path = os.path.join(outpath, f"{to_truncate_name[:-3]}.trunc.nc")

        if dryrun:
            con_message("info", f"dryrun, not writing out file {outfile_path}")
        else:
            con_message("info", f"writing out {outfile_path}")
            new_ds.to_netcdf(outfile_path, unlimited_dims=[timename])

        if dryrun:
            con_message("info", "dryrun, not moving files")
        else:
            desc = "Placing files into output directory"
            con_message("info", f"Moving the first {truncate_index} files")
            for src in tqdm(s1["files"][:truncate_index], desc=desc, disable=quiet):
                _, name = os.path.split(src)
                dst = os.path.join(outpath, name)
                if os.path.exists(dst):
                    continue
                if args.move:
                    move_file(src, dst)
                elif args.copy:
                    copyfile(src, dst)
                else:
                    os.symlink(src, dst)
    if dryrun:
        con_message("info", "dryrun, not moving files")
    else:
        con_message("info", "Moving files from the last segment")
        desc = "Placing files into output directory"
        for src in tqdm(ordered_segments[-1]["files"], desc=desc, disable=quiet):
            _, name = os.path.split(src)
            dst = os.path.join(outpath, name)
            if os.path.exists(dst):
                continue
            if args.move:
                move_file(src, dst)
            elif args.copy:
                copyfile(src, dst)
            else:
                os.symlink(src, dst)

    return 0
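The driver above relies on a get_time_names helper (not shown) to discover whether a file uses "time" or "Time" for its time coordinate and "time_bnds" or "time_bounds" for its bounds variable, the same name pairs handled in the other examples on this page. A hedged sketch of such a helper, offered as an assumption:

import xarray as xr

def get_time_names(path):
    # Hypothetical stand-in: return (time_name, bounds_name) detected from one file.
    with xr.open_dataset(path, decode_times=False) as ds:
        time_name = "time" if "time" in ds else "Time"
        bounds_name = "time_bnds" if "time_bnds" in ds else "time_bounds"
    return time_name, bounds_name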