Example 1
def _write_nc(ds: xr.Dataset, output_dir: str, output_file: str):
    output_file = os.path.join(output_dir, output_file)

    logger.info(f"Writing netcdf to {output_file}")
    with NamedTemporaryFile() as tmpfile:
        ds.to_netcdf(tmpfile.name)
        vcm.get_fs(output_dir).put(tmpfile.name, output_file)
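
A note on the pattern: netCDF writers need a real local file, so the dataset is serialized to a temporary file first and the finished file is then uploaded with the filesystem's put. A minimal self-contained sketch of the same pattern, using fsspec's url_to_fs in place of the project's vcm.get_fs helper (an assumption, since vcm is not shown):

import os
from tempfile import NamedTemporaryFile

import xarray as xr
from fsspec.core import url_to_fs


def write_nc(ds: xr.Dataset, output_dir: str, output_file: str) -> None:
    output_url = os.path.join(output_dir, output_file)
    fs, _ = url_to_fs(output_dir)
    # Serialize locally, then upload the finished file to the target,
    # which may be local or remote (e.g. gs://...).
    with NamedTemporaryFile(suffix=".nc") as tmpfile:
        ds.to_netcdf(tmpfile.name)
        fs.put(tmpfile.name, output_url)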
Example 2
def open_netcdf_dataset(path: str) -> xr.Dataset:
    """Open a netcdf from a local/remote path"""

    fs = get_fs(path)
    data = open_remote_nc(fs, path)

    return data
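
open_remote_nc is a project helper that is not shown in this example. A hedged stand-in, assuming it simply reads the file's bytes through the filesystem and parses them in memory, might look like:

import io

import fsspec
import xarray as xr


def open_remote_nc(fs: fsspec.AbstractFileSystem, path: str) -> xr.Dataset:
    # Hypothetical stand-in: read the raw bytes and let xarray parse them
    # from an in-memory buffer, so the returned dataset holds no open
    # handle on the remote file.
    return xr.open_dataset(io.BytesIO(fs.read_bytes(path))).load()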
Example 3
def read_last_segment(run_url):
    fs = vcm.get_fs(run_url)
    artifacts_dir = os.path.join(run_url, "artifacts")
    try:
        segments = sorted(fs.ls(artifacts_dir))
    except FileNotFoundError:
        segments = []

    if len(segments) > 0:
        return vcm.to_url(fs, segments[-1])
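
Two things are implicit here: sorted(fs.ls(...)) assumes segment names sort chronologically, and the function falls through to return None when no segments exist, so callers can distinguish a fresh run from a restart. A hedged usage sketch, with a placeholder URL:

last_segment = read_last_segment("gs://bucket/my-run")
if last_segment is None:
    print("no artifacts yet; starting from initial conditions")
else:
    print(f"restarting from {last_segment}")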
Example 4
def load_final_model_or_checkpoint(train_out_url) -> tf.keras.Model:

    model_url = os.path.join(train_out_url, "model.tf")
    checkpoints = os.path.join(train_out_url, "checkpoints", "*.tf")

    fs = get_fs(train_out_url)
    if fs.exists(model_url):
        logger.info(f"Loading model for scoring from: {model_url}")
        url_to_load = model_url
    elif fs.glob(checkpoints):
        url_to_load = sorted(fs.glob(checkpoints))[-1]
        logger.info(
            f"Loading last model checkpoint for scoring from: {url_to_load}")
    else:
        raise FileNotFoundError(f"No keras models found at {train_out_url}")

    return tf.keras.models.load_model(url_to_load)
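
Note that sorted(fs.glob(checkpoints))[-1] picks the lexicographically last checkpoint, which is the most recent one only if checkpoint names sort chronologically, e.g. zero-padded epoch numbers. (Also, fsspec's glob strips the protocol prefix, which Example 7 below works around.) A small illustration with made-up file names:

# Zero-padded names sort chronologically; unpadded names may not.
padded = sorted(["epoch.0002.tf", "epoch.0009.tf", "epoch.0010.tf"])
assert padded[-1] == "epoch.0010.tf"  # the latest checkpoint, as intended

unpadded = sorted(["epoch.2.tf", "epoch.9.tf", "epoch.10.tf"])
assert unpadded[-1] == "epoch.9.tf"  # lexicographic order picks a stale file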
Example 5
def open_diagnostics_outputs(
    data_dir,
    diagnostics_nc_name: str,
    transect_nc_name: str,
    metrics_json_name: str,
    metadata_json_name: str,
) -> Tuple[xr.Dataset, xr.Dataset, dict, dict]:
    fs = vcm.get_fs(data_dir)
    with fs.open(os.path.join(data_dir, diagnostics_nc_name), "rb") as f:
        ds_diags = xr.open_dataset(f).load()
    transect_full_path = os.path.join(data_dir, transect_nc_name)
    if fs.exists(transect_full_path):
        with fs.open(transect_full_path, "rb") as f:
            ds_transect = xr.open_dataset(f).load()
    else:
        ds_transect = xr.Dataset()
    with fs.open(os.path.join(data_dir, metrics_json_name), "r") as f:
        metrics = json.load(f)
    with fs.open(os.path.join(data_dir, metadata_json_name), "r") as f:
        metadata = json.load(f)
    return ds_diags, ds_transect, metrics, metadata
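
Calling .load() inside each with block is deliberate: xr.open_dataset is lazy, and the file object closes when the block exits, so the data must be pulled into memory first. A sketch of the failure mode this avoids (the bucket and file names are hypothetical):

import vcm
import xarray as xr

fs = vcm.get_fs("gs://bucket")
with fs.open("gs://bucket/diags.nc", "rb") as f:
    lazy = xr.open_dataset(f)           # lazy: variable data not read yet
# Accessing lazy["some_var"].values here can fail, because f is closed.
# Calling .load() before the block exits, as above, avoids this.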
Example 6
def netcdf_url_to_dataset(url: str,
                          variables: Sequence[str],
                          shuffle: bool = False) -> tf.data.Dataset:
    """Open a url of netcdfs as a tf.data.Dataset of dicts

    Args:
        url: points to a directory of netcdf files.
        variables: a sequence of variable names to load from each netcdf file
        shuffle: if True, shuffle order the netcdf files will be loaded in. Does
            not shuffle BETWEEN files.
    
    Returns:
        a  tensorflow dataset containing dictionaries of tensors. This
        dictionary contains all the variables specified in ``variables``.
    """
    fs = vcm.get_fs(url)
    files = get_nc_files(url, fs)
    d = tf.data.Dataset.from_tensor_slices(sorted(files))
    if shuffle:
        d = d.shuffle(100_000)
    return d.map(lambda fname: read_variables_as_dict(fs, fname, variables))
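
A hedged usage sketch; the URL and variable names are placeholders:

dataset = netcdf_url_to_dataset(
    "gs://bucket/training-data",
    variables=["air_temperature", "specific_humidity"],
    shuffle=True,
)
for example in dataset.take(1):
    print({name: tensor.shape for name, tensor in example.items()})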
Example 7
def get_nc_files(path: str,
                 fs: Optional[fsspec.AbstractFileSystem] = None) -> List[str]:
    """
    Get a list of netCDF files from a remote/local directory

    Args:
        path: Local path or remote GCS URL of a netCDF directory
        fs: Filesystem object to use for the glob operation
            searching for netCDFs in the path
    """

    if fs is None:
        fs = get_fs(path)

    files = list(fs.glob(os.path.join(path, "*.nc")))

    # we want to preserve information about the remote protocol
    # so any downstream operations can glean that info from the paths
    if "gs" in fs.protocol:
        files = ["gs://" + f for f in files]

    return files
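
The prefix re-adding exists because fsspec's glob returns paths with the scheme stripped; without it, downstream code that resolves a filesystem from the path would mistake a GCS path for a local one. The in-memory filesystem shows the same stripping behavior without needing cloud credentials (paths below are hypothetical):

import fsspec

fs = fsspec.filesystem("memory")
fs.pipe("/data/a.nc", b"placeholder bytes")       # create a fake file
print(fs.glob("memory://data/*.nc"))              # -> ['/data/a.nc']
# The "memory://" scheme is gone, just as "gs://" is stripped on GCS.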
Example 8
def _open_dataset(path: str) -> xr.Dataset:
    # The original excerpt is truncated; this header is a hypothetical
    # reconstruction of the elided function signature.
    ds = xr.open_dataset(path)
    ds.load()
    return ds


def shuffled(values: Iterable[Any]) -> Tuple[Any, ...]:
    values = list(values)
    random.shuffle(values)
    return tuple(values)


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    fs = vcm.get_fs(args.arrays_dir)
    filenames = sorted(fs.listdir(args.arrays_dir, detail=False))
    first_filename = filenames[0]

    with open(args.config, "rb") as f:
        config = yaml.safe_load(f)
    np.random.seed(config.get("random_seed", 0))
    random.seed(config.get("random_seed", 0) + 1)
    tf.random.set_seed(config.get("random_seed", 0) + 2)

    regularizer = getattr(tf.keras.regularizers, config["regularizer"]["name"])(
        **config["regularizer"]["kwargs"]
    )
    optimizer_class = getattr(tf.keras.optimizers, config["optimizer"]["name"])
    optimizer_kwargs = config["optimizer"]["kwargs"]
    optimizer = optimizer_class(**optimizer_kwargs)
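
The getattr lookups mean the YAML config names Keras classes directly. An illustrative config that satisfies this block, shown as the dict yaml.safe_load would produce (the names must be attributes of tf.keras.regularizers and tf.keras.optimizers; the values are made up):

config = {
    "random_seed": 0,
    "regularizer": {"name": "L2", "kwargs": {"l2": 1e-4}},
    "optimizer": {"name": "Adam", "kwargs": {"learning_rate": 1e-3}},
}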
Example 9
def read_run_config(run_url):
    fs = vcm.get_fs(run_url)
    s = fs.cat(os.path.join(run_url, "fv3config.yml"))
    return fv3config.load(io.BytesIO(s))
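
fs.cat returns raw bytes, hence the BytesIO wrapper before handing the stream to fv3config.load. If fv3config is unavailable, a hedged stand-in can parse the same bytes as plain YAML into a dict:

import io
import os

import vcm
import yaml


def read_run_config_as_dict(run_url: str) -> dict:
    # Hypothetical substitute for Example 9: parses fv3config.yml as
    # generic YAML instead of an fv3config configuration object.
    fs = vcm.get_fs(run_url)
    raw = fs.cat(os.path.join(run_url, "fv3config.yml"))  # bytes
    return yaml.safe_load(io.BytesIO(raw))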