import io
import json
import logging
import os
import random
from tempfile import NamedTemporaryFile
from typing import Any, Iterable, List, Optional, Sequence, Tuple

import fsspec
import fv3config
import numpy as np
import tensorflow as tf
import vcm
import xarray as xr
import yaml
from vcm import get_fs

# NOTE: open_remote_nc, read_variables_as_dict, and get_parser are helpers
# assumed to be provided elsewhere in the surrounding package.

logger = logging.getLogger(__name__)


def _write_nc(ds: xr.Dataset, output_dir: str, output_file: str):
    """Write a dataset to a local/remote path via a local temporary file."""
    output_file = os.path.join(output_dir, output_file)
    with NamedTemporaryFile() as tmpfile:
        ds.to_netcdf(tmpfile.name)
        vcm.get_fs(output_dir).put(tmpfile.name, output_file)
    logger.info(f"Writing netcdf to {output_file}")
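# Example usage of ``_write_nc`` (a sketch; the bucket path and variable
# below are hypothetical):
#
#     ds = xr.Dataset({"air_temperature": ("z", np.arange(10.0))})
#     _write_nc(ds, "gs://my-bucket/diagnostics", "diags.nc")
#
# Writing to a local temporary file first and then uploading with fs.put
# avoids needing a netCDF backend that can write directly to remote object
# storage.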
def open_netcdf_dataset(path: str) -> xr.Dataset:
    """Open a netcdf from a local/remote path"""
    fs = get_fs(path)
    data = open_remote_nc(fs, path)
    return data
def read_last_segment(run_url):
    """Return the URL of the latest segment under {run_url}/artifacts, if any."""
    fs = vcm.get_fs(run_url)
    artifacts_dir = os.path.join(run_url, "artifacts")
    try:
        segments = sorted(fs.ls(artifacts_dir))
    except FileNotFoundError:
        segments = []
    if len(segments) > 0:
        return vcm.to_url(fs, segments[-1])
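# ``read_last_segment`` assumes a run directory layout with timestamped
# segment directories under ``artifacts`` (hypothetical paths):
#
#     gs://bucket/run/artifacts/20160801.000000/
#     gs://bucket/run/artifacts/20160811.000000/
#
# Because the segment names sort lexicographically by timestamp, the last
# entry of the sorted listing is the most recent segment; the function
# implicitly returns None when no segments exist.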
def load_final_model_or_checkpoint(train_out_url) -> tf.keras.Model:
    model_url = os.path.join(train_out_url, "model.tf")
    checkpoints = os.path.join(train_out_url, "checkpoints", "*.tf")
    fs = get_fs(train_out_url)
    if fs.exists(model_url):
        logger.info(f"Loading model for scoring from: {model_url}")
        url_to_load = model_url
    elif fs.glob(checkpoints):
        url_to_load = sorted(fs.glob(checkpoints))[-1]
        logger.info(f"Loading last model checkpoint for scoring from: {url_to_load}")
    else:
        raise FileNotFoundError(f"No keras models found at {train_out_url}")
    return tf.keras.models.load_model(url_to_load)
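# ``load_final_model_or_checkpoint`` expects a training output layout like
# the following (paths are illustrative):
#
#     {train_out_url}/model.tf          <- final saved model, preferred
#     {train_out_url}/checkpoints/*.tf  <- fallback: latest checkpoint wins
#
# e.g. model = load_final_model_or_checkpoint("gs://bucket/train-output")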
def open_diagnostics_outputs(
    data_dir,
    diagnostics_nc_name: str,
    transect_nc_name: str,
    metrics_json_name: str,
    metadata_json_name: str,
) -> Tuple[xr.Dataset, xr.Dataset, dict, dict]:
    fs = vcm.get_fs(data_dir)
    with fs.open(os.path.join(data_dir, diagnostics_nc_name), "rb") as f:
        ds_diags = xr.open_dataset(f).load()
    transect_full_path = os.path.join(data_dir, transect_nc_name)
    if fs.exists(transect_full_path):
        with fs.open(transect_full_path, "rb") as f:
            ds_transect = xr.open_dataset(f).load()
    else:
        ds_transect = xr.Dataset()
    with fs.open(os.path.join(data_dir, metrics_json_name), "r") as f:
        metrics = json.load(f)
    with fs.open(os.path.join(data_dir, metadata_json_name), "r") as f:
        metadata = json.load(f)
    return ds_diags, ds_transect, metrics, metadata
def netcdf_url_to_dataset(
    url: str, variables: Sequence[str], shuffle: bool = False
) -> tf.data.Dataset:
    """Open a url of netcdfs as a tf.data.Dataset of dicts

    Args:
        url: points to a directory of netcdf files.
        variables: a sequence of variable names to load from each netcdf file
        shuffle: if True, shuffle the order in which the netcdf files are
            loaded. Samples are not shuffled between files.

    Returns:
        a tensorflow dataset containing dictionaries of tensors. This
        dictionary contains all the variables specified in ``variables``.
    """
    fs = vcm.get_fs(url)
    files = get_nc_files(url, fs)
    d = tf.data.Dataset.from_tensor_slices(sorted(files))
    if shuffle:
        d = d.shuffle(100_000)
    return d.map(lambda url: read_variables_as_dict(fs, url, variables))
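# A minimal usage sketch for ``netcdf_url_to_dataset``; the URL and variable
# names below are hypothetical:
#
#     tf_ds = netcdf_url_to_dataset(
#         "gs://bucket/training-netcdfs",
#         variables=["air_temperature", "specific_humidity"],
#         shuffle=True,
#     )
#     for batch in tf_ds.take(1):
#         print(batch["air_temperature"].shape)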
def get_nc_files(
    path: str, fs: Optional[fsspec.AbstractFileSystem] = None
) -> List[str]:
    """Get a list of netCDF files from a remote/local directory

    Args:
        path: local or remote gcs path to a netCDF directory
        fs: filesystem object to use for the glob operation searching for
            netCDFs in the path
    """
    if fs is None:
        fs = get_fs(path)
    files = list(fs.glob(os.path.join(path, "*.nc")))
    # we want to preserve information about the remote protocol
    # so any downstream operations can glean that info from the paths
    if "gs" in fs.protocol:
        files = ["gs://" + f for f in files]
    return files
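# For example (hypothetical bucket), fs.glob("gs://bucket/data/*.nc") returns
# protocol-less paths like "bucket/data/a.nc"; the prefixing above restores
# them to "gs://bucket/data/a.nc" so the files remain openable via fsspec.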
# NOTE: the enclosing ``def`` line of the next function was lost from this
# excerpt; a minimal signature is assumed here.
def _open_dataset(path: str) -> xr.Dataset:
    ds = xr.open_dataset(path)
    ds.load()
    return ds


def shuffled(values: Iterable[Any]) -> Tuple[Any, ...]:
    """Return the input values as a tuple in random order."""
    values = list(values)
    random.shuffle(values)
    return tuple(values)


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    fs = vcm.get_fs(args.arrays_dir)
    filenames = sorted(fs.listdir(args.arrays_dir, detail=False))
    first_filename = filenames[0]

    with open(args.config, "rb") as f:
        config = yaml.safe_load(f)

    # seed the numpy, python, and tensorflow RNGs with distinct offsets
    np.random.seed(config.get("random_seed", 0))
    random.seed(config.get("random_seed", 0) + 1)
    tf.random.set_seed(config.get("random_seed", 0) + 2)

    # look up the regularizer and optimizer classes by name and construct
    # them from keyword arguments given in the config
    regularizer = getattr(tf.keras.regularizers, config["regularizer"]["name"])(
        **config["regularizer"]["kwargs"]
    )
    optimizer_class = getattr(tf.keras.optimizers, config["optimizer"]["name"])
    optimizer_kwargs = config["optimizer"]["kwargs"]
    optimizer = optimizer_class(**optimizer_kwargs)
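# The getattr-based construction above expects config entries naming Keras
# classes and their keyword arguments. A sketch of a matching YAML config
# (values are illustrative, not from the source):
#
#     random_seed: 0
#     regularizer:
#       name: L2
#       kwargs:
#         l2: 0.0001
#     optimizer:
#       name: Adam
#       kwargs:
#         learning_rate: 0.001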
def read_run_config(run_url):
    """Read the fv3config configuration stored in a run directory."""
    fs = vcm.get_fs(run_url)
    s = fs.cat(os.path.join(run_url, "fv3config.yml"))
    return fv3config.load(io.BytesIO(s))