def log_to_file(fname):
    """
    Appends the given file path to the logger so that stdout and the
    file are both output streams for the current logger
    """
    import logging
    from pathlib import Path as posixpath

    fname = posixpath(fname)
    fname.parent.mkdir(exist_ok=True, parents=True)

    logger = logging.getLogger("fetch_data")

    # remove existing file handlers (iterate over a copy so that removing
    # entries does not skip any handlers)
    for handler in list(logger.handlers):
        if isinstance(handler, logging.FileHandler):
            logger.removeHandler(handler)

    # add the new file handler with the formatting
    logFormatter = logging.Formatter(
        "%(asctime)s [%(name)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    fileHandler = logging.FileHandler(fname)
    fileHandler.setFormatter(logFormatter)
    logger.addHandler(fileHandler)

    # log via the "fetch_data" logger so that these messages reach the new file
    logger.info("=" * 80)
    logger.info("Start of logging session")
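# Example (illustrative sketch, not part of the package API): how `log_to_file`
# is expected to be used. The log path below is hypothetical.
def _example_log_to_file():
    import logging

    # attach a file handler to the "fetch_data" logger; any handlers that are
    # already configured (e.g. stdout) stay in place
    log_to_file("../logs/fetch_data.log")

    logger = logging.getLogger("fetch_data")
    logger.setLevel(logging.INFO)
    logger.info("this message is written to the log file")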
def reccap2_ocean_masks(url, dest):
    """Downloads the RECCAP2 ocean region masks and opens them as a dataset"""
    from pathlib import Path as posixpath

    import pooch
    import xarray as xr

    fname = pooch.retrieve(url, None, posixpath(url).name, dest)
    ds = xr.open_dataset(fname)

    return ds
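# Example (sketch): fetching the RECCAP2 masks with `reccap2_ocean_masks`.
# The URL is the one used by `_get_southern_ocean_subregions` further down;
# the destination folder is an assumption.
def _example_reccap2_ocean_masks():
    url = (
        "https://github.com/RECCAP2-ocean/shared-resources/raw/master/"
        "regions/RECCAP2_region_masks_all.nc"
    )
    ds = reccap2_ocean_masks(url, "../data/regions/")
    # the dataset includes the `southern` mask used by the subregion helper
    print(ds.data_vars)
    return ds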
def get_cache_path(url, cache_dir=None):
    """
    Creates the path for the cache used to store remote file names.

    Saves time when the file list for the same URL has to be fetched again.
    """
    import hashlib
    import tempfile
    from pathlib import Path as posixpath

    if cache_dir is None:
        cache_dir = tempfile.gettempdir()

    cache_fname = hashlib.md5(str(url).encode()).hexdigest()
    cache_path = posixpath(f"{cache_dir}/{cache_fname}")

    return cache_path
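# Example (sketch): the cache path is deterministic for a given URL, so the
# same remote listing always maps to the same file in the temp directory.
# The URL below is a hypothetical placeholder.
def _example_get_cache_path():
    url = "ftp://ftp.example.com/data/*.nc"
    path_a = get_cache_path(url)
    path_b = get_cache_path(url)
    assert path_a == path_b  # md5 of the URL gives a stable file name
    print(path_a)  # e.g. /tmp/<md5-hexdigest> on Linux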
def _get_southern_ocean_subregions(
    url='https://github.com/RECCAP2-ocean/shared-resources/raw/master/regions/RECCAP2_region_masks_all.nc',
    dest='../data/regions/',
):
    """Builds Southern Ocean basin, biome, and subregion masks from the RECCAP2 regions file"""
    import itertools
    from pathlib import Path as posixpath

    import pandas as pd
    import pooch
    import xarray as xr

    fname = pooch.retrieve(url, None, posixpath(url).name, dest)
    ds = xr.open_dataset(fname)
    mask = ds.southern

    # split the Southern Ocean mask into three basins by longitude
    atlantic = (((mask.lon > 290) | (mask.lon <= 20)) & (mask > 0)).astype(int) * 1
    indian = (((mask.lon > 20) & (mask.lon <= 147)) & (mask > 0)).astype(int) * 2
    pacific = (((mask.lon > 147) & (mask.lon <= 290)) & (mask > 0)).astype(int) * 3

    mask = xr.Dataset()
    mask['biomes'] = ds.southern.copy()
    mask['basins'] = (pacific + atlantic + indian).transpose('lat', 'lon')
    mask['subregions'] = (
        (mask.basins * 3 + mask.biomes - 3).where(lambda a: a > 0).fillna(0).astype(int)
    )

    basin = ['ATL', 'IND', 'PAC']
    biome = ['STSS', 'SPSS', 'ICE']
    names = ['-'.join(l) for l in itertools.product(basin, biome)]
    mask['names'] = xr.DataArray(names, coords={'idx': range(1, 10)}, dims=('idx'))

    mask['names'].attrs['description'] = 'Names for the subregions'
    mask['subregions'].attrs['description'] = '(basins * 3 + biomes - 3)'
    mask['basins'].attrs['description'] = 'Atlantic = 1, Indian = 2, Pacific = 3'
    mask['biomes'].attrs['description'] = (
        'Biomes based on Fay and McKinley (2014), STSS=1, SPSS=2, ICE=3'
    )

    mask.attrs['source'] = url
    mask.attrs['date'] = pd.Timestamp.today().strftime('%Y-%m-%d')

    return mask
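# Example (sketch): combining the subregion mask with the name lookup to select
# one region, here the Pacific ICE biome. Calling this downloads the regions
# file via pooch, so network access and the default relative `dest` are assumed.
def _example_southern_ocean_subregions():
    mask = _get_southern_ocean_subregions()

    # names are indexed 1-9 in the same order as the subregion integers
    names = mask.names.to_series()
    pac_ice = int(names[names == 'PAC-ICE'].index[0])

    # boolean mask for the Pacific ICE subregion on the lat/lon grid
    region = mask.subregions == pac_ice
    print(names.to_dict())
    print('grid cells in PAC-ICE:', region.sum().item())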
def create_download_readme(fname, **entry):
    """
    Creates a README file based on the information in the source dictionary.

    Parameters
    ----------
    fname: str
        name of the README file that will be written to ``entry['dest']``
    **entry: kwargs
        must contain ``dest``; ``name``, ``url``, and ``meta`` are used
        if present
    """
    from pathlib import Path as posixpath

    from .utils import commong_substring, make_readme_file

    dest = entry.get("dest")

    # readme will always be overwritten
    readme_fname = posixpath(f"{dest}/{fname}")
    readme_fname.parent.mkdir(parents=True, exist_ok=True)

    url = entry.get("url", None)
    if isinstance(url, (list, tuple)):
        url = commong_substring(url) + "..."

    readme_text = make_readme_file(
        entry.get("name", ""),
        url,
        entry.get("meta", {}),
        # guard against url=None when computing the line-length limit
        short_info_len_limit=max([120, len(url or "")]),
    )

    with open(readme_fname, "w") as file:
        file.write(readme_text)
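# Example (sketch): the `entry` kwargs mirror a typical catalog entry. All
# values below are hypothetical placeholders.
def _example_create_download_readme():
    create_download_readme(
        "README.txt",
        dest="../data/example_dataset",
        name="Example dataset",
        url="https://example.com/data/file_2000.nc",
        meta={"doi": "...", "citation": "..."},
    )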
def log_to_file(fname):
    """Attaches a file handler for the given path to the root logger"""
    import logging
    from pathlib import Path as posixpath

    fname = posixpath(fname)
    fname.parent.mkdir(exist_ok=True, parents=True)

    rootLogger = logging.getLogger()

    # remove existing file handlers (iterate over a copy so that removing
    # entries does not skip any handlers)
    for handler in list(rootLogger.handlers):
        if isinstance(handler, logging.FileHandler):
            rootLogger.removeHandler(handler)

    # add the new file handler with the formatting
    logFormatter = logging.Formatter(
        "%(asctime)s [DOWNLOAD] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    fileHandler = logging.FileHandler(fname)
    fileHandler.setFormatter(logFormatter)
    rootLogger.addHandler(fileHandler)

    logging.info("=" * 80 + "\n" * 2)
    logging.info("Start of logging session")
    logging.info("-" * 80)
def create_download_readme(**source_dict):
    import inspect
    from pathlib import Path as posixpath

    # `readme`, `cache`, and `make_readme_file` are assumed to be defined at
    # module level (default file names and the README template helper)
    dest = source_dict.get("dest").format_map(source_dict)
    cache_fname = f"{source_dict.get('dest')}/{cache}"

    manipulation = inspect.cleandoc(
        f"""
        Data has been downloaded directly from the server shown in URL.
        There has been no modification to the original files.
        There may be a data cache located in the annual subfolders of each
        with the format {cache_fname.replace('//', '/')}
        """
    )

    args = [
        source_dict.get("name", ""),
        source_dict.get("meta", {}).get("doi", None),
        source_dict.get("url", None),
        source_dict.get("meta", {}).get("citation", None),
        source_dict.get("meta", {}).get("description", None),
        source_dict.get("variables", []),
        manipulation,
    ]

    readme_fname = posixpath(f"{dest}/{readme}")
    readme_fname.parent.mkdir(parents=True, exist_ok=True)

    email = source_dict.get("email", None)
    # use a distinct name so the stdlib `logging` module is not shadowed
    download_logging = source_dict.get("download_logging", "None")

    readme_text = make_readme_file(
        *args, email=email, download_logging=download_logging
    )

    with open(readme_fname, "w") as file:
        file.write(readme_text)
def get_url_list(
    url,
    username=None,
    password=None,
    use_cache=True,
    cache_path="./_urls_{hash}.cache",
    **kwargs,
):
    """If a url has a wildcard (*) value, remote files will be searched.

    Leverages off the `fsspec` package. This doesn't work for all HTTP urls.

    Parameters:
        url (str): If a url has a wildcard (*) value, remote files will be
            searched for
        username (str): if required for given url and protocol (e.g. FTP)
        password (str): if required for given url and protocol (e.g. FTP)
        cache_path (str): the path where the cached files will be stored. Has
            a special case where `{hash}` will be replaced with a hash based
            on the URL.
        use_cache (bool): if there is a file with cached remote urls, then
            those values will be returned as a list

    Returns:
        list: a sorted list of urls
    """
    from pathlib import Path as posixpath
    from urllib.parse import urlparse

    from .utils import make_hash_string

    # `fsspec` and `logger` are expected to be available at module level
    if "*" not in url:
        return [url]

    if "{hash}" in cache_path:
        cache_path = cache_path.format(hash=make_hash_string(url))

    if use_cache:
        cache_path = posixpath(cache_path)
        if cache_path.is_file():
            with open(cache_path, "r") as file:
                flist = file.read().split("\n")
            logger.log(
                15, f"Fetched {len(flist)} files from flist cache: {cache_path}"
            )
            return sorted(flist)

    purl = urlparse(url)
    protocol = purl.scheme
    host = purl.netloc
    path = purl.path

    logger.log(15, f"Fetching filenames from {url}")

    props = {"protocol": protocol}
    if not protocol.startswith("http"):
        props.update({"host": host})
    if username is not None:
        props["username"] = username
    if password is not None:
        props["password"] = password

    fs = fsspec.filesystem(**props)
    if protocol.startswith("http"):
        path = f"{protocol}://{host}/{path}"

    try:
        flist = fs.glob(path)
    except AttributeError:
        raise FileNotFoundError(f"The given url does not exist: {url}")
    except TypeError:
        raise KeyError(
            f"The host {protocol}://{host} does not accept username/password"
        )

    if not protocol.startswith("https"):
        flist = [f"{protocol}://{host}{f}" for f in fs.glob(path)]

    # writing url list to cache file
    if use_cache:
        cache_path.parent.mkdir(exist_ok=True, parents=True)
        with open(cache_path, "w") as out_file:
            out_file.write("\n".join(flist))
        logger.log(15, f"Cached {len(flist)} urls to: {cache_path}")

    logger.debug(flist)

    return sorted(flist)
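# Example (sketch): expanding a wildcard FTP url with this variant of
# `get_url_list`. The server, credentials, and path are hypothetical; the
# `{hash}` placeholder in `cache_path` is replaced with a hash of the url so
# that different wildcard urls do not share a cache file.
def _example_get_url_list_wildcard():
    flist = get_url_list(
        "ftp://ftp.example.com/ocean/sst_*.nc",
        username="anonymous",
        password="guest",
        use_cache=True,
        cache_path="./_urls_{hash}.cache",
    )
    print(f"{len(flist)} files found")
    return flist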
def get_url_list(
    url,
    username=None,
    password=None,
    cache_path=None,
    use_cache=True,
    raise_on_empty=True,
):
    """
    If a url has a wildcard (*) value, remote files will be searched for.

    Leverages off the `fsspec` package. This doesn't work for all HTTP urls.

    Parameters
    ----------
    username: str
        if required for given url and protocol (e.g. FTP)
    password: str
        if required for given url and protocol (e.g. FTP)
    cache_path: str
        the path where the cached files will be stored
    use_cache: bool
        if there is a file with cached remote urls, then those values will
        be returned as a list
    raise_on_empty: bool
        if there are no files, raise an error or silently pass

    Returns
    -------
    list:
        a sorted list of urls
    """
    from pathlib import Path as posixpath
    from urllib.parse import urlparse

    from aiohttp import ClientResponseError
    from pandas import Series, read_csv

    # `fsspec` and the stdlib `logging` used below are expected to be
    # imported at module level
    if cache_path is None:
        cache_path = get_cache_path(url)
    else:
        cache_path = posixpath(cache_path)

    if cache_path.is_file() and use_cache:
        flist = read_csv(str(cache_path), index_col=False).iloc[:, 0].to_list()
        logging.log(
            15, f"Fetched {len(flist)} files from flist cache: {cache_path}"
        )
        logging.debug(flist)
        return sorted(flist)

    purl = urlparse(url)
    protocol = purl.scheme
    host = purl.netloc
    path = purl.path

    logging.log(15, f"Fetching filenames from {url}")

    props = {"protocol": protocol}
    if not protocol.startswith("http"):
        props.update({"host": host})
    if username is not None:
        props["username"] = username
    if password is not None:
        props["password"] = password

    fs = fsspec.filesystem(**props)
    if protocol.startswith("http"):
        path = f"{protocol}://{host}/{path}"

    try:
        flist = fs.glob(path)
    except ClientResponseError:
        if raise_on_empty:
            raise ValueError(f"No files could be found for the url: {url}")
        else:
            return []
    else:
        flist = [f"{protocol}://{host}{f}" for f in fs.glob(path)]

    no_files = len(flist) == 0
    if no_files and raise_on_empty:
        raise ValueError(f"No files could be found for the url: {url}")
    if no_files and not use_cache:
        return flist

    # writing url list to cache file
    cache_path.parent.mkdir(exist_ok=True, parents=True)
    Series(flist, dtype="str").to_csv(str(cache_path), index=False)

    logging.log(15, f"Cached {len(flist)} urls to: {cache_path}")
    logging.debug(flist)

    return sorted(flist)
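# Example (sketch): the second variant caches the url list with pandas and can
# return an empty list instead of raising when nothing matches. The url is a
# hypothetical placeholder; `cache_path=None` falls back to `get_cache_path`,
# which stores the list under the system temp directory.
def _example_get_url_list_quiet():
    flist = get_url_list(
        "https://example.com/data/model_output_*.nc",
        use_cache=True,
        cache_path=None,
        raise_on_empty=False,  # an empty result returns [] rather than raising
    )
    if not flist:
        print("no remote files matched the wildcard")
    return flist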