def __init__(self, kwargs):
    backend = SETTINGS.get(f"{self.kind}-plotting-backend", None)
    backend = kwargs.pop("backend", backend)
    options = {}
    options.update(SETTINGS.get("plotting-options", {}))
    options.update(OPTIONS)
    options.update(kwargs)
    self.backend = DRIVERS[backend](Options(options))

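# A small sketch (not library code) of the option precedence established in
# __init__ above: successive dict.update() calls mean explicit kwargs override
# OPTIONS, which override the "plotting-options" setting. Values are made up.
options = {}
options.update({"width": 680})   # e.g. from the "plotting-options" setting
options.update({"width": 800})   # e.g. from module-level OPTIONS
options.update({"width": 1024})  # e.g. from the caller's kwargs
assert options["width"] == 1024  # the caller's value wins
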
def prepare(self, url):
    o = urlparse(url)
    assert o.scheme == "ftp"

    if "@" in o.netloc:
        auth, server = o.netloc.split("@")
        user, password = auth.split(":")
    else:
        auth, server = None, o.netloc
        user, password = "******", "anonymous"

    ftp = FTP(
        server,
        timeout=SETTINGS.get("url-download-timeout"),
    )

    if auth:
        ftp.login(user, password)
    else:
        ftp.login()

    ftp.cwd(os.path.dirname(o.path))
    ftp.set_pasv(True)
    self.filename = os.path.basename(o.path)
    self.ftp = ftp
    return ftp.size(self.filename)

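# A minimal sketch of the credential parsing done in prepare() above:
# urlparse() keeps "user:password@host" inside netloc, so the code splits it
# by hand. The URL below is illustrative.
from urllib.parse import urlparse

o = urlparse("ftp://alice:secret@ftp.example.org/pub/data.grib")
auth, server = o.netloc.split("@")
user, password = auth.split(":")
assert (user, password, server) == ("alice", "secret", "ftp.example.org")
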
def connection(self):
    if self._connection is None:
        cache_dir = SETTINGS.get("cache-directory")
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir, exist_ok=True)
        cache_db = os.path.join(cache_dir, CACHE_DB)
        LOG.debug("Cache database is %s", cache_db)
        self._connection = sqlite3.connect(cache_db)
        # So we can use rows as dictionaries
        self._connection.row_factory = sqlite3.Row

        # If you change the schema, change VERSION above
        self._connection.execute(
            """
            CREATE TABLE IF NOT EXISTS cache (
                    path          TEXT PRIMARY KEY,
                    owner         TEXT NOT NULL,
                    args          TEXT NOT NULL,
                    creation_date TEXT NOT NULL,
                    flags         INTEGER DEFAULT 0,
                    owner_data    TEXT,
                    last_access   TEXT NOT NULL,
                    type          TEXT,
                    parent        TEXT,
                    replaced      TEXT,
                    extra         TEXT,
                    expires       INTEGER,
                    accesses      INTEGER,
                    size          INTEGER);"""
        )
    return self._connection

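# A standalone sketch (in-memory database, made-up row) of why the Row factory
# is set above: sqlite3.Row lets code read columns by name, as the cache
# housekeeping does with n["path"] elsewhere in this module.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("CREATE TABLE cache (path TEXT PRIMARY KEY, size INTEGER)")
conn.execute("INSERT INTO cache VALUES (?, ?)", ("/tmp/example.grib", 1024))
row = conn.execute("SELECT path, size FROM cache").fetchone()
assert row["path"] == "/tmp/example.grib"  # key access via sqlite3.Row
assert row[1] == 1024                      # index access still works
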
def out_of_date(self, url, path, cache_data):
    if SETTINGS.get("check-out-of-date-urls") is False:
        return False

    if self.downloader.out_of_date(path, cache_data):
        if SETTINGS.get("download-out-of-date-urls") or self.update_if_out_of_date:
            LOG.warning(
                "Invalidating cache version and re-downloading %s",
                self.url,
            )
            return True
        else:
            LOG.warning(
                "To enable automatic downloading of updated URLs set the"
                " 'download-out-of-date-urls' setting to True",
            )
    return False

def _check_cache_size(self):
    # Check absolute limit
    size = self._cache_size()
    maximum = SETTINGS.get("maximum-cache-size")
    if maximum is not None and size > maximum:
        self._housekeeping()
        self._decache(size - maximum)

    # Check relative limit
    size = self._cache_size()
    usage = SETTINGS.get("maximum-cache-disk-usage")
    cache_directory = SETTINGS.get("cache-directory")
    df = psutil.disk_usage(cache_directory)
    if df.percent > usage:
        LOG.debug("Cache disk usage %s, limit %s", df.percent, usage)
        self._housekeeping()
        delta = (df.percent - usage) * df.total * 0.01
        self._decache(delta)

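# A worked example (made-up numbers) of the relative-limit arithmetic above:
# psutil reports percent in the 0-100 range, so multiplying the overshoot by
# the total disk size and 0.01 yields the number of bytes to free.
total = 500 * 1024**3         # assume a 500 GiB disk
percent, usage = 92.0, 90.0   # current usage vs. "maximum-cache-disk-usage"
delta = (percent - usage) * total * 0.01
assert delta == 10 * 1024**3  # 2% over on 500 GiB -> free 10 GiB
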
def __repr__(self):
    cache_dir = SETTINGS.get("cache-directory")
    path = getattr(self, "path", None)
    if isinstance(path, str):
        path = path.replace(cache_dir, "CACHE:")
    try:
        reader_class_name = str(self._reader.__class__.__name__)
    except AttributeError as e:
        reader_class_name = str(e)
    except:  # noqa: E722
        reader_class_name = "Unknown"
    return f"{self.__class__.__name__}({path},{reader_class_name})"

def _housekeeping(self, clean=False):
    top = SETTINGS.get("cache-directory")
    with self.connection as db:
        for name in os.listdir(top):
            if name == CACHE_DB:
                continue

            full = os.path.join(top, name)
            count = db.execute(
                "SELECT count(*) FROM cache WHERE path=?", (full,)
            ).fetchone()[0]
            if count > 0:
                continue

            parent = None
            start = full.split(".")[0] + "%"
            for n in db.execute(
                "SELECT path FROM cache WHERE parent IS NULL and path LIKE ?",
                (start,),
            ).fetchall():
                if full.startswith(n["path"]):
                    parent = n["path"]
                    break

            try:
                s = os.stat(full)
                if time.time() - s.st_mtime < 120:  # Two minutes
                    continue
            except OSError:
                pass

            if parent is None:
                LOG.warning(f"CliMetLab cache: orphan found: {full}")
            else:
                LOG.debug(
                    f"CliMetLab cache: orphan found: {full} with parent {parent}"
                )

            self._register_cache_file(
                full,
                "orphans",
                None,
                parent,
            )

    self._update_cache(clean=clean)

def prepare(self, url):
    size = None
    headers = self.headers(url)
    if "content-length" in headers:
        try:
            size = int(headers["content-length"])
        except Exception:
            LOG.exception("content-length %s", url)

    r = requests.get(
        url,
        stream=True,
        verify=self.owner.verify,
        timeout=SETTINGS.get("url-download-timeout"),
        headers=self.owner.http_headers,
    )
    r.raise_for_status()
    self.request = r
    return size

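# A minimal sketch of the streaming pattern used in prepare() above:
# stream=True defers fetching the body, so the size can be read from the
# Content-Length header before any content is downloaded. The URL is
# illustrative and the header may be absent on some servers.
import requests

r = requests.get("https://example.org/", stream=True, timeout=30)
r.raise_for_status()
size = int(r.headers.get("content-length", 0))
print("expected size:", size)
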
def out_of_date(self, url, path, cache_data):
    if cache_data is not None:
        # TODO: check 'cache-control' to see if we should check the etag
        if "cache-control" in cache_data:
            pass

        if "expires" in cache_data:
            try:
                expires = parse_date(cache_data["expires"])
                now = pytz.UTC.localize(datetime.datetime.utcnow())
                if expires > now:
                    LOG.debug("URL %s not expired (%s > %s)", url, expires, now)
                    return False
            except Exception:
                LOG.exception(
                    "Failed to check URL expiry date '%s'",
                    cache_data["expires"],
                )

        headers = self.headers(url)
        cached_etag = cache_data.get("etag")
        remote_etag = headers.get("etag")

        if cached_etag != remote_etag:
            LOG.warning("Remote content of URL %s has changed", url)
            if SETTINGS.get("download-updated-urls") or self.owner.update_if_out_of_date:
                LOG.warning("Invalidating cache version and re-downloading %s", url)
                return True
            LOG.warning(
                "To enable automatic downloading of updated URLs set the"
                " 'download-updated-urls' setting to True",
            )
        else:
            LOG.debug("Remote content of URL %s unchanged", url)

    return False

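# A minimal sketch, assuming the dateutil and pytz imports used above, of the
# "expires" comparison: both datetimes must be timezone-aware before they can
# be ordered. The header value below is illustrative.
import datetime

import pytz
from dateutil.parser import parse as parse_date

expires = parse_date("Wed, 21 Oct 2099 07:28:00 GMT")
now = pytz.UTC.localize(datetime.datetime.utcnow())
assert expires > now  # still fresh, so the cached copy can be reused
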
def __repr__(self):
    cache_dir = SETTINGS.get("cache-directory")
    path = self.path.replace(cache_dir, "CACHE:")
    return f"{self.__class__.__name__}({path},{self._reader.__class__.__name__})"

def __init__(self, kwargs):
    options = {}
    options.update(SETTINGS.get("plotting-options", {}))
    options.update(OPTIONS)
    options.update(kwargs)
    self.driver = Driver(Options(options))

def __init__(
    self,
    url,
    parts=None,
    filter=None,
    merger=None,
    verify=True,
    force=None,
    chunk_size=1024 * 1024,
    range_method="auto",
    http_headers=None,
    update_if_out_of_date=False,
    fake_headers=None,  # When HEAD is not allowed but you know the size
):
    super().__init__(filter=filter, merger=merger)

    # TODO: re-enable this feature
    extension = None

    self.url = url
    self.parts = parts
    LOG.debug("URL %s", url)

    self.update_if_out_of_date = update_if_out_of_date

    self.downloader = Downloader(
        url,
        chunk_size=chunk_size,
        timeout=SETTINGS.get("url-download-timeout"),
        verify=verify,
        parts=parts,
        range_method=range_method,
        http_headers=http_headers,
        fake_headers=fake_headers,
        statistics_gatherer=record_statistics,
        progress_bar=progress_bar,
        resume_transfers=True,
        override_target_file=False,
        download_file_extension=".download",
    )

    if extension and extension[0] != ".":
        extension = "." + extension

    if extension is None:
        extension = self.downloader.extension()

    self.path = self.downloader.local_path()
    if self.path is not None:
        return

    if force is None:
        force = self.out_of_date

    def download(target, _):
        self.downloader.download(target)
        return self.downloader.cache_data()

    self.path = self.cache_file(
        download,
        dict(url=url, parts=parts),
        extension=extension,
        force=force,
    )

def cache_file(
    owner: str,
    create,
    args,
    hash_extra=None,
    extension: str = ".cache",
    force=None,
    replace=None,
):
    """Creates a cache file in the climetlab cache-directory (defined in the
    :py:class:`Settings`). Uses :py:func:`_register_cache_file()`.

    Parameters
    ----------
    owner : str
        The owner of the cache file, generally the name of the source that
        generated the cache.
    extension : str, optional
        Filename extension (such as ".nc" for NetCDF, etc.), by default ".cache".

    Returns
    -------
    path : str
        Full path to the cache file.
    """
    m = hashlib.sha256()
    m.update(owner.encode("utf-8"))
    m.update(json.dumps(args, sort_keys=True).encode("utf-8"))
    m.update(json.dumps(hash_extra, sort_keys=True).encode("utf-8"))
    m.update(json.dumps(extension, sort_keys=True).encode("utf-8"))

    if replace is not None:
        # Don't replace files that are not in the cache
        if not file_in_cache_directory(replace):
            replace = None

    path = os.path.join(
        SETTINGS.get("cache-directory"),
        "{}-{}{}".format(
            owner.lower(),
            m.hexdigest(),
            extension,
        ),
    )

    record = register_cache_file(path, owner, args)
    if os.path.exists(path):
        if callable(force):
            owner_data = record["owner_data"]
            if owner_data is not None:
                owner_data = json.loads(owner_data)
            force = force(args, path, owner_data)

        if force:
            decache_file(path)

    if not os.path.exists(path):
        tmp = ".{}-{}.tmp".format(os.getpid(), threading.get_ident())
        owner_data = create(path + tmp, args)
        os.rename(path + tmp, path)
        update_entry(path, owner_data)
        check_cache_size()

    return path

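# A hypothetical usage sketch of cache_file(): the caller supplies an owner
# name, a create() callback that writes the file at the temporary target path
# it is given, and the args that uniquely identify the content (they are
# hashed into the filename). The callback's return value is stored as
# owner_data. All names below are illustrative, not part of the library.
def _create(target, args):
    with open(target, "w") as f:
        f.write("payload for {!r}".format(args))
    return {"note": "kept as owner_data in the cache database"}

path = cache_file(
    "example-owner",
    _create,
    {"url": "https://example.org/data"},
    extension=".txt",
)
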
def _cache_directory(self):
    cache_directory = SETTINGS.get("cache-directory")
    return cache_directory

def _file_in_cache_directory(self, path):
    cache_directory = SETTINGS.get("cache-directory")
    return path.startswith(cache_directory)

def settings(self, name):
    return SETTINGS.get(name)

def __repr__(self):
    cache_dir = SETTINGS.get("cache-directory")
    path = getattr(self, "path", None)
    if isinstance(path, str):
        path = path.replace(cache_dir, "CACHE:")
    return f"{self.__class__.__name__}({path})"