def save_url_infos(
    path: epath.Path,
    url_infos: Dict[str, UrlInfo],
) -> None:
  """Store given checksums and sizes for specific dataset.

  Content of file is never discarded, only updated. This is to ensure that if
  the process is killed right after the first download finishes, checksums
  registered during previous runs aren't lost.

  It is the responsibility of the caller not to call this function multiple
  times in parallel for a given dataset.

  Only the original file content is updated. This means the entire set of new
  sizes and checksums must be given at every call.

  Args:
    path: Path to the resources.
    url_infos: dict, {url: (size_in_bytes, checksum)}.
  """
  original_data = load_url_infos(path) if path.exists() else {}
  new_data = original_data.copy()
  new_data.update(url_infos)
  # Compare filenames separately, as the filename field is eq=False
  if original_data == new_data and _filenames_equal(original_data, new_data):
    return
  lines = [
      f'{url}\t{int(url_info.size)}\t{url_info.checksum}\t'
      f'{url_info.filename or ""}\n'
      for url, url_info in sorted(new_data.items())
  ]
  path.write_text(''.join(lines), encoding='UTF-8')
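# Example usage (a minimal sketch with hypothetical values; assumes `UrlInfo`
# is the dataclass with `size`, `checksum` and `filename` fields used above):
#
#   checksums_path = epath.Path('/tmp/my_dataset/checksums.tsv')
#   save_url_infos(
#       checksums_path,
#       {
#           'https://example.com/data.zip': UrlInfo(
#               size=1024,
#               checksum='0' * 64,  # hex sha256 digest
#               filename='data.zip',
#           ),
#       },
#   )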
def _get_cached_copy(file_path: epath.Path, max_age_days: int) -> Optional[str]:
  """Returns the file content if it is younger than `max_age_days`, else None."""
  if file_path.exists():
    stats = os.stat(file_path)
    modified_time = datetime.datetime.fromtimestamp(stats.st_mtime)
    if modified_time > datetime.datetime.now() - datetime.timedelta(
        days=max_age_days):
      return file_path.read_text()
  return None
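# Example usage (a minimal sketch; the cache path and the refresh helper are
# hypothetical):
#
#   cache_path = epath.Path('/tmp/tfds_cache/dataset_list.txt')
#   content = _get_cached_copy(cache_path, max_age_days=1)
#   if content is None:
#     content = fetch_remote_copy()  # hypothetical refresh helper
#     cache_path.write_text(content)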
def list_ds_packages_for_namespace(
    namespace: str,
    path: epath.Path,
) -> List[DatasetPackage]:
  """Returns the dataset names found in a specific directory.

  Directories that contain code should have the following structure:

  ```
  <path>/
      <dataset0>/
          <dataset0>.py
      <dataset1>/
          <dataset1>.py
      ...
  ```

  Additional files or folders which are not detected as datasets will be
  ignored (e.g. `__init__.py`).

  Args:
    namespace: Namespace of the datasets.
    path: The directory path containing the datasets.

  Returns:
    ds_packages: The dataset packages found in the directory (sorted for
      determinism).

  Raises:
    FileNotFoundError: If the path cannot be reached.
  """
  if not path.exists():
    # Should be fault-tolerant in the future
    raise FileNotFoundError(f'Could not find datasets at {path}')

  all_packages = []
  for ds_path in path.iterdir():
    source = get_dataset_source(ds_path)
    if source:
      pkg = DatasetPackage(
          name=naming.DatasetName(namespace=namespace, name=ds_path.name),
          source=source,
      )
      all_packages.append(pkg)
  return all_packages
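# Example usage (a minimal sketch; the namespace and directory below are
# hypothetical):
#
#   packages = list_ds_packages_for_namespace(
#       namespace='my_namespace',
#       path=epath.Path('/datasets/my_namespace'),
#   )
#   for pkg in packages:
#     print(pkg.name)  # e.g. the dataset name under `my_namespace`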
def exists(path: epath.Path) -> bool:
  """Checks if path exists. Returns False if issues occur connecting to GCS."""
  try:
    return path.exists()
  except GCS_UNAVAILABLE_EXCEPTIONS:  # pylint: disable=catching-non-exception
    return False