def resource_exists(uri: Optional[str]) -> bool:
    """
    Validate that the URI provided points to an existing file.

    None is a valid option.

    Parameters
    ----------
    uri: Optional[str]
        The URI to validate resource existence for.

    Returns
    -------
    status: bool
        The validation status.
    """
    if uri is None:
        return True

    # TODO: Replace after finding a way to pass a custom fs through the FireO validator
    if uri.startswith("gs://"):
        return True
    else:
        # Get file system
        fs, uri = url_to_fs(uri)

        # Check exists
        if fs.exists(uri):
            return True

    return False

def __init__(self, url, **kwargs):
    # special case Google Cloud Storage, use anonymous access, avoids a delay
    if url.startswith("gs://") or url.startswith("gcs://"):
        kwargs.setdefault("token", "anon")

    # process the url using fsspec
    pre = kwargs.pop("pre", False)
    fs, path = url_to_fs(url, **kwargs)
    self.fs = fs
    self.path = path

    # discover which releases are available
    sub_dirs = [p.split("/")[-1] for p in self.fs.ls(self.path)]
    releases = [d for d in sub_dirs if d.startswith("v3")]
    if not pre:
        releases = [d for d in releases if d in public_releases]
    if len(releases) == 0:
        raise ValueError(f"No releases found at location {url!r}")
    self._releases = releases

    # setup caches
    self._cache_sample_sets = dict()
    self._cache_general_metadata = dict()
    self._cache_species_calls = dict()
    self._cache_site_filters = dict()
    self._cache_snp_sites = None
    self._cache_snp_genotypes = dict()
    self._cache_genome = None
    self._cache_geneset = None

def resource_exists(uri: Optional[str], **kwargs: str) -> bool:
    """
    Validate that the URI provided points to an existing file.

    None is a valid option.

    Parameters
    ----------
    uri: Optional[str]
        The URI to validate resource existence for.
    kwargs: str
        Extra options; a ``google_credentials_file`` value is used to
        authenticate GCS existence checks when provided.

    Returns
    -------
    status: bool
        The validation status.
    """
    if uri is None:
        return True

    if uri.startswith("gs://") or uri.startswith("https://storage.googleapis"):
        # Convert to gsutil form if necessary
        if uri.startswith("https://storage.googleapis"):
            uri = convert_gcs_json_url_to_gsutil_form(uri)

            # If uri is not convertible to gsutil form we can't confirm
            if uri == "":
                return False

        if kwargs.get("google_credentials_file"):
            fs = GCSFileSystem(token=str(kwargs.get("google_credentials_file", "anon")))
            return fs.exists(uri)

        # No creds file provided; fall back to an anonymous GCS check
        else:
            try:
                anon_fs = GCSFileSystem(token="anon")
                return anon_fs.exists(uri)
            except Exception:
                return False

    # Is HTTP remote resource
    elif uri.startswith("http"):
        try:
            # Use HEAD request to check if remote resource exists
            r = requests.head(uri)
            return r.status_code == requests.codes.ok
        except requests.exceptions.SSLError:
            return False

    # Get any filesystem and try
    try:
        fs, path = url_to_fs(uri)
        return fs.exists(path)
    except Exception:
        return False

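# Usage sketch for resource_exists above (hypothetical URIs and paths; assumes
# the function is in scope and gcsfs/requests are installed):
assert resource_exists(None)  # None is explicitly treated as valid
print(resource_exists("gs://example-bucket/video.mp4"))  # anonymous GCS check
print(
    resource_exists(
        "gs://example-bucket/video.mp4",
        google_credentials_file="/path/to/creds.json",  # authenticated GCS check
    )
)
print(resource_exists("https://example.com/video.mp4"))  # HTTP HEAD check
print(resource_exists("video.mp4"))  # falls through to url_to_fs + fs.exists
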
def main() -> None:
    try:
        args = Args()

        # Read pipeline config
        with open(args.event_gather_config_file, "r") as open_resource:
            config = EventGatherPipelineConfig.from_json(  # type: ignore
                open_resource.read()
            )

        log.info("Parsing event details...")

        # Convert event details file to EventIngestionModel
        with open(args.event_details_file, "r") as open_resource:
            ingestion_model = EventIngestionModel.from_json(  # type: ignore
                open_resource.read()
            )

        for session in ingestion_model.sessions:
            # Copy if remote resource, otherwise use local file uri
            fs, path = url_to_fs(session.video_uri)
            if not isinstance(fs, LocalFileSystem):
                # Create tmp directory to save file in
                dirpath = tempfile.mkdtemp()
                dst = Path(dirpath)
                filepath = resource_copy(uri=session.video_uri, dst=dst)
            else:
                filepath = session.video_uri

            # Upload video file to file store
            log.info(f"Uploading {session.video_uri}...")
            video_uri = upload_file(
                credentials_file=config.google_credentials_file,
                bucket=config.validated_gcs_bucket_name,
                filepath=filepath,
            )

            # Replace video_uri of session
            session.video_uri = video_uri

        # Create event gather pipeline flow
        log.info("Beginning processing...")
        flow = pipeline.create_event_gather_flow(
            config=config,
            prefetched_events=[ingestion_model],
        )

        # Run flow
        state = flow.run()
        if state.is_failed():
            raise ValueError("Flow run failed.")

    except Exception as e:
        log.error("=============================================")
        log.error("\n\n" + traceback.format_exc())
        log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)

def test_open_parquet_file(
    tmpdir, engine, columns, max_gap, max_block, footer_sample_size
):
    # Pandas required for this test
    pd = pytest.importorskip("pandas")

    # Write out a simple DataFrame
    path = os.path.join(str(tmpdir), "test.parquet")
    nrows = 40
    df = pd.DataFrame(
        {
            "x": [i * 7 % 5 for i in range(nrows)],
            "y": [[0, i] for i in range(nrows)],  # list
            "z": [{"a": i, "b": "cat"} for i in range(nrows)],  # struct
        },
        index=pd.Index([10 * i for i in range(nrows)], name="myindex"),
    )
    df.to_parquet(path)

    # "Traditional read" (without `open_parquet_file`)
    expect = pd.read_parquet(path, columns=columns)

    # Use `_get_parquet_byte_ranges` to re-write a
    # place-holder file with all bytes NOT required
    # to read `columns` set to b"0". The purpose of
    # this step is to make sure the read will fail
    # if the correct bytes have not been accurately
    # selected by `_get_parquet_byte_ranges`. If this
    # test were reading from remote storage, we would
    # not need this logic to capture errors.
    fs = url_to_fs(path)[0]
    data = _get_parquet_byte_ranges(
        [path],
        fs,
        columns=columns,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    )[path]
    file_size = fs.size(path)
    with open(path, "wb") as f:
        f.write(b"0" * file_size)
        for (start, stop), byte_data in data.items():
            f.seek(start)
            f.write(byte_data)

    # Read back the modified file with `open_parquet_file`
    with open_parquet_file(
        path,
        columns=columns,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    ) as f:
        result = pd.read_parquet(f, columns=columns)

    # Check that `result` matches `expect`
    pd.testing.assert_frame_equal(expect, result)

def resource_copy(
    uri: str,
    dst: Optional[Union[str, Path]] = None,
    overwrite: bool = False,
) -> str:
    """
    Copy a resource (local or remote) to a local destination on the machine.

    Parameters
    ----------
    uri: str
        The uri for the resource to copy.
    dst: Optional[Union[str, Path]]
        A specific destination to where the copy should be placed. If None
        provided, stores the resource in the current working directory.
    overwrite: bool
        Boolean value indicating whether or not to overwrite a local resource
        with the same name if it already exists.

    Returns
    -------
    saved_path: str
        The path of where the resource ended up getting copied to.
    """
    if dst is None:
        dst = uri.split("/")[-1]

    # Resolve dst; if it is a directory, append the resource filename
    dst = Path(dst).resolve()
    if dst.is_dir():
        dst = dst / uri.split("/")[-1]

    # Ensure filename is less than 255 chars
    # Otherwise this can raise an OSError for too long of a filename
    if len(dst.name) > 255:
        dst = Path(str(dst)[:255])

    # Ensure dst isn't an existing file unless overwrite was requested
    if dst.is_file() and not overwrite:
        raise FileExistsError(dst)

    log.info(f"Beginning resource copy from: {uri}")

    # Get file system
    try:
        kwargs = {}

        # Set custom timeout for http resources
        if uri.startswith("http"):
            kwargs = {"timeout": aiohttp.ClientTimeout(total=1800)}

        # TODO: Add explicit use of GCS credentials until public read is fixed
        fs, remote_path = url_to_fs(uri, **kwargs)
        fs.get(remote_path, str(dst))

        log.info(f"Completed resource copy from: {uri}")
        log.info(f"Stored resource copy: {dst}")

        return str(dst)

    except Exception as e:
        log.error(
            f"Something went wrong during resource copy. "
            f"Attempted copy from: '{uri}', resulted in error."
        )
        raise e

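# Usage sketch for resource_copy above (hypothetical URIs and paths; assumes
# the function is in scope):
saved = resource_copy("https://example.com/video.mp4")
# With no dst, the file lands in the current working directory as "video.mp4"
saved = resource_copy(
    "gs://example-bucket/video.mp4",
    dst="/tmp/downloads",  # an existing directory: the filename is appended
    overwrite=True,  # replace a local file of the same name if present
)
print(saved)  # resolved path of the local copy
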
def get_filesystem(path: _PATH, **kwargs: Any) -> AbstractFileSystem:
    # Resolve an fsspec filesystem for the given path or URL; any extra
    # kwargs are forwarded to the underlying filesystem constructor.
    fs, _ = url_to_fs(str(path), **kwargs)
    return fs

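# Usage sketch for get_filesystem above (hypothetical paths; the s3 example
# assumes s3fs is installed and uses its real `anon` storage option):
local_fs = get_filesystem("/tmp/checkpoints")  # LocalFileSystem
remote_fs = get_filesystem("s3://example-bucket/checkpoints", anon=True)
print(type(local_fs).__name__, type(remote_fs).__name__)
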
def test_open_parquet_file(
    tmpdir, engine, columns, max_gap, max_block, footer_sample_size, range_index
):
    # Pandas required for this test
    pd = pytest.importorskip("pandas")

    # Write out a simple DataFrame
    path = os.path.join(str(tmpdir), "test.parquet")
    nrows = 40
    df = pd.DataFrame(
        {
            "x": [i * 7 % 5 for i in range(nrows)],
            "y": [[0, i] for i in range(nrows)],  # list
            "z": [{"a": i, "b": "cat"} for i in range(nrows)],  # struct
        },
        index=pd.Index([10 * i for i in range(nrows)], name="myindex"),
    )
    if range_index:
        df = df.reset_index(drop=True)
        df.index.name = "myindex"
    df.to_parquet(path)

    # "Traditional read" (without `open_parquet_file`)
    expect = pd.read_parquet(path, columns=columns)

    # Use `_get_parquet_byte_ranges` to re-write a
    # place-holder file with all bytes NOT required
    # to read `columns` set to b"0". The purpose of
    # this step is to make sure the read will fail
    # if the correct bytes have not been accurately
    # selected by `_get_parquet_byte_ranges`. If this
    # test were reading from remote storage, we would
    # not need this logic to capture errors.
    fs = url_to_fs(path)[0]
    data = _get_parquet_byte_ranges(
        [path],
        fs,
        columns=columns,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    )[path]
    file_size = fs.size(path)
    with open(path, "wb") as f:
        f.write(b"0" * file_size)
        if footer_sample_size == 8:
            # We know 8 bytes is too small to include
            # the footer metadata, so there should NOT
            # be a key for the last 8 bytes of the file
            bad_key = (file_size - 8, file_size)
            assert bad_key not in data.keys()
        for (start, stop), byte_data in data.items():
            f.seek(start)
            f.write(byte_data)

    # Read back the modified file with `open_parquet_file`
    with open_parquet_file(
        path,
        columns=columns,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    ) as f:
        result = pd.read_parquet(f, columns=columns)

    # Check that `result` matches `expect`
    pd.testing.assert_frame_equal(expect, result)

    # Try passing metadata
    if engine == "fastparquet":
        # Should work fine for "fastparquet"
        pf = fastparquet.ParquetFile(path)
        with open_parquet_file(
            path,
            metadata=pf,
            columns=columns,
            engine=engine,
            max_gap=max_gap,
            max_block=max_block,
            footer_sample_size=footer_sample_size,
        ) as f:
            result = pd.read_parquet(f, columns=columns)
        pd.testing.assert_frame_equal(expect, result)
    elif engine == "pyarrow":
        # Should raise ValueError for "pyarrow"
        with pytest.raises(ValueError):
            open_parquet_file(
                path,
                metadata=["Not-None"],
                columns=columns,
                engine=engine,
                max_gap=max_gap,
                max_block=max_block,
                footer_sample_size=footer_sample_size,
            )

def _prepare_tifffile(
    fb: PathOrFileOrBufferLike[AnyStr],
    *,
    tifffile_options: dict[str, Any] | None = None,
    storage_options: dict[str, Any] | None = None,
    _cls: type[TF] = TiffFile,
) -> TF:
    """prepare a TiffFile instance

    Allows providing fsspec urlpaths as well as fsspec OpenFile instances directly.

    Parameters
    ----------
    fb:
        a urlpath like string, a fsspec OpenFile like instance or a buffer like instance
    tifffile_options:
        keyword arguments passed to tifffile.TiffFile
    storage_options:
        keyword arguments passed to fsspec AbstractFileSystem.open()
    """
    tf_kw: dict[str, Any] = tifffile_options or {}
    st_kw: dict[str, Any] = storage_options or {}

    def _warn_unused_storage_options(kw: Any) -> None:
        if kw:
            warn(
                "storage_options ignored when providing file or buffer like object",
                stacklevel=3,
            )

    if isinstance(fb, TiffFileIO):
        # provided an IO stream like instance
        _warn_unused_storage_options(st_kw)
        return _cls(fb, **tf_kw)

    elif isinstance(fb, OpenFileLike):
        # provided a fsspec compatible OpenFile instance
        _warn_unused_storage_options(st_kw)
        fs, path = fb.fs, fb.path

        # set name for tifffile.FileHandle
        if "name" not in tf_kw:
            if hasattr(fb, "full_name"):
                name = os.path.basename(fb.full_name)  # type: ignore
            else:
                name = os.path.basename(path)
            tf_kw["name"] = name

        return _cls(fs.open(path), **tf_kw)

    elif isinstance(fb, (str, os.PathLike)):
        # provided a string like url
        urlpath = os.fspath(fb)
        fs, path = url_to_fs(urlpath, **st_kw)

        if isinstance(fs, LocalFileSystem):
            return _cls(path, **tf_kw)
        else:
            # set name for tifffile.FileHandle
            if "name" not in tf_kw:
                tf_kw["name"] = os.path.basename(path)
            return _cls(fs.open(path), **tf_kw)

    else:
        # let's try anyways ...
        _warn_unused_storage_options(st_kw)
        return _cls(fb, **tf_kw)

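# Usage sketch for _prepare_tifffile above (hypothetical paths; the remote
# example assumes gcsfs is installed, and `token="anon"` is a real gcsfs
# storage option):
tf = _prepare_tifffile("image.tif")  # local path goes straight to TiffFile
tf = _prepare_tifffile(
    "gs://example-bucket/image.tif",
    storage_options={"token": "anon"},  # forwarded to url_to_fs
    tifffile_options={"is_ome": False},  # forwarded to tifffile.TiffFile
)
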
def main(args):
    """Mount filesystem from chained URL to MOUNT_POINT.

    Examples:

    python3 -m fsspec.fuse memory /usr/share /tmp/mem

    python3 -m fsspec.fuse local /tmp/source /tmp/local \\
            -l /tmp/fsspecfuse.log

    You can also mount chained-URLs and use special settings:

    python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
            / /tmp/zip \\
            -o 'filecache-cache_storage=/tmp/simplecache'

    You can specify the type of the setting by using `[int]` or `[bool]`,
    (`true`, `yes`, `1` represents the Boolean value `True`):

    python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
            /historic/packages/RPMS /tmp/ftp \\
            -o 'simplecache-cache_storage=/tmp/simplecache' \\
            -o 'simplecache-check_files=false[bool]' \\
            -o 'ftp-listings_expiry_time=60[int]' \\
            -o 'ftp-username=anonymous' \\
            -o 'ftp-password=xieyanbo'
    """

    class RawDescriptionArgumentParser(argparse.ArgumentParser):
        def format_help(self):
            usage = super(RawDescriptionArgumentParser, self).format_help()
            parts = usage.split("\n\n")
            parts[1] = self.description.rstrip()
            return "\n\n".join(parts)

    parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument("url", type=str, help="fs url")
    parser.add_argument("source_path", type=str, help="source directory in fs")
    parser.add_argument("mount_point", type=str, help="local directory")
    parser.add_argument(
        "-o",
        "--option",
        action="append",
        help="Any options of protocol included in the chained URL",
    )
    parser.add_argument(
        "-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
    )
    parser.add_argument(
        "-f",
        "--foreground",
        action="store_false",
        help="Running in foreground or not (Default: False)",
    )
    parser.add_argument(
        "-t",
        "--threads",
        action="store_false",
        help="Running with threads support (Default: False)",
    )
    parser.add_argument(
        "-r",
        "--ready-file",
        action="store_false",
        help="The `.fuse_ready` file will exist after FUSE is ready. "
        "(Debugging purpose, Default: False)",
    )
    args = parser.parse_args(args)

    kwargs = {}
    for item in args.option or []:
        key, sep, value = item.partition("=")
        if not sep:
            parser.error(message="Wrong option: {!r}".format(item))
        val = value.lower()
        if val.endswith("[int]"):
            value = int(value[: -len("[int]")])
        elif val.endswith("[bool]"):
            value = val[: -len("[bool]")] in ["1", "yes", "true"]

        if "-" in key:
            fs_name, setting_name = key.split("-", 1)
            if fs_name in kwargs:
                kwargs[fs_name][setting_name] = value
            else:
                kwargs[fs_name] = {setting_name: value}
        else:
            kwargs[key] = value

    if args.log_file:
        logging.basicConfig(
            level=logging.DEBUG,
            filename=args.log_file,
            format="%(asctime)s %(message)s",
        )

        class LoggingFUSEr(FUSEr, LoggingMixIn):
            pass

        fuser = LoggingFUSEr
    else:
        fuser = FUSEr

    fs, url_path = url_to_fs(args.url, **kwargs)
    logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
    run(
        fs,
        args.source_path,
        args.mount_point,
        foreground=args.foreground,
        threads=args.threads,
        ready_file=args.ready_file,
        ops_class=fuser,
    )