Example #1
from typing import Optional

from fsspec.core import url_to_fs


def resource_exists(uri: Optional[str]) -> bool:
    """
    Validate that the URI provided points to an existing file.

    None is a valid option.

    Parameters
    ----------
    uri: Optional[str]
        The URI to validate resource existence for.

    Returns
    -------
    status: bool
        The validation status.
    """

    if uri is None:
        return True

    # TODO Replace after finding way to pass custom fs through FireO validator
    if uri.startswith("gs://"):
        return True

    else:
        # Get file system
        fs, uri = url_to_fs(uri)

        # Check exists
        if fs.exists(uri):
            return True

    return False
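
A minimal usage sketch, assuming the imports above; the paths below are placeholders:

# None short-circuits to True, gs:// URIs are assumed to exist (see the TODO),
# and everything else is checked through the resolved fsspec filesystem.
assert resource_exists(None)
assert resource_exists("gs://some-bucket/some-file.mp4")
assert not resource_exists("/tmp/hopefully-missing-file.mp4")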
Example #2

    def __init__(self, url, **kwargs):

        # special case Google Cloud Storage, use anonymous access, avoids a delay
        if url.startswith("gs://") or url.startswith("gcs://"):
            kwargs.setdefault("token", "anon")

        # process the url using fsspec
        pre = kwargs.pop("pre", False)
        fs, path = url_to_fs(url, **kwargs)
        self.fs = fs
        self.path = path

        # discover which releases are available
        sub_dirs = [p.split("/")[-1] for p in self.fs.ls(self.path)]
        releases = [d for d in sub_dirs if d.startswith("v3")]
        if not pre:
            releases = [d for d in releases if d in public_releases]
        if len(releases) == 0:
            raise ValueError(f"No releases found at location {url!r}")
        self._releases = releases

        # setup caches
        self._cache_sample_sets = dict()
        self._cache_general_metadata = dict()
        self._cache_species_calls = dict()
        self._cache_site_filters = dict()
        self._cache_snp_sites = None
        self._cache_snp_genotypes = dict()
        self._cache_genome = None
        self._cache_geneset = None
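
Note the design: the URL is resolved to a filesystem once in the constructor and
reused via self.fs, avoiding repeated credential negotiation, while the dict
caches make repeated dataset lookups cheap. A hedged instantiation sketch (the
class name and bucket are placeholders, not taken from the snippet):

client = ReleaseClient("gs://release-bucket")            # anonymous GCS access
client = ReleaseClient("gs://release-bucket", pre=True)  # include pre-releases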
Example #3
from typing import Optional

import requests
from fsspec.core import url_to_fs
from gcsfs import GCSFileSystem


def resource_exists(uri: Optional[str], **kwargs: str) -> bool:
    """
    Validate that the URI provided points to an existing file.

    None is a valid option.

    Parameters
    ----------
    uri: Optional[str]
        The URI to validate resource existence for.
    **kwargs: str
        Optional extras; a "google_credentials_file" path enables authenticated
        GCS checks.

    Returns
    -------
    status: bool
        The validation status.
    """

    if uri is None:
        return True

    if uri.startswith("gs://") or uri.startswith("https://storage.googleapis"):
        # Convert to gsutil form if necessary
        if uri.startswith("https://storage.googleapis"):
            uri = convert_gcs_json_url_to_gsutil_form(uri)

            # If uri is not convertible to gsutil form we can't confirm
            if uri == "":
                return False

        if kwargs.get("google_credentials_file"):
            fs = GCSFileSystem(
                token=str(kwargs.get("google_credentials_file", "anon")))
            return fs.exists(uri)

        # No credentials file provided; fall back to anonymous access
        else:
            try:
                anon_fs = GCSFileSystem(token="anon")
                return anon_fs.exists(uri)
            except Exception:
                return False

    # Is HTTP remote resource
    elif uri.startswith("http"):
        try:
            # Use HEAD request to check if remote resource exists
            r = requests.head(uri)

            return r.status_code == requests.codes.ok
        except requests.exceptions.SSLError:
            return False

    # Get any filesystem and try
    try:
        fs, path = url_to_fs(uri)
        return fs.exists(path)
    except Exception:
        return False
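
convert_gcs_json_url_to_gsutil_form is a project-specific helper not shown here;
a hedged sketch of the conversion the caller appears to expect, returning an
empty string when the URL cannot be converted:

def convert_gcs_json_url_to_gsutil_form(url: str) -> str:
    # Assumed behavior: "https://storage.googleapis.com/bucket/key" -> "gs://bucket/key"
    prefix = "https://storage.googleapis.com/"
    if not url.startswith(prefix):
        return ""
    return "gs://" + url[len(prefix):]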
Example #4

def main() -> None:
    try:
        args = Args()

        # Read pipeline config
        with open(args.event_gather_config_file, "r") as open_resource:
            config = EventGatherPipelineConfig.from_json(  # type: ignore
                open_resource.read())

        log.info("Parsing event details...")
        # Convert event details file to EventIngestionModel
        with open(args.event_details_file, "r") as open_resource:
            ingestion_model = EventIngestionModel.from_json(  # type: ignore
                open_resource.read())

            for session in ingestion_model.sessions:
                # Copy if remote resource, otherwise use local file uri
                fs, path = url_to_fs(session.video_uri)
                if not isinstance(fs, LocalFileSystem):
                    # Create tmp directory to save file in
                    dirpath = tempfile.mkdtemp()
                    dst = Path(dirpath)

                    filepath = resource_copy(uri=session.video_uri, dst=dst)
                else:
                    filepath = session.video_uri

                # Upload video file to file store
                log.info(f"Uploading {session.video_uri}...")
                video_uri = upload_file(
                    credentials_file=config.google_credentials_file,
                    bucket=config.validated_gcs_bucket_name,
                    filepath=filepath,
                )

                # Replace video_uri of session
                session.video_uri = video_uri

        # Create event gather pipeline flow
        log.info("Beginning processing...")
        flow = pipeline.create_event_gather_flow(
            config=config,
            prefetched_events=[ingestion_model],
        )

        # Run flow
        state = flow.run()
        if state.is_failed():
            raise ValueError("Flow run failed.")

    except Exception as e:
        log.error("=============================================")
        log.error("\n\n" + traceback.format_exc())
        log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
Example #5
def test_open_parquet_file(tmpdir, engine, columns, max_gap, max_block,
                           footer_sample_size):

    # Pandas required for this test
    pd = pytest.importorskip("pandas")

    # Write out a simple DataFrame
    path = os.path.join(str(tmpdir), "test.parquet")
    nrows = 40
    df = pd.DataFrame(
        {
            "x": [i * 7 % 5 for i in range(nrows)],
            "y": [[0, i] for i in range(nrows)],  # list
            "z": [{
                "a": i,
                "b": "cat"
            } for i in range(nrows)],  # struct
        },
        index=pd.Index([10 * i for i in range(nrows)], name="myindex"),
    )
    df.to_parquet(path)

    # "Traditional read" (without `open_parquet_file`)
    expect = pd.read_parquet(path, columns=columns)

    # Use `_get_parquet_byte_ranges` to re-write a
    # place-holder file with all bytes NOT required
    # to read `columns` set to b"0". The purpose of
    # this step is to make sure the read will fail
    # if the correct bytes have not been accurately
    # selected by `_get_parquet_byte_ranges`. If this
    # test were reading from remote storage, we would
    # not need this logic to capture errors.
    fs = url_to_fs(path)[0]
    data = _get_parquet_byte_ranges(
        [path],
        fs,
        columns=columns,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    )[path]
    file_size = fs.size(path)
    with open(path, "wb") as f:
        f.write(b"0" * file_size)
        for (start, stop), byte_data in data.items():
            f.seek(start)
            f.write(byte_data)

    # Read back the modified file with `open_parquet_file`
    with open_parquet_file(
            path,
            columns=columns,
            engine=engine,
            max_gap=max_gap,
            max_block=max_block,
            footer_sample_size=footer_sample_size,
    ) as f:
        result = pd.read_parquet(f, columns=columns)

    # Check that `result` matches `expect`
    pd.testing.assert_frame_equal(expect, result)
Example #6
import logging
from pathlib import Path
from typing import Optional, Union

import aiohttp
from fsspec.core import url_to_fs

log = logging.getLogger(__name__)  # assumed module-level logger


def resource_copy(
    uri: str,
    dst: Optional[Union[str, Path]] = None,
    overwrite: bool = False,
) -> str:
    """
    Copy a resource (local or remote) to a local destination on the machine.

    Parameters
    ----------
    uri: str
        The URI of the resource to copy.
    dst: Optional[Union[str, Path]]
        A specific destination for the copy. If None is provided, the resource
        is stored in the current working directory.
    overwrite: bool
        Whether to overwrite a local resource with the same name if it already
        exists.

    Returns
    -------
    saved_path: str
        The path of where the resource ended up getting copied to.
    """
    if dst is None:
        dst = uri.split("/")[-1]

    # Resolve dst; if it points to a directory, append the resource's filename
    dst = Path(dst).resolve()
    if dst.is_dir():
        dst = dst / uri.split("/")[-1]

    # Ensure the filename itself is at most 255 characters;
    # longer names raise OSError on most filesystems
    if len(dst.name) > 255:
        dst = dst.with_name(dst.name[:255])

    # Refuse to overwrite an existing file unless explicitly allowed
    if dst.is_file() and not overwrite:
        raise FileExistsError(dst)

    # Copy the resource via fsspec (works for local and remote URIs)
    log.info(f"Beginning resource copy from: {uri}")
    # Get file system
    try:
        kwargs = {}

        # Set custom timeout for http resources
        if uri.startswith("http"):
            kwargs = {"timeout": aiohttp.ClientTimeout(total=1800)}

        # TODO: Add explicit use of GCS credentials until public read is fixed
        fs, remote_path = url_to_fs(uri, **kwargs)
        fs.get(remote_path, str(dst))
        log.info(f"Completed resource copy from: {uri}")
        log.info(f"Stored resource copy: {dst}")

        return str(dst)
    except Exception as e:
        log.error(f"Something went wrong during resource copy. "
                  f"Attempted copy from: '{uri}', resulted in error.")
        raise e
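
A short usage sketch (the URIs and paths are placeholders):

# Download into the current working directory, named after the last URI segment:
local_path = resource_copy("https://example.com/media/video.mp4")

# Download into a specific directory, replacing an existing copy:
local_path = resource_copy("gs://bucket/video.mp4", dst="/tmp", overwrite=True)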
Example #7
from pathlib import Path
from typing import Any, Union

from fsspec import AbstractFileSystem
from fsspec.core import url_to_fs

_PATH = Union[str, Path]  # assumed project-wide alias for path-like values


def get_filesystem(path: _PATH, **kwargs: Any) -> AbstractFileSystem:
    fs, _ = url_to_fs(str(path), **kwargs)
    return fs
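
Usage is a one-liner; extra kwargs are forwarded to the underlying filesystem
constructor (the bucket is a placeholder and assumes s3fs is installed):

fs = get_filesystem("s3://my-bucket/logs", anon=True)
fs.ls("my-bucket/logs")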
Example #8

def test_open_parquet_file(tmpdir, engine, columns, max_gap, max_block,
                           footer_sample_size, range_index):

    # Pandas required for this test
    pd = pytest.importorskip("pandas")

    # Write out a simple DataFrame
    path = os.path.join(str(tmpdir), "test.parquet")
    nrows = 40
    df = pd.DataFrame(
        {
            "x": [i * 7 % 5 for i in range(nrows)],
            "y": [[0, i] for i in range(nrows)],  # list
            "z": [{
                "a": i,
                "b": "cat"
            } for i in range(nrows)],  # struct
        },
        index=pd.Index([10 * i for i in range(nrows)], name="myindex"),
    )
    if range_index:
        df = df.reset_index(drop=True)
        df.index.name = "myindex"
    df.to_parquet(path)

    # "Traditional read" (without `open_parquet_file`)
    expect = pd.read_parquet(path, columns=columns)

    # Use `_get_parquet_byte_ranges` to re-write a
    # place-holder file with all bytes NOT required
    # to read `columns` set to b"0". The purpose of
    # this step is to make sure the read will fail
    # if the correct bytes have not been accurately
    # selected by `_get_parquet_byte_ranges`. If this
    # test were reading from remote storage, we would
    # not need this logic to capture errors.
    fs = url_to_fs(path)[0]
    data = _get_parquet_byte_ranges(
        [path],
        fs,
        columns=columns,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    )[path]
    file_size = fs.size(path)
    with open(path, "wb") as f:
        f.write(b"0" * file_size)

        if footer_sample_size == 8:
            # We know 8 bytes is too small to include
            # the footer metadata, so there should NOT
            # be a key for the last 8 bytes of the file
            bad_key = (file_size - 8, file_size)
            assert bad_key not in data.keys()

        for (start, stop), byte_data in data.items():
            f.seek(start)
            f.write(byte_data)

    # Read back the modified file with `open_parquet_file`
    with open_parquet_file(
            path,
            columns=columns,
            engine=engine,
            max_gap=max_gap,
            max_block=max_block,
            footer_sample_size=footer_sample_size,
    ) as f:
        result = pd.read_parquet(f, columns=columns)

    # Check that `result` matches `expect`
    pd.testing.assert_frame_equal(expect, result)

    # Try passing metadata
    if engine == "fastparquet":
        # Should work fine for "fastparquet"
        pf = fastparquet.ParquetFile(path)
        with open_parquet_file(
                path,
                metadata=pf,
                columns=columns,
                engine=engine,
                max_gap=max_gap,
                max_block=max_block,
                footer_sample_size=footer_sample_size,
        ) as f:
            result = pd.read_parquet(f, columns=columns)
        pd.testing.assert_frame_equal(expect, result)
    elif engine == "pyarrow":
        # Should raise ValueError for "pyarrow"
        with pytest.raises(ValueError):
            open_parquet_file(
                path,
                metadata=["Not-None"],
                columns=columns,
                engine=engine,
                max_gap=max_gap,
                max_block=max_block,
                footer_sample_size=footer_sample_size,
            )
Example #9
def _prepare_tifffile(
    fb: PathOrFileOrBufferLike[AnyStr],
    *,
    tifffile_options: dict[str, Any] | None = None,
    storage_options: dict[str, Any] | None = None,
    _cls: type[TF] = TiffFile,
) -> TF:
    """prepare a TiffFile instance

    Allows providing fsspec urlpaths as well as fsspec OpenFile instances directly.

    Parameters
    ----------
    fb:
        a urlpath-like string, an fsspec OpenFile-like instance, or a buffer-like instance
    tifffile_options:
        keyword arguments passed to tifffile.TiffFile
    storage_options:
        keyword arguments passed to fsspec AbstractFileSystem.open()
    """
    tf_kw: dict[str, Any] = tifffile_options or {}
    st_kw: dict[str, Any] = storage_options or {}

    def _warn_unused_storage_options(kw: Any) -> None:
        if kw:
            warn(
                "storage_options ignored when providing file or buffer like object",
                stacklevel=3,
            )

    if isinstance(fb, TiffFileIO):
        # provided an IO stream like instance
        _warn_unused_storage_options(st_kw)

        return _cls(fb, **tf_kw)

    elif isinstance(fb, OpenFileLike):
        # provided a fsspec compatible OpenFile instance
        _warn_unused_storage_options(st_kw)

        fs, path = fb.fs, fb.path

        # set name for tifffile.FileHandle
        if "name" not in tf_kw:
            if hasattr(fb, "full_name"):
                name = os.path.basename(fb.full_name)  # type: ignore
            else:
                name = os.path.basename(path)
            tf_kw["name"] = name

        return _cls(fs.open(path), **tf_kw)

    elif isinstance(fb, (str, os.PathLike)):
        # provided a string like url
        urlpath = os.fspath(fb)
        fs, path = url_to_fs(urlpath, **st_kw)
        if isinstance(fs, LocalFileSystem):
            return _cls(path, **tf_kw)
        else:
            # set name for tifffile.FileHandle
            if "name" not in tf_kw:
                tf_kw["name"] = os.path.basename(path)

            return _cls(fs.open(path), **tf_kw)

    else:
        # let's try anyway ...
        _warn_unused_storage_options(st_kw)

        return _cls(fb, **tf_kw)
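
A hedged sketch of the three accepted input kinds (paths and options are
placeholders):

# 1. Local path: tifffile opens the file itself.
tf = _prepare_tifffile("image.tif")

# 2. Remote urlpath: fsspec opens it and storage_options reach the filesystem.
tf = _prepare_tifffile("s3://bucket/image.tif", storage_options={"anon": True})

# 3. Already-open binary stream: storage_options would be ignored with a warning.
with open("image.tif", "rb") as f:
    tf = _prepare_tifffile(f)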
Example #10
def main(args):
    """Mount filesystem from chained URL to MOUNT_POINT.

    Examples:

    python3 -m fsspec.fuse memory /usr/share /tmp/mem

    python3 -m fsspec.fuse local /tmp/source /tmp/local \\
            -l /tmp/fsspecfuse.log

    You can also mount chained-URLs and use special settings:

    python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
            / /tmp/zip \\
            -o 'filecache-cache_storage=/tmp/simplecache'

    You can specify the type of a setting by appending `[int]` or `[bool]`
    (`true`, `yes`, and `1` represent the Boolean value `True`):

    python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
            /historic/packages/RPMS /tmp/ftp \\
            -o 'simplecache-cache_storage=/tmp/simplecache' \\
            -o 'simplecache-check_files=false[bool]' \\
            -o 'ftp-listings_expiry_time=60[int]' \\
            -o 'ftp-username=anonymous' \\
            -o 'ftp-password=xieyanbo'
    """
    class RawDescriptionArgumentParser(argparse.ArgumentParser):
        def format_help(self):
            usage = super(RawDescriptionArgumentParser, self).format_help()
            parts = usage.split("\n\n")
            parts[1] = self.description.rstrip()
            return "\n\n".join(parts)

    parser = RawDescriptionArgumentParser(prog="fsspec.fuse",
                                          description=main.__doc__)
    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument("url", type=str, help="fs url")
    parser.add_argument("source_path", type=str, help="source directory in fs")
    parser.add_argument("mount_point", type=str, help="local directory")
    parser.add_argument(
        "-o",
        "--option",
        action="append",
        help="Any options of protocol included in the chained URL",
    )
    parser.add_argument("-l",
                        "--log-file",
                        type=str,
                        help="Logging FUSE debug info (Default: '')")
    parser.add_argument(
        "-f",
        "--foreground",
        action="store_false",
        help="Running in foreground or not (Default: False)",
    )
    parser.add_argument(
        "-t",
        "--threads",
        action="store_false",
        help="Running with threads support (Default: False)",
    )
    parser.add_argument(
        "-r",
        "--ready-file",
        action="store_false",
        help="The `.fuse_ready` file will exist after FUSE is ready. "
        "(Debugging purpose, Default: False)",
    )
    args = parser.parse_args(args)

    kwargs = {}
    for item in args.option or []:
        key, sep, value = item.partition("=")
        if not sep:
            parser.error(message="Wrong option: {!r}".format(item))
        val = value.lower()
        if val.endswith("[int]"):
            value = int(value[:-len("[int]")])
        elif val.endswith("[bool]"):
            value = val[:-len("[bool]")] in ["1", "yes", "true"]

        if "-" in key:
            fs_name, setting_name = key.split("-", 1)
            if fs_name in kwargs:
                kwargs[fs_name][setting_name] = value
            else:
                kwargs[fs_name] = {setting_name: value}
        else:
            kwargs[key] = value

    if args.log_file:
        logging.basicConfig(
            level=logging.DEBUG,
            filename=args.log_file,
            format="%(asctime)s %(message)s",
        )

        class LoggingFUSEr(FUSEr, LoggingMixIn):
            pass

        fuser = LoggingFUSEr
    else:
        fuser = FUSEr

    fs, url_path = url_to_fs(args.url, **kwargs)
    logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
    run(
        fs,
        args.source_path,
        args.mount_point,
        foreground=args.foreground,
        threads=args.threads,
        ready_file=args.ready_file,
        ops_class=fuser,
    )
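
For reference, the option parsing above nests prefixed settings by protocol; the
ftp example in the docstring would produce kwargs equivalent to this sketch:

# -o 'ftp-listings_expiry_time=60[int]' -o 'ftp-username=anonymous'
kwargs = {"ftp": {"listings_expiry_time": 60, "username": "anonymous"}}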