Beispiel #1
0
 def resolve(cls, uri: str) -> Iterable[str]:
     """Resolve a dataset URI into the parquet files it refers to.

     The resolver registered for the URI scheme is used when there is one;
     otherwise the call is delegated to the resolver registered under the
     default scheme.
     """
     scheme = urlparse(uri).scheme
     registry = cls._RESOLVERS
     if scheme not in registry:
         # No dedicated resolver for this scheme: use the default one.
         return registry[cls._DEFAULT_SCHEME].resolve(uri)
     logger.debug("Use extended resolver for scheme: %s", scheme)
     return registry[scheme].resolve(uri)
Beispiel #2
0
    def validate(self):
        """Check the model spec against ``MODEL_SPEC_SCHEMA``.

        Raises
        ------
        SpecError
            If schema validation fails. The error carries the validation
            message and is chained to the original ``ValidationError``.
        """
        spec = self._spec
        logger.debug("Validating spec: %s", spec)
        try:
            # Resolves to the module-level ``validate`` function, not to
            # this method.
            validate(instance=spec, schema=MODEL_SPEC_SCHEMA)
        except ValidationError as err:
            raise SpecError(err.message) from err
Beispiel #3
0
    def get_schema(cls, uri: str):
        """Return the schema of the dataset located at ``uri``.

        The resolver registered for the URI scheme is used when there is
        one; otherwise the call is delegated to the resolver registered
        under the default scheme.

        Parameters
        ----------
        uri : str
            URI of the dataset
        """
        scheme = urlparse(uri).scheme
        registry = cls._RESOLVERS
        if scheme not in registry:
            # No dedicated resolver for this scheme: use the default one.
            return registry[cls._DEFAULT_SCHEME].get_schema(uri)
        logger.debug("Use extended resolver for scheme: %s", scheme)
        return registry[scheme].get_schema(uri)
Beispiel #4
0
def copy(source: str, dest: str) -> str:
    """Copy a file from source to destination, and return the URI of
    the copied file.

    Parameters
    ----------
    source : str
        The source URI to copy from.
    dest : str
        The destination URI or the destination directory. If ``dest``
        ends with a "/", it is treated as a directory and the base name of
        the source path is appended to it.

    Returns
    -------
    str
        The URI of the destination file.
    """
    # Function-scope import so this block does not depend on the
    # module's import list.
    import shutil

    source = _normalize_uri(source)
    dest = _normalize_uri(dest)
    parsed_source = urlparse(source)
    if dest and dest.endswith("/"):
        dest = join(dest, basename(parsed_source.path))
    parsed_dest = urlparse(dest)
    logger.debug("Copying %s to %s", source, dest)

    if parsed_source.scheme == parsed_dest.scheme == "s3":
        # Both ends are on S3: copy directly within the same filesystem.
        s3fs, source_path = fs.FileSystem.from_uri(source)
        _, dest_path = fs.FileSystem.from_uri(dest)
        # NOTE(review): assumes ``fs`` is ``pyarrow.fs``; its FileSystem
        # exposes ``copy_file`` and has no ``copy`` method -- confirm.
        s3fs.copy_file(source_path, dest_path)
        return dest

    # TODO: find better i/o utils to copy between filesystems
    src_fs, src_path = fs.FileSystem.from_uri(source)
    dest_fs, dest_path = fs.FileSystem.from_uri(dest)
    # Open the source first, so a source that cannot be opened fails
    # before the destination stream is opened.
    with src_fs.open_input_stream(src_path) as in_stream:
        with dest_fs.open_output_stream(dest_path) as out_stream:
            shutil.copyfileobj(in_stream, out_stream, _BUFSIZE)
    return dest
Beispiel #5
0
def copy(source: str, dest: str) -> str:
    """Copy a file from source to destination, and return the URI of
    the copied file.

    Parameters
    ----------
    source : str
        The source URI to copy from.
    dest : str
        The destination URI or the destination directory. If ``dest``
        ends with a "/", it is treated as a directory and the base name of
        the source path is appended to it.

    Returns
    -------
    str
        The URI of the destination file.
    """
    source = _normalize_uri(source)
    dest = _normalize_uri(dest)
    parsed_source = urlparse(source)
    if dest and dest.endswith("/"):
        dest = join(dest, basename(parsed_source.path))
    parsed_dest = urlparse(dest)
    logger.debug("Copying %s to %s", source, dest)

    if parsed_dest.scheme == parsed_source.scheme:
        # Direct copy with the same file system
        scheme = parsed_dest.scheme
        if scheme == "s3":
            s3fs, source_path = fs.FileSystem.from_uri(source)
            _, dest_path = fs.FileSystem.from_uri(dest)
            # NOTE(review): assumes ``fs`` is ``pyarrow.fs``; its
            # FileSystem exposes ``copy_file`` and has no ``copy``
            # method -- confirm.
            s3fs.copy_file(source_path, dest_path)
            return dest
        if scheme == "gs":
            _gcsfs().copy(source, dest)
            return dest

    # Generic path: stream the bytes across. The source is opened first,
    # so a source that cannot be opened fails before the destination
    # stream is opened.
    with open_input_stream(source) as in_stream:
        with _open_output_stream(dest) as out_stream:
            shutil.copyfileobj(in_stream, out_stream)
    return dest