def resolve(cls, uri: str) -> Iterable[str]:
    """Resolve the dataset URI, and return a list of parquet files.

    The resolver registered under the URI's scheme handles the request.
    A URI whose scheme has no registered resolver is delegated to the
    resolver registered under ``cls._DEFAULT_SCHEME``.

    Parameters
    ----------
    uri : str
        URI of the dataset

    Return
    ------
    Iterable[str]
        The result of the selected resolver's ``resolve(uri)``.
    """
    scheme = urlparse(uri).scheme
    if scheme not in cls._RESOLVERS:
        # Unknown (or empty) scheme: fall back to the default resolver.
        return cls._RESOLVERS[cls._DEFAULT_SCHEME].resolve(uri)
    logger.debug("Use extended resolver for scheme: %s", scheme)
    return cls._RESOLVERS[scheme].resolve(uri)
def validate(self):
    """Validate the model spec against ``MODEL_SPEC_SCHEMA``.

    Raises
    ------
    SpecError
        If the spec is not well-formatted. The underlying
        ``ValidationError`` is chained as the cause.
    """
    spec = self._spec
    logger.debug("Validating spec: %s", spec)
    try:
        validate(instance=spec, schema=MODEL_SPEC_SCHEMA)
    except ValidationError as err:
        # Surface schema violations as the project's own error type.
        raise SpecError(err.message) from err
def get_schema(cls, uri: str):
    """Get the schema of the dataset.

    The resolver registered under the URI's scheme is used. When no
    resolver is registered for that scheme, the one registered under
    ``cls._DEFAULT_SCHEME`` is used instead.

    Parameters
    ----------
    uri : str
        URI of the dataset

    Return
    ------
    The result of the selected resolver's ``get_schema(uri)``.
    """
    scheme = urlparse(uri).scheme
    if scheme in cls._RESOLVERS:
        logger.debug("Use extended resolver for scheme: %s", scheme)
        chosen = cls._RESOLVERS[scheme]
    else:
        chosen = cls._RESOLVERS[cls._DEFAULT_SCHEME]
    return chosen.get_schema(uri)
def copy(source: str, dest: str) -> str:
    """Copy a file from source to destination, and return the URI of the copied file.

    Parameters
    ----------
    source : str
        The source URI to copy from
    dest : str
        The destination URI or the destination directory. If ``dest`` is a
        URI that ends with a "/", it represents a directory, and the base
        name of ``source`` is appended to it.

    Return
    ------
    str
        Return the URI of destination.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    source = _normalize_uri(source)
    dest = _normalize_uri(dest)
    parsed_source = urlparse(source)
    if dest and dest.endswith("/"):
        dest = join(dest, basename(parsed_source.path))
    parsed_dest = urlparse(dest)
    logger.debug("Copying %s to %s", source, dest)

    if parsed_dest.scheme == parsed_source.scheme == "s3":
        # Direct copy within the same file system.
        s3fs, source_path = fs.FileSystem.from_uri(source)
        _, dest_path = fs.FileSystem.from_uri(dest)
        # NOTE(review): assumes ``fs`` is ``pyarrow.fs``; its filesystems
        # expose ``copy_file`` and have no ``copy`` method.
        s3fs.copy_file(source_path, dest_path)
        return dest

    # TODO: find better i/o utils to copy between filesystems
    src_fs, src_path = fs.FileSystem.from_uri(source)
    dest_fs, dest_path = fs.FileSystem.from_uri(dest)
    # Open the source first so that an unreadable source fails before the
    # destination is created or truncated.
    with src_fs.open_input_stream(src_path) as in_stream:
        with dest_fs.open_output_stream(dest_path) as out_stream:
            shutil.copyfileobj(in_stream, out_stream, _BUFSIZE)
    return dest
def copy(source: str, dest: str) -> str:
    """Copy a file from source to destination, and return the URI of the copied file.

    Parameters
    ----------
    source : str
        The source URI to copy from
    dest : str
        The destination URI or the destination directory. If ``dest`` is a
        URI that ends with a "/", it represents a directory, and the base
        name of ``source`` is appended to it.

    Return
    ------
    str
        Return the URI of destination.
    """
    source = _normalize_uri(source)
    dest = _normalize_uri(dest)
    parsed_source = urlparse(source)
    if dest and dest.endswith("/"):
        dest = join(dest, basename(parsed_source.path))
    parsed_dest = urlparse(dest)
    logger.debug("Copying %s to %s", source, dest)

    if parsed_dest.scheme == parsed_source.scheme:
        # Direct copy within the same file system.
        scheme = parsed_dest.scheme
        if scheme == "s3":
            s3fs, source_path = fs.FileSystem.from_uri(source)
            _, dest_path = fs.FileSystem.from_uri(dest)
            # NOTE(review): assumes ``fs`` is ``pyarrow.fs``; its filesystems
            # expose ``copy_file`` and have no ``copy`` method.
            s3fs.copy_file(source_path, dest_path)
            return dest
        if scheme == "gs":
            _gcsfs().copy(source, dest)
            return dest

    # Generic path: stream the bytes across. The source is opened first so
    # that a failure to open it happens before the destination stream is
    # opened.
    with open_input_stream(source) as in_stream:
        with _open_output_stream(dest) as out_stream:
            shutil.copyfileobj(in_stream, out_stream)
    return dest