def copy(source, dest, filesystem_from=None, filesystem_to=None):
    """
    Copy a file from the source to the destination file system

    :param source: (str) urlpath of the file to copy
    :param dest: (str) urlpath of the folder where to save the file
    :param filesystem_from: (`fsspec` compatible file system instance)
    :param filesystem_to: (`fsspec` compatible file system instance)
    :return: (str) urlpath of the copied file
    """
    _, filename = os.path.split(source)
    target = os.path.join(dest, filename)
    filesystem_from = filesystem_from or \
        fsspec.filesystem(split_protocol(source)[0])
    filesystem_to = filesystem_to or \
        fsspec.filesystem(split_protocol(dest)[0])
    with filesystem_from.open(source, "rb") as f_read:
        filesystem_to.makedirs(dest, exist_ok=True)
        with filesystem_to.open(target, "wb") as f_write:
            if isinstance(filesystem_to, dcachefs.dCacheFileSystem):
                # dCache supports streaming upload of a file-like object
                f_write.write(f_read)
            else:
                # otherwise copy in fixed-size chunks to bound memory use
                data = True
                while data:
                    data = f_read.read(CHUNKSIZE)
                    f_write.write(data)
    return target
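# A minimal usage sketch for `copy` above. The paths are hypothetical, and the
# sketch assumes the snippet's CHUNKSIZE constant and dcachefs import are in
# scope; filesystems may also be passed in explicitly, as done here:
import fsspec

local_fs = fsspec.filesystem("file")
target = copy(
    "/tmp/data.csv",        # source urlpath (made up)
    "/tmp/backup",          # destination folder (made up)
    filesystem_from=local_fs,
    filesystem_to=local_fs,
)
# target == "/tmp/backup/data.csv"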
@contextlib.contextmanager
def upload_output_directory(url):
    if url is None:
        yield None, None
        return

    protocol, _ = split_protocol(url)
    if protocol is not None:
        # To avoid extra network load, write all output files locally at runtime,
        # then upload to the remote fs at the end.
        with tempfile.TemporaryDirectory() as tmpdir:
            fs, remote_path = get_fs_and_path(url)
            if path_exists(url):
                # pull down existing remote contents so local writes merge with them
                fs.get(url, tmpdir + "/", recursive=True)

            def put_fn():
                fs.put(tmpdir, remote_path, recursive=True)

            # Write to temp directory locally
            yield tmpdir, put_fn

            # Upload to remote when finished
            put_fn()
    else:
        # Just use the output directory directly if using a local filesystem
        makedirs(url, exist_ok=True)
        yield url, None
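# A hedged usage sketch, assuming the `contextlib.contextmanager` wrapping
# above; "s3://my-bucket/results" is a made-up destination. Files are written
# locally inside the block and pushed to the remote fs when it exits:
with upload_output_directory("s3://my-bucket/results") as (outdir, put_fn):
    with open(os.path.join(outdir, "metrics.json"), "w") as f:
        f.write('{"loss": 0.1}')
    if put_fn is not None:
        put_fn()  # optional mid-run upload; the final upload happens automatically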
def read_bytes_collection(
    vineyard_socket, prefix, storage_options, proc_num, proc_index
):
    """Read a set of files as a collection of ByteStreams."""
    client = vineyard.connect(vineyard_socket)

    protocol, prefix_path = split_protocol(prefix)
    fs = fsspec.filesystem(protocol, **storage_options)

    worker_prefix = os.path.join(prefix_path, '%s-%s' % (proc_num, proc_index))

    logger.info("start creating blobs ...")
    queue: "ConcurrentQueue[Tuple[ByteStream, str]]" = ConcurrentQueue()
    stream_id = read_stream_collections(client, fs, queue, worker_prefix, worker_prefix)

    client.persist(stream_id)
    report_success(stream_id)

    logger.info("start reading blobs ...")
    executor = ThreadStreamExecutor(
        ReadToByteStreamExecutor,
        parallism=1,
        client=client,
        fs=fs,
        task_queue=queue,
        chunk_size=CHUNK_SIZE,
    )
    executor.execute()
def rename(src, tgt):
    protocol, _ = split_protocol(tgt)
    if protocol is not None:
        # remote target: let fsspec move (copy + delete) recursively
        fs = fsspec.filesystem(protocol)
        fs.mv(src, tgt, recursive=True)
    else:
        safe_move_file(src, tgt)
def __init__(self, path=None, **storage_options):
    from fsspec import filesystem
    from fsspec.core import split_protocol

    self.pdir = make_path_posix(path or conf.get('persist_path'))
    protocol, _ = split_protocol(self.pdir)
    path = posixpath.join(self.pdir, 'cat.yaml')
    self.fs = filesystem(protocol, **storage_options)
    super(PersistStore, self).__init__(path)
def get_fs_and_path(url):
    protocol, path = split_protocol(url)
    # Parse the url to get only the escaped url path
    path = unquote(urlparse(path).path)
    # Create a Windows-compatible path from the url path
    path = os.fspath(pathlib.PurePosixPath(path))
    fs = fsspec.filesystem(protocol)
    return fs, path
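# A small runnable check of `get_fs_and_path` using fsspec's built-in
# in-memory filesystem (no cloud credentials needed; the path is made up):
fs, path = get_fs_and_path("memory://data/some%20dir/file.csv")
print(type(fs).__name__)  # MemoryFileSystem
print(path)               # data/some dir/file.csv -- percent-escapes unquoted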
def _get_fs_and_protocol(self):
    storage_options = self.storage_options or {}
    protocol, path = split_protocol(self.prefix_path)
    cls = fsspec.get_filesystem_class(protocol)
    # merge kwargs inferred from the URL with the user-supplied options
    options = cls._get_kwargs_from_urls(self.prefix_path)
    update_storage_options(options, storage_options)
    fs = cls(**options)
    return fs, protocol
def _get_protocol_path(self, urlpath) -> Tuple[str, List[str]]:
    if isinstance(urlpath, str):
        return split_protocol(urlpath)
    # a list of urlpaths: all entries must share a single protocol
    protocols, paths = zip(*map(split_protocol, urlpath))
    assert (
        len(set(protocols)) == 1
    ), "Cannot mix file protocols in a single operation"
    return protocols[0], list(paths)
@contextlib.contextmanager
def upload_output_file(url):
    """Takes a remote URL as input, yields a temp filename, then uploads it when done."""
    protocol, _ = split_protocol(url)
    if protocol is not None:
        fs = fsspec.filesystem(protocol)
        with tempfile.TemporaryDirectory() as tmpdir:
            local_fname = os.path.join(tmpdir, "tmpfile")
            yield local_fname
            fs.put(local_fname, url, recursive=True)
    else:
        yield url
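# Hedged usage sketch for `upload_output_file`, assuming the
# `contextlib.contextmanager` wrapping above (the URL is made up):
with upload_output_file("s3://my-bucket/model.bin") as local_path:
    with open(local_path, "wb") as f:
        f.write(b"\x00" * 16)  # written locally; the upload happens on exit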
def get_dir(path):
    if '://' in path:
        protocol, _ = split_protocol(path)
        out = get_filesystem_class(protocol)._parent(path)
        if "://" not in out:
            # some FSs strip the protocol, some do not
            out = protocol + "://" + out
        return out
    path = make_path_posix(os.path.join(os.getcwd(), os.path.dirname(path)))
    if path[-1] != '/':
        path += '/'
    return path
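# A quick runnable check of `get_dir` on a relative local path (the remote
# branch goes through the filesystem class's private `_parent` helper and
# depends on the protocol in use):
print(get_dir("data/file.txt"))  # -> "<cwd>/data/" (posix-style, trailing "/")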
@contextlib.contextmanager
def upload_h5(url):
    protocol, _ = split_protocol(url)
    if protocol is not None:
        # remote: write the HDF5 file locally, then upload it on exit
        fs = fsspec.filesystem(protocol)
        with tempfile.TemporaryDirectory() as tmpdir:
            local_fname = os.path.join(tmpdir, 'file.h5')
            with h5py.File(local_fname, 'w') as f:
                yield f
            fs.put(local_fname, url, recursive=True)
    else:
        # local: open in place, appending if the file already exists
        mode = 'r+' if path_exists(url) else 'w'
        with h5py.File(url, mode) as f:
            yield f
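# Hedged usage sketch for `upload_h5`, again assuming the
# `contextlib.contextmanager` wrapping (URL and dataset are made up):
with upload_h5("s3://my-bucket/weights.h5") as f:
    f.create_dataset("w", data=[1.0, 2.0, 3.0])  # written locally, uploaded on exit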
def decode(
    self,
    ctx: FlyteContext,
    flyte_value: literals.StructuredDataset,
    current_task_metadata: StructuredDatasetMetadata,
) -> pa.Table:
    uri = flyte_value.uri
    if not ctx.file_access.is_remote(uri):
        Path(uri).parent.mkdir(parents=True, exist_ok=True)
    _, path = split_protocol(uri)

    columns = None
    if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
        columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
    try:
        fp = FSSpecPersistence(data_config=ctx.file_access.data_config)
        fs = fp.get_filesystem(uri)
        return pq.read_table(path, filesystem=fs, columns=columns)
    except NoCredentialsError as e:
        logger.debug("S3 source detected, attempting anonymous S3 access")
        fs = FSSpecPersistence.get_anonymous_filesystem(uri)
        if fs is not None:
            return pq.read_table(path, filesystem=fs, columns=columns)
        raise e
def _get_fs_and_protocol(self):
    protocol, path = split_protocol(self.prefix_path)
    fs = fsspec.filesystem(protocol, **self.storage_options)
    return fs, protocol
def get_path(path):
    protocol, _ = split_protocol(path)
    if protocol is not None:
        return path
    return pathlib.Path(os.path.abspath(path)).as_uri()
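# `get_path` leaves protocol-qualified URLs alone and normalizes bare local
# paths to file:// URIs:
print(get_path("s3://bucket/key"))  # "s3://bucket/key" (already has a protocol)
print(get_path("data.csv"))         # e.g. "file:///current/working/dir/data.csv"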
def get_localized_path(self, path):
    _, lpath = split_protocol(path)
    return lpath
def upgrade_http(urlpath):
    protocol, url = split_protocol(urlpath)
    if protocol == "http":
        return "https://" + url
    return None
def is_http(urlpath):
    protocol, _ = split_protocol(urlpath)
    return protocol in ("http", "https")
def has_remote_protocol(url):
    protocol, _ = split_protocol(url)
    return protocol and protocol != "file"
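# The three helpers above all key off split_protocol; a quick runnable
# demonstration (pure fsspec, no network access):
from fsspec.core import split_protocol

print(split_protocol("http://example.com/a"))  # ('http', 'example.com/a')
print(split_protocol("/local/file"))           # (None, '/local/file')
print(upgrade_http("http://example.com/a"))    # 'https://example.com/a'
print(is_http("https://example.com/a"))        # True
print(has_remote_protocol("file:///tmp/x"))    # False -- explicit "file" is local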
def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream, which will
    later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configurations of external storage.
        read_options (dict): Additional options that could control the behavior of read.
        proc_num (int): Total number of processes.
        proc_index (int): The index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        parsed = urlparse(path)
        try:
            fs = fsspec.filesystem(parsed.scheme)
        except ValueError as e:
            report_status("error", str(e))
            raise
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            report_status(
                "error",
                f"Some serialization file cannot be found. "
                f"Expected: {meta_file} and {blob_file}",
            )
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))

        # Used for reading the bytes of a serialized graph
        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
        lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)

        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            # some fsspec file objects expose size as a method, others as an attribute
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
                buf_writer.write(buf)
                buf_writer.close()
        writer.finish()
    else:
        # Used when reading tables from external storage,
        # usually for loading a property graph.
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        try:
            protocol = split_protocol(path)[0]
            fs = fsspec.filesystem(protocol, **storage_options)
        except Exception:
            report_status("error", f"Cannot initialize such filesystem for '{path}'")
            raise
        if fs.isfile(path):
            files = [path]
        else:
            try:
                files = fs.glob(path + '*')
                assert files, f"Cannot find such files: {path}"
            except Exception:
                report_status("error", f"Cannot find such files for '{path}'")
                raise

        ''' Note [Semantic of read_block with delimiter]:

        read_block(fp, begin, size, delimiter) will:

            - find the first `delimiter` from `begin`, then start to read;
            - after `size`, read on until the next `delimiter` or EOF, then finish.

        Note that the returned size may exceed `size`.
        '''
        chunk_size = 1024 * 1024 * 4
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line when processing the first file,
                # and open the writer when processing the first file.
                if index == 0:
                    header_line = read_block(f, 0, 1, b'\n')
                    builder["header_line"] = header_line.decode("unicode_escape")
                    if header_row:
                        offset = len(header_line)
                    stream = builder.seal(client)
                    client.persist(stream)
                    ret = {"type": "return", "content": repr(stream.id)}
                    print(json.dumps(ret), flush=True)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buf = read_block(f, begin, min(chunk_size, end - begin), delimiter=b"\n")
                    size = len(buf)
                    if not size:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
                    buf_writer.write(buf)
                    buf_writer.close()
        writer.finish()
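# A self-contained illustration of Note [Semantic of read_block with delimiter]
# above, using plain fsspec on an in-memory file (the demo data is made up):
import io
from fsspec.utils import read_block

f = io.BytesIO(b"aaa\nbbb\nccc\nddd\n")
# Offset 0 is treated as a record boundary; the read is then extended past the
# requested 5 bytes up to the next b"\n", so only whole records come back.
print(read_block(f, 0, 5, delimiter=b"\n"))  # b'aaa\nbbb\n' -- 8 bytes > 5
# This is also why the functions above call read_block(f, 0, 1, b'\n') to grab
# exactly the header line, however long it is:
print(read_block(f, 0, 1, delimiter=b"\n"))  # b'aaa\n'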
def get_fs_and_path(url):
    protocol, path = split_protocol(url)
    fs = fsspec.filesystem(protocol)
    return fs, path
def read_bytes(  # noqa: C901
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream, which will
    later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configurations of external storage.
        read_options (dict): Additional options that could control the behavior of read.
        proc_num (int): Total number of processes.
        proc_index (int): The index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    params = dict()

    read_block_delimiter = read_options.pop('read_block_delimiter', '\n')
    if read_block_delimiter is not None:
        read_block_delimiter = read_block_delimiter.encode('utf-8')

    # Used when reading tables from external storage,
    # usually for loading a property graph.
    header_row = read_options.get("header_row", False)
    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            params[k] = "1" if v else "0"
        elif k == "delimiter":
            params[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            params[k] = v

    try:
        protocol = split_protocol(path)[0]
        fs = fsspec.filesystem(protocol, **storage_options)
    except Exception:
        report_error(
            f"Cannot initialize such filesystem for '{path}', "
            f"exception is:\n{traceback.format_exc()}"
        )
        sys.exit(-1)

    if fs.isfile(path):
        files = [path]
    else:
        try:
            files = fs.glob(path + '*')
            assert files, f"Cannot find such files: {path}"
        except Exception:
            report_error(f"Cannot find such files for '{path}'")
            sys.exit(-1)

    ''' Note [Semantic of read_block with delimiter]:

    read_block(fp, begin, size, delimiter) will:

        - find the first `delimiter` from `begin`, then start to read;
        - after `size`, read on until the next `delimiter` or EOF, then finish.

    Note that the returned size may exceed `size`.
    '''
    stream, writer = None, None
    if 'chunk_size' in storage_options:
        chunk_size = parse_readable_size(storage_options['chunk_size'])
    else:
        chunk_size = 1024 * 1024 * 64  # default: 64MB
    try:
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line when processing the first file,
                # and open the writer when processing the first file.
                if index == 0:
                    if header_row:
                        header_line = read_block(f, 0, 1, read_block_delimiter)
                        params["header_line"] = header_line.decode("unicode_escape")
                        offset = len(header_line)
                    stream = ByteStream.new(client, params)
                    client.persist(stream.id)
                    report_success(stream.id)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buffer = read_block(
                        f,
                        begin,
                        min(chunk_size, end - begin),
                        delimiter=read_block_delimiter,
                    )
                    size = len(buffer)
                    if size <= 0:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    vineyard.memory_copy(chunk, 0, buffer)
        writer.finish()
    except Exception:
        report_exception()
        if writer is not None:
            writer.fail()
        sys.exit(-1)
def move_asset_file_to_item(item,
                            asset_href,
                            asset_subdirectory=None,
                            copy=False,
                            ignore_conflicts=False):
    """Moves an asset file to be alongside that item.

    Args:
        item (Item): The PySTAC Item to perform the asset transformation on.
        asset_href (str): The absolute HREF to the asset file.
        asset_subdirectory (str or None): A subdirectory that will be used
            to store the assets. If not supplied, the assets will be moved
            or copied to the same directory as their item.
        copy (bool): If False this function will move the asset file; if True,
            the asset file will be copied.
        ignore_conflicts (bool): If the asset destination file already exists,
            this function will throw an error unless ignore_conflicts is True.

    Returns:
        str: The new absolute href for the asset file
    """
    item_href = item.get_self_href()
    if item_href is None:
        raise ValueError(
            'Self HREF is not available for item {}. This operation '
            'requires that the Item HREFs are available.'.format(item))

    if not is_absolute_href(asset_href):
        raise ValueError('asset_href must be absolute.')

    item_dir = os.path.dirname(item_href)
    fname = os.path.basename(asset_href)
    if asset_subdirectory is None:
        target_dir = item_dir
    else:
        target_dir = os.path.join(item_dir, asset_subdirectory)
    new_asset_href = os.path.join(target_dir, fname)

    if asset_href != new_asset_href:
        dest_protocol = split_protocol(new_asset_href)[0]
        fs_dest = get_filesystem_class(dest_protocol)()

        op = None
        if fs_dest.exists(new_asset_href):
            if not ignore_conflicts:
                raise FileExistsError(
                    '{} already exists'.format(new_asset_href))
        else:
            if copy:

                def _op1(dry_run=False):
                    logger.info("Copying {} to {}...".format(
                        asset_href, new_asset_href))
                    if not dry_run:
                        fs_dest.makedirs(os.path.dirname(new_asset_href),
                                         exist_ok=True)
                        with fsspec.open(asset_href, 'rb') as f_src:
                            with fsspec.open(new_asset_href, 'wb') as f_dst:
                                f_dst.write(f_src.read())

                op = _op1
            else:
                source_protocol = split_protocol(asset_href)[0]

                if source_protocol == dest_protocol:
                    # same filesystem: a plain move suffices

                    def _op2(dry_run=False):
                        logger.info("Moving {} to {}...".format(
                            asset_href, new_asset_href))
                        if not dry_run:
                            fs_dest.makedirs(os.path.dirname(new_asset_href),
                                             exist_ok=True)
                            fs_dest.move(asset_href, new_asset_href)

                    op = _op2
                else:
                    # cross-filesystem move: copy the bytes, then delete the source

                    def _op3(dry_run=False):
                        logger.info("Moving {} to {}...".format(
                            asset_href, new_asset_href))
                        if not dry_run:
                            fs_source = get_filesystem_class(source_protocol)()
                            fs_dest.makedirs(os.path.dirname(new_asset_href),
                                             exist_ok=True)
                            with fsspec.open(asset_href, 'rb') as f_src:
                                with fsspec.open(new_asset_href, 'wb') as f_dst:
                                    f_dst.write(f_src.read())
                            fs_source.delete(asset_href)

                    op = _op3

        if op is not None:
            op(dry_run=False)

    return new_asset_href
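# A hedged usage sketch (hypothetical `item` with a self HREF already set, and
# a made-up local asset path); the returned HREF can be written back onto the asset:
new_href = move_asset_file_to_item(
    item,
    "/data/assets/scene.tif",
    asset_subdirectory="assets",
    copy=True,                # copy rather than move the source file
    ignore_conflicts=False,   # raise FileExistsError on an existing target
)
item.assets["scene"].href = new_href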