import secrets
import shutil
import weakref
from contextlib import suppress

from fsspec import AbstractFileSystem
from fsspec.utils import infer_storage_options
from pyarrow.hdfs import HadoopFileSystem

# NOTE: `wrap_exceptions` and `HDFSFile` are referenced below but not shown
# in this excerpt; they are assumed to be defined elsewhere in this module.


class PyArrowHDFS(AbstractFileSystem):
    """Adapted version of Arrow's HadoopFileSystem

    This is a very simple wrapper over pyarrow.hdfs.HadoopFileSystem,
    which passes on all calls to the underlying class.
    """

    protocol = "hdfs", "file"

    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        driver="libhdfs",
        extra_conf=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs is the JNI library and the default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        super().__init__(**kwargs)
        client = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            driver=driver,
            extra_conf=extra_conf,
        )
        # Bind the client to a local name so the finalizer does not close
        # over `self`, which would keep the instance alive and never fire.
        weakref.finalize(self, client.close)
        self.client = client
        self.pars = (host, port, user, kerb_ticket, driver, extra_conf)

    @staticmethod
    def _get_kwargs_from_urls(path):
        ops = infer_storage_options(path)
        out = {}
        if ops.get("host", None):
            out["host"] = ops["host"]
        if ops.get("username", None):
            out["user"] = ops["username"]
        if ops.get("port", None):
            out["port"] = ops["port"]
        return out

    @classmethod
    def _strip_protocol(cls, path):
        ops = infer_storage_options(path)
        path = ops["path"]
        # infer_storage_options leaves file:/ prefixes alone
        # for local hdfs instances
        if path.startswith("file:"):
            path = path[5:]
        return path
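
    # Illustration (not from the original source; the URL is hypothetical):
    # for "hdfs://alice@namenode:8020/data/file.csv", _get_kwargs_from_urls
    # returns {"host": "namenode", "user": "alice", "port": 8020}, and
    # _strip_protocol returns "/data/file.csv".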

    def __reduce_ex__(self, protocol):
        return PyArrowHDFS, self.pars

    def close(self):
        self.client.close()

    @wrap_exceptions
    def ls(self, path, detail=True):
        listing = [
            self._adjust_entry(entry)
            for entry in self.client.ls(path, detail=True)
        ]
        if detail:
            return listing
        else:
            return [entry["name"] for entry in listing]

    @wrap_exceptions
    def info(self, path):
        return self._adjust_entry(self.client.info(path))

    def _adjust_entry(self, original_entry):
        entry = original_entry.copy()
        if "type" not in entry:
            if "kind" in entry:
                entry["type"] = entry["kind"]
        if "name" not in entry:
            if "path" in entry:
                entry["name"] = entry["path"]
        if "name" in entry:
            entry["name"] = self._strip_protocol(entry["name"])
        return entry

    @wrap_exceptions
    def cp_file(self, lpath, rpath, **kwargs):
        if self.isdir(lpath):
            self.makedirs(rpath)
            return

        with self.open(lpath) as lstream:
            tmp_fname = "/".join(
                [self._parent(rpath), f".tmp.{secrets.token_hex(16)}"]
            )
            # Perform an atomic copy (stream to a temporary file and
            # move it to the actual destination).
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.client.mv(tmp_fname, rpath)
            except BaseException:  # noqa
                with suppress(FileNotFoundError):
                    self.client.rm(tmp_fname)
                raise

    @wrap_exceptions
    def rm_file(self, path):
        return self.client.rm(path)

    @wrap_exceptions
    def makedirs(self, path, exist_ok=False):
        if not exist_ok and self.exists(path):
            raise FileExistsError(path)
        return self.client.mkdir(path, create_parents=True)

    @wrap_exceptions
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        path: str
            Location of file; should start with '/'
        mode: str
        block_size: int
            Hadoop block size, e.g., 2**26
        autocommit: bool
            Transactions are not yet implemented for HDFS; errors if not True
        kwargs: dict or None
            Hadoop config parameters

        Returns
        -------
        HDFSFile file-like instance
        """
        return HDFSFile(
            self,
            path,
            mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            **kwargs,
        )
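

# Example usage: a minimal sketch, not part of the original module. It
# assumes a reachable HDFS namenode (the host, port, and user below are
# hypothetical placeholders) and a working libhdfs/JNI installation.
if __name__ == "__main__":
    fs = PyArrowHDFS(host="namenode.example.com", port=8020, user="hdfs")
    fs.makedirs("/tmp/pyarrowhdfs-demo", exist_ok=True)
    # Write a small payload through _open / HDFSFile
    with fs.open("/tmp/pyarrowhdfs-demo/hello.txt", "wb") as f:
        f.write(b"hello, hdfs\n")
    # List names only, then show the normalized info entry
    print(fs.ls("/tmp/pyarrowhdfs-demo", detail=False))
    print(fs.info("/tmp/pyarrowhdfs-demo/hello.txt"))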