def __init__(self, host="default", port=0, user=None, kerb_ticket=None, driver='libhdfs', extra_conf=None, **kwargs): """ Parameters ---------- host: str Hostname, IP or "default" to try to read from Hadoop config port: int Port to connect on, or default from Hadoop config if 0 user: str or None If given, connect as this username kerb_ticket: str or None If given, use this ticket for authentication driver: 'libhdfs' or 'libhdfs3' Binary driver; libhdfs if the JNI library and default extra_conf: None or dict Passed on to HadoopFileSystem """ AbstractFileSystem.__init__(self, **kwargs) self.pars = (host, port, user, kerb_ticket, driver, extra_conf) self.pahdfs = HadoopFileSystem(host=host, port=port, user=user, kerb_ticket=kerb_ticket, driver=driver, extra_conf=extra_conf)
from pyarrow.hdfs import HadoopFileSystem

from fsspec.spec import AbstractFileSystem
from fsspec.utils import infer_storage_options

# NOTE: HDFSFile (the buffered file class returned by _open) is defined
# elsewhere in the same module and is not shown in these snippets.


class PyArrowHDFS(AbstractFileSystem):
    """Adapted version of Arrow's HadoopFileSystem

    This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which
    passes on all calls to the underlying class.
    """

    def __init__(self, host="default", port=0, user=None, kerb_ticket=None,
                 driver="libhdfs", extra_conf=None, **kwargs):
        """
        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs is the JNI library and the default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        if self._cached:
            return
        AbstractFileSystem.__init__(self, **kwargs)
        self.pars = (host, port, user, kerb_ticket, driver, extra_conf)
        self.pahdfs = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            driver=driver,
            extra_conf=extra_conf,
        )

    def _open(self, path, mode="rb", block_size=None, autocommit=True,
              cache_options=None, **kwargs):
        """
        Parameters
        ----------
        path: str
            Location of file; should start with '/'
        mode: str
        block_size: int
            Hadoop block size, e.g., 2**26
        autocommit: True
            Transactions are not yet implemented for HDFS; errors if not True
        kwargs: dict or None
            Hadoop config parameters

        Returns
        -------
        HDFSFile file-like instance
        """
        return HDFSFile(
            self,
            path,
            mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            **kwargs,
        )

    def __reduce_ex__(self, protocol):
        return PyArrowHDFS, self.pars

    def ls(self, path, detail=True):
        out = self.pahdfs.ls(path, detail)
        if detail:
            for p in out:
                p["type"] = p["kind"]
                p["name"] = self._strip_protocol(p["name"])
        else:
            out = [self._strip_protocol(p) for p in out]
        return out

    @staticmethod
    def _get_kwargs_from_urls(paths):
        ops = infer_storage_options(paths)
        out = {}
        if ops.get("host", None):
            out["host"] = ops["host"]
        if ops.get("username", None):
            out["user"] = ops["username"]
        if ops.get("port", None):
            out["port"] = ops["port"]
        return out

    @classmethod
    def _strip_protocol(cls, path):
        ops = infer_storage_options(path)
        return ops["path"]

    def __getattribute__(self, item):
        if item in [
            "_open", "__init__", "__getattribute__", "__reduce_ex__",
            "open", "ls", "makedirs",
        ]:
            # all the methods defined in this class. Note `open` here, since
            # it calls `_open`, but is actually in superclass
            return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw)
        if item == "__class__":
            return PyArrowHDFS
        d = object.__getattribute__(self, "__dict__")
        pahdfs = d.get("pahdfs", None)  # fs is not immediately defined
        if pahdfs is not None and item in [
            # attributes delegated directly to the underlying pyarrow client
            "chmod", "chown", "user", "df", "disk_usage", "download",
            "driver", "exists", "extra_conf", "get_capacity",
            "get_space_used", "host", "is_open", "kerb_ticket",
            "strip_protocol", "mkdir", "mv", "port", "upload",
            "_get_kwargs_from_urls", "read_parquet", "rm", "stat",
        ]:
            return getattr(pahdfs, item)
        else:
            # attributes of the superclass, while target is being set up
            return super().__getattribute__(item)
class PyArrowHDFS(AbstractFileSystem):
    """Adapted version of Arrow's HadoopFileSystem

    This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which
    passes on all calls to the underlying class.
    """

    def __init__(self, host="default", port=0, user=None, kerb_ticket=None,
                 driver='libhdfs', extra_conf=None, **kwargs):
        """
        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs is the JNI library and the default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        AbstractFileSystem.__init__(self, **kwargs)
        self.pars = (host, port, user, kerb_ticket, driver, extra_conf)
        self.pahdfs = HadoopFileSystem(host=host, port=port, user=user,
                                       kerb_ticket=kerb_ticket, driver=driver,
                                       extra_conf=extra_conf)

    def _open(self, path, mode='rb', block_size=None, autocommit=True,
              **kwargs):
        """
        Parameters
        ----------
        path: str
            Location of file; should start with '/'
        mode: str
        block_size: int
            Hadoop block size, e.g., 2**26
        autocommit: True
            Transactions are not yet implemented for HDFS; errors if not True
        kwargs: dict or None
            Hadoop config parameters

        Returns
        -------
        HDFSFile file-like instance
        """
        if not autocommit:
            raise NotImplementedError
        return HDFSFile(self, path, mode, block_size, **kwargs)

    def __reduce_ex__(self, protocol):
        return PyArrowHDFS, self.pars

    def ls(self, path, detail=True):
        out = self.pahdfs.ls(path, detail)
        if detail:
            for p in out:
                p['type'] = p['kind']
        return out

    @staticmethod
    def _get_kwargs_from_urls(paths):
        ops = infer_storage_options(paths)
        out = {}
        if ops.get('host', None):
            out['host'] = ops['host']
        if ops.get('username', None):
            out['user'] = ops['username']
        if ops.get('port', None):
            out['port'] = ops['port']
        return out

    @classmethod
    def _strip_protocol(cls, path):
        ops = infer_storage_options(path)
        return ops['path']

    def __getattribute__(self, item):
        if item in [
            '_open', '__init__', '__getattribute__', '__reduce_ex__',
            'open', 'ls', 'makedirs',
        ]:
            # all the methods defined in this class. Note `open` here, since
            # it calls `_open`, but is actually in superclass
            return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw)
        if item == '__class__':
            return PyArrowHDFS
        d = object.__getattribute__(self, '__dict__')
        pahdfs = d.get('pahdfs', None)  # fs is not immediately defined
        if pahdfs is not None and item in [
            # attributes delegated directly to the underlying pyarrow client
            'chmod', 'chown', 'user', 'df', 'disk_usage', 'download',
            'driver', 'exists', 'extra_conf', 'get_capacity',
            'get_space_used', 'host', 'is_open', 'kerb_ticket',
            'strip_protocol', 'mkdir', 'mv', 'port', 'upload',
            '_get_kwargs_from_urls', 'read_parquet', 'rm', 'stat',
        ]:
            return getattr(pahdfs, item)
        else:
            # attributes of the superclass, while target is being set up
            return super().__getattribute__(item)
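# Usage sketch (not part of the original source) for the wrapper classes
# above. The namenode host, port and user are placeholder values; `open()`
# comes from AbstractFileSystem and routes through `_open`, returning the
# module's HDFSFile buffered file object.
fs = PyArrowHDFS(host='namenode.example.com', port=8020, user='hadoop')
print(fs.ls('/tmp', detail=False))            # plain listing of the directory
with fs.open('/tmp/example.bin', mode='rb') as f:
    header = f.read(1024)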
import secrets
import shutil
import weakref
from contextlib import suppress

# NOTE: `wrap_exceptions` (a module-local decorator, presumably translating
# pyarrow/HDFS errors into standard OSErrors) and `HDFSFile` are defined
# elsewhere in the same module and are not shown in these snippets.


class PyArrowHDFS(AbstractFileSystem):
    """Adapted version of Arrow's HadoopFileSystem

    This is a very simple wrapper over the pyarrow.hdfs.HadoopFileSystem,
    which passes on all calls to the underlying class.
    """

    protocol = "hdfs", "file"

    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        driver="libhdfs",
        extra_conf=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs is the JNI library and the default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        super().__init__(**kwargs)

        self.client = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            driver=driver,
            extra_conf=extra_conf,
        )
        weakref.finalize(self, lambda: self.client.close())

        self.pars = (host, port, user, kerb_ticket, driver, extra_conf)

    @staticmethod
    def _get_kwargs_from_urls(path):
        ops = infer_storage_options(path)
        out = {}
        if ops.get("host", None):
            out["host"] = ops["host"]
        if ops.get("username", None):
            out["user"] = ops["username"]
        if ops.get("port", None):
            out["port"] = ops["port"]
        return out

    @classmethod
    def _strip_protocol(cls, path):
        ops = infer_storage_options(path)
        path = ops["path"]
        # infer_storage_options leaves file:/ prefixes alone
        # for local hdfs instances
        if path.startswith("file:"):
            path = path[5:]
        return path

    def __reduce_ex__(self, protocol):
        return PyArrowHDFS, self.pars

    def close(self):
        self.client.close()

    @wrap_exceptions
    def ls(self, path, detail=True):
        listing = [
            self._adjust_entry(entry)
            for entry in self.client.ls(path, detail=True)
        ]
        if detail:
            return listing
        else:
            return [entry["name"] for entry in listing]

    @wrap_exceptions
    def info(self, path):
        return self._adjust_entry(self.client.info(path))

    def _adjust_entry(self, original_entry):
        entry = original_entry.copy()
        if "type" not in entry:
            if "kind" in entry:
                entry["type"] = entry["kind"]
        if "name" not in entry:
            if "path" in entry:
                entry["name"] = entry["path"]
        if "name" in entry:
            entry["name"] = self._strip_protocol(entry["name"])
        return entry

    @wrap_exceptions
    def cp_file(self, lpath, rpath, **kwargs):
        if self.isdir(lpath):
            self.makedirs(rpath)
            return

        with self.open(lpath) as lstream:
            tmp_fname = "/".join(
                [self._parent(rpath), f".tmp.{secrets.token_hex(16)}"]
            )
            # Perform an atomic copy (stream to a temporary file and
            # move it to the actual destination).
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.client.mv(tmp_fname, rpath)
            except BaseException:  # noqa
                with suppress(FileNotFoundError):
                    self.client.rm(tmp_fname)
                raise

    @wrap_exceptions
    def rm_file(self, path):
        return self.client.rm(path)

    @wrap_exceptions
    def makedirs(self, path, exist_ok=False):
        if not exist_ok and self.exists(path):
            raise FileExistsError(path)

        return self.client.mkdir(path, create_parents=True)

    @wrap_exceptions
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        path: str
            Location of file; should start with '/'
        mode: str
        block_size: int
            Hadoop block size, e.g., 2**26
        autocommit: True
            Transactions are not yet implemented for HDFS; errors if not True
        kwargs: dict or None
            Hadoop config parameters

        Returns
        -------
        HDFSFile file-like instance
        """
        return HDFSFile(
            self,
            path,
            mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            **kwargs,
        )
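# Usage sketch (not part of the original source) for the newer wrapper above:
# cp_file streams the source into a hidden ".tmp.<hex>" sibling of the target
# and only moves it into place once the copy completes, so readers never see
# a partially written destination. Connection details and paths are placeholders.
fs = PyArrowHDFS(host="namenode.example.com", port=8020)
fs.makedirs("/backup", exist_ok=True)
fs.cp_file("/data/input.csv", "/backup/input.csv")
fs.close()  # also triggered by the weakref.finalize hook when fs is collected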
_HADOOP_CONF_DIR_ENV_VAR_NAME: str = 'HADOOP_CONF_DIR'

# check if running on Linux cluster or local Mac
_ON_LINUX_CLUSTER: bool = sys.platform.startswith('linux')

# detect & set up HDFS client
if _HADOOP_HOME:
    os.environ['ARROW_LIBHDFS_DIR'] = \
        str(Path(_HADOOP_HOME).resolve(strict=True) / 'lib' / 'native')

    try:
        HDFS_CLIENT = HadoopFileSystem()

        try:
            _LOGGER.debug(msg=(msg := 'Testing HDFS...'))

            if HDFS_CLIENT.isdir(path='/'):
                _ON_LINUX_CLUSTER_WITH_HDFS: bool = True
                _LOGGER.debug(msg=f'{msg} done!')
            else:
                _ON_LINUX_CLUSTER_WITH_HDFS: bool = False
                _LOGGER.debug(msg=f'{msg} UNAVAILABLE')

        except Exception:   # pylint: disable=broad-except
            HDFS_CLIENT = None
            _ON_LINUX_CLUSTER_WITH_HDFS: bool = False

    except Exception:   # pylint: disable=broad-except
        # assumed fallback (the snippet is truncated here): the client could
        # not be constructed at all, e.g. libhdfs is missing
        HDFS_CLIENT = None
        _ON_LINUX_CLUSTER_WITH_HDFS: bool = False
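# The detection block above refers to several module-level names that are not
# part of the snippet; the definitions below are assumptions sketched in for
# completeness, not the original code.
import logging
import os
import sys
from pathlib import Path

from pyarrow.hdfs import HadoopFileSystem

_LOGGER = logging.getLogger(__name__)
_HADOOP_HOME = os.environ.get('HADOOP_HOME')   # e.g. '/opt/hadoop'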
class PyArrowHDFS(AbstractFileSystem):
    """Adapted version of Arrow's HadoopFileSystem

    This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which
    passes on all calls to the underlying class.
    """

    def __init__(self, host="default", port=0, user=None, kerb_ticket=None,
                 driver='libhdfs', extra_conf=None):
        """
        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs is the JNI library and the default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        self.driver = HadoopFileSystem(host=host, port=port, user=user,
                                       kerb_ticket=kerb_ticket, driver=driver,
                                       extra_conf=extra_conf)

    def _open(self, path, mode='rb', block_size=None, autocommit=True,
              **kwargs):
        """
        Parameters
        ----------
        path: str
            Location of file; should start with '/'
        mode: str
        block_size: int
            Hadoop block size, e.g., 2**26
        autocommit: True
            Transactions are not yet implemented for HDFS; errors if not True
        kwargs: dict or None
            Hadoop config parameters

        Returns
        -------
        arrow HdfsFile file-like instance
        """
        if not autocommit:
            raise NotImplementedError
        return self.driver.open(path, mode, block_size, **kwargs)

    def __getattr__(self, item):
        if item in [
            'chmod', 'chown', 'df', 'disk_usage', 'download', 'driver',
            'exists', 'extra_conf', 'get_capacity', 'get_space_used', 'host',
            'info', 'is_open', 'isdir', 'isfile', 'kerb_ticket', 'ls',
            'mkdir', 'mv', 'port', 'read_parquet', 'rm', 'stat', 'upload',
            'user', 'walk',
        ]:
            return getattr(self.driver, item)
        # avoid silently returning None for names outside the whitelist
        raise AttributeError(item)
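# Usage sketch (not part of the original source): with the __getattr__
# delegation above, any whitelisted attribute that is not found on the
# wrapper itself is looked up on the underlying pyarrow client, so the calls
# below go straight to HadoopFileSystem. Connection details are placeholders.
fs = PyArrowHDFS(host='namenode.example.com', port=8020, user='hadoop')
print(fs.df())                    # forwarded to HadoopFileSystem.df()
print(fs.isdir('/tmp'))           # forwarded to HadoopFileSystem.isdir()
with fs._open('/tmp/example.bin', mode='rb') as f:
    head = f.read(64)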
import pyarrow
import numpy as np
import scipy
import scipy.optimize
from pyarrow.hdfs import HadoopFileSystem

try:
    # Trigger the JNI load of `$HADOOP_HOME/lib/native/libhdfs.so` at runtime
    HadoopFileSystem('foo')
except Exception:
    pass

evs = np.array([
    0.01855396, 0.02888079, 0.01484719, 0.01187566, 0.01350127,
    0.0152477, 0.02978069, 0.01184938, 0.0152477, 0.01967369,
    0.02334463, -0.00964757, -0.0084154, 0.0093229, 0.00074653,
])

A_eq = np.array([
    [
        -0.17128674, 0.17588126, -0.21854693, 0.35221215, 0.32877443,
        0.35090059, -0.28819657, -0.17272982, 0.35090059, 0.32671732,
        -0.13842946, 0.23981023, 0.1866889, 0.15406733, 0.24219247,
    ],
    [
        0.27321495, -0.28669058, 0.355471, 0.24540659, 0.16261506,
        0.24417405, -0.20448798, 0.27555701, 0.24417405, 0.16159759,
        -0.19235484, -0.38261073, -0.30371767, -0.25482233, -0.16266994,
    ],
])

b_eq = [0, 0]
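# The arrays above look like the data for an equality-constrained linear
# program. The call below is a hedged sketch of one way to feed them to
# scipy.optimize.linprog; treating `evs` as an objective to maximise and
# bounding the variables to [-1, 1] are assumptions, not stated in the snippet.
result = scipy.optimize.linprog(
    c=-evs,                         # linprog minimises, so negate to maximise evs @ x
    A_eq=A_eq,
    b_eq=b_eq,
    bounds=[(-1, 1)] * len(evs),
)
print(result.status, result.fun, result.x)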
_HADOOP_CONF_DIR_ENV_VAR_NAME = 'HADOOP_CONF_DIR'

# check if running on Linux cluster or local Mac
_ON_LINUX_CLUSTER = sys.platform.startswith('linux')

# detect & set up HDFS client
if _HADOOP_HOME:
    os.environ['ARROW_LIBHDFS_DIR'] = \
        os.path.join(_HADOOP_HOME, 'lib', 'native')

    try:
        hdfs_client = HadoopFileSystem()

        try:
            print('Testing HDFS... ', end='')
            if hdfs_client.isdir('/'):
                _ON_LINUX_CLUSTER_WITH_HDFS = True
                print('done!')
            else:
                _ON_LINUX_CLUSTER_WITH_HDFS = False
                print('UNAVAILABLE')
        except Exception:
            hdfs_client = None
            _ON_LINUX_CLUSTER_WITH_HDFS = False

    except Exception:
        # assumed fallback (the snippet is truncated here): the client could
        # not be constructed at all
        hdfs_client = None
        _ON_LINUX_CLUSTER_WITH_HDFS = False
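# Usage sketch (not part of the original source): downstream code would
# typically gate HDFS access on the flag computed above, falling back to
# local storage when the cluster client is unavailable. The path is a
# placeholder, and the flag is assumed to have been set by the block above.
if _ON_LINUX_CLUSTER_WITH_HDFS:
    print(hdfs_client.ls('/tmp'))
else:
    print('HDFS unavailable; using local filesystem instead')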