Ejemplo n.º 1
0
    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        driver="libhdfs",
        extra_conf=None,
        **kwargs,
    ):
        """Connect to an HDFS cluster and wrap the resulting pyarrow client.

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs if the JNI library and default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        AbstractFileSystem.__init__(self, **kwargs)
        # Keep the constructor arguments around so the instance can be
        # re-created (e.g. by __reduce_ex__ when pickling).
        self.pars = (host, port, user, kerb_ticket, driver, extra_conf)
        # The wrapped pyarrow filesystem that actually talks to HDFS.
        self.pahdfs = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            driver=driver,
            extra_conf=extra_conf,
        )
Ejemplo n.º 2
0
class PyArrowHDFS(AbstractFileSystem):
    """Adapted version of Arrow's HadoopFileSystem

    This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which
    passes on all calls to the underlying class.
    """

    # Methods implemented on this class (or reached through it, such as
    # ``open`` which calls ``_open``); these must never be forwarded to the
    # wrapped pyarrow filesystem.  Hoisted to a class-level frozenset so the
    # membership test in __getattribute__ is O(1) per attribute access.
    _local_names = frozenset([
        "_open",
        "__init__",
        "__getattribute__",
        "__reduce_ex__",
        "open",
        "ls",
        "makedirs",
    ])

    # Attributes and methods delegated verbatim to the wrapped
    # HadoopFileSystem instance.  NOTE: the original inline list contained
    # several duplicated entries (get_capacity, get_space_used, df, chmod,
    # chown, disk_usage, download, upload); deduplicated here.
    _forwarded_names = frozenset([
        "chmod",
        "chown",
        "user",
        "df",
        "disk_usage",
        "download",
        "driver",
        "exists",
        "extra_conf",
        "get_capacity",
        "get_space_used",
        "host",
        "is_open",
        "kerb_ticket",
        "strip_protocol",
        "mkdir",
        "mv",
        "port",
        "upload",
        "_get_kwargs_from_urls",
        "read_parquet",
        "rm",
        "stat",
    ])

    def __init__(self,
                 host="default",
                 port=0,
                 user=None,
                 kerb_ticket=None,
                 driver="libhdfs",
                 extra_conf=None,
                 **kwargs):
        """

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs if the JNI library and default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        # Instances are cached by the superclass; skip re-initialisation when
        # an already-configured cached instance is handed back.
        if self._cached:
            return
        AbstractFileSystem.__init__(self, **kwargs)
        # Saved so __reduce_ex__ can rebuild an equivalent instance.
        self.pars = (host, port, user, kerb_ticket, driver, extra_conf)
        self.pahdfs = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            driver=driver,
            extra_conf=extra_conf,
        )

    def _open(self,
              path,
              mode="rb",
              block_size=None,
              autocommit=True,
              cache_options=None,
              **kwargs):
        """

        Parameters
        ----------
        path: str
            Location of file; should start with '/'
        mode: str
        block_size: int
            Hadoop block size, e.g., 2**26
        autocommit: True
            Transactions are not yet implemented for HDFS; errors if not True
        kwargs: dict or None
            Hadoop config parameters

        Returns
        -------
        HDFSFile file-like instance
        """

        return HDFSFile(self,
                        path,
                        mode,
                        block_size=block_size,
                        autocommit=autocommit,
                        cache_options=cache_options,
                        **kwargs)

    def __reduce_ex__(self, protocol):
        # Pickle by re-invoking the constructor with the saved parameters.
        return PyArrowHDFS, self.pars

    def ls(self, path, detail=True):
        """List contents of path, normalising pyarrow entries to fsspec style."""
        out = self.pahdfs.ls(path, detail)
        if detail:
            for p in out:
                # pyarrow reports the entry type under "kind"
                p["type"] = p["kind"]
                p["name"] = self._strip_protocol(p["name"])
        else:
            out = [self._strip_protocol(p) for p in out]
        return out

    @staticmethod
    def _get_kwargs_from_urls(paths):
        """Translate a URL's host/username/port into constructor kwargs."""
        ops = infer_storage_options(paths)
        out = {}
        if ops.get("host", None):
            out["host"] = ops["host"]
        if ops.get("username", None):
            out["user"] = ops["username"]
        if ops.get("port", None):
            out["port"] = ops["port"]
        return out

    @classmethod
    def _strip_protocol(cls, path):
        # Reduce "hdfs://host:port/path" (or similar) to the bare path.
        ops = infer_storage_options(path)
        return ops["path"]

    def __getattribute__(self, item):
        if item in PyArrowHDFS._local_names:
            # all the methods defined in this class. Note `open` here, since
            # it calls `_open`, but is actually in superclass
            return lambda *args, **kw: getattr(PyArrowHDFS, item)(
                self, *args, **kw)
        if item == "__class__":
            return PyArrowHDFS
        d = object.__getattribute__(self, "__dict__")
        pahdfs = d.get("pahdfs", None)  # fs is not immediately defined
        if pahdfs is not None and item in PyArrowHDFS._forwarded_names:
            # Delegate to the wrapped pyarrow filesystem once it exists.
            return getattr(pahdfs, item)
        # attributes of the superclass, while target is being set up
        return super().__getattribute__(item)
Ejemplo n.º 3
0
class PyArrowHDFS(AbstractFileSystem):
    """Adapted version of Arrow's HadoopFileSystem

    This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which
    passes on all calls to the underlying class.
    """

    # Methods implemented on this class (or reached through it, such as
    # ``open`` which calls ``_open``); never forwarded to pyarrow.
    _local_names = frozenset([
        '_open', '__init__', '__getattribute__', '__reduce_ex__',
        'open', 'ls', 'makedirs',
    ])

    # Attributes/methods delegated verbatim to the wrapped HadoopFileSystem.
    # NOTE: the original inline list contained several duplicated entries
    # (get_capacity, get_space_used, df, chmod, chown, disk_usage, download,
    # upload); deduplicated here, and hoisted so membership is O(1).
    _forwarded_names = frozenset([
        'chmod', 'chown', 'user', 'df', 'disk_usage', 'download', 'driver',
        'exists', 'extra_conf', 'get_capacity', 'get_space_used', 'host',
        'is_open', 'kerb_ticket', 'strip_protocol', 'mkdir', 'mv', 'port',
        'upload', '_get_kwargs_from_urls', 'read_parquet', 'rm', 'stat',
    ])

    def __init__(self,
                 host="default",
                 port=0,
                 user=None,
                 kerb_ticket=None,
                 driver='libhdfs',
                 extra_conf=None,
                 **kwargs):
        """

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs if the JNI library and default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        AbstractFileSystem.__init__(self, **kwargs)
        # Saved so __reduce_ex__ can rebuild an equivalent instance.
        self.pars = (host, port, user, kerb_ticket, driver, extra_conf)
        self.pahdfs = HadoopFileSystem(host=host,
                                       port=port,
                                       user=user,
                                       kerb_ticket=kerb_ticket,
                                       driver=driver,
                                       extra_conf=extra_conf)

    def _open(self,
              path,
              mode='rb',
              block_size=None,
              autocommit=True,
              **kwargs):
        """

        Parameters
        ----------
        path: str
            Location of file; should start with '/'
        mode: str
        block_size: int
            Hadoop block size, e.g., 2**26
        autocommit: True
            Transactions are not yet implemented for HDFS; errors if not True
        kwargs: dict or None
            Hadoop config parameters

        Returns
        -------
        HDFSFile file-like instance
        """
        if not autocommit:
            # Transactional writes are not implemented for HDFS.
            raise NotImplementedError
        return HDFSFile(self, path, mode, block_size, **kwargs)

    def __reduce_ex__(self, protocol):
        # Pickle by re-invoking the constructor with the saved parameters.
        return PyArrowHDFS, self.pars

    def ls(self, path, detail=True):
        """List contents of path; pyarrow reports the entry type as 'kind'."""
        out = self.pahdfs.ls(path, detail)
        if detail:
            for p in out:
                p['type'] = p['kind']
        return out

    @staticmethod
    def _get_kwargs_from_urls(paths):
        """Translate a URL's host/username/port into constructor kwargs."""
        ops = infer_storage_options(paths)
        out = {}
        if ops.get('host', None):
            out['host'] = ops['host']
        if ops.get('username', None):
            out['user'] = ops['username']
        if ops.get('port', None):
            out['port'] = ops['port']
        return out

    @classmethod
    def _strip_protocol(cls, path):
        # Reduce "hdfs://host:port/path" (or similar) to the bare path.
        ops = infer_storage_options(path)
        return ops['path']

    def __getattribute__(self, item):
        if item in PyArrowHDFS._local_names:
            # all the methods defined in this class. Note `open` here, since
            # it calls `_open`, but is actually in superclass
            return lambda *args, **kw: getattr(PyArrowHDFS, item)(
                self, *args, **kw)
        if item == '__class__':
            return PyArrowHDFS
        d = object.__getattribute__(self, '__dict__')
        pahdfs = d.get('pahdfs', None)  # fs is not immediately defined
        if pahdfs is not None and item in PyArrowHDFS._forwarded_names:
            # Delegate to the wrapped pyarrow filesystem once it exists.
            return getattr(pahdfs, item)
        # attributes of the superclass, while target is being set up
        return super().__getattribute__(item)
Ejemplo n.º 4
0
class PyArrowHDFS(AbstractFileSystem):
    """Adapted version of Arrow's HadoopFileSystem

    This is a very simple wrapper over the pyarrow.hdfs.HadoopFileSystem, which
    passes on all calls to the underlying class."""

    protocol = "hdfs", "file"

    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        driver="libhdfs",
        extra_conf=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs if the JNI library and default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        super().__init__(**kwargs)

        self.client = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            driver=driver,
            extra_conf=extra_conf,
        )
        # Close the connection when this wrapper is garbage-collected.
        # BUGFIX: the finalizer callback must not capture ``self`` — the
        # original ``lambda: self.client.close()`` closed over ``self``,
        # keeping the instance alive so the finalizer could only ever run at
        # interpreter shutdown.  A bound method of the client keeps only the
        # client alive, which is exactly what we need.
        weakref.finalize(self, self.client.close)

        # Saved so __reduce_ex__ can rebuild an equivalent instance.
        self.pars = (host, port, user, kerb_ticket, driver, extra_conf)

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Translate a URL's host/username/port into constructor kwargs."""
        ops = infer_storage_options(path)
        out = {}
        if ops.get("host", None):
            out["host"] = ops["host"]
        if ops.get("username", None):
            out["user"] = ops["username"]
        if ops.get("port", None):
            out["port"] = ops["port"]
        return out

    @classmethod
    def _strip_protocol(cls, path):
        ops = infer_storage_options(path)
        path = ops["path"]
        # infer_storage_options leaves file:/ prefixes alone
        # for local hdfs instances
        if path.startswith("file:"):
            path = path[5:]
        return path

    def __reduce_ex__(self, protocol):
        # Pickle by re-invoking the constructor with the saved parameters.
        return PyArrowHDFS, self.pars

    def close(self):
        """Explicitly close the underlying HDFS connection."""
        self.client.close()

    @wrap_exceptions
    def ls(self, path, detail=True):
        """List path contents, normalised to fsspec-style entries."""
        listing = [
            self._adjust_entry(entry)
            for entry in self.client.ls(path, detail=True)
        ]

        if detail:
            return listing
        else:
            return [entry["name"] for entry in listing]

    @wrap_exceptions
    def info(self, path):
        """Return a normalised info dict for a single path."""
        return self._adjust_entry(self.client.info(path))

    def _adjust_entry(self, original_entry):
        """Map pyarrow's kind/path keys to fsspec's type/name and strip protocol."""
        entry = original_entry.copy()
        if "type" not in entry:
            if "kind" in entry:
                entry["type"] = entry["kind"]
        if "name" not in entry:
            if "path" in entry:
                entry["name"] = entry["path"]

        if "name" in entry:
            entry["name"] = self._strip_protocol(entry["name"])
        return entry

    @wrap_exceptions
    def cp_file(self, lpath, rpath, **kwargs):
        """Copy lpath to rpath within the filesystem; file copies are atomic."""
        if self.isdir(lpath):
            self.makedirs(rpath)
            return

        with self.open(lpath) as lstream:
            tmp_fname = "/".join(
                [self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
            # Perform an atomic copy (stream to a temporary file and
            # move it to the actual destination).
            try:
                with self.open(tmp_fname, "wb") as rstream:
                    shutil.copyfileobj(lstream, rstream)
                self.client.mv(tmp_fname, rpath)
            except BaseException:  # noqa
                # Best-effort cleanup of the partial temp file, then re-raise.
                with suppress(FileNotFoundError):
                    self.client.rm(tmp_fname)
                raise

    @wrap_exceptions
    def rm_file(self, path):
        return self.client.rm(path)

    @wrap_exceptions
    def makedirs(self, path, exist_ok=False):
        if not exist_ok and self.exists(path):
            raise FileExistsError(path)

        return self.client.mkdir(path, create_parents=True)

    @wrap_exceptions
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        path: str
            Location of file; should start with '/'
        mode: str
        block_size: int
            Hadoop block size, e.g., 2**26
        autocommit: True
            Transactions are not yet implemented for HDFS; errors if not True
        kwargs: dict or None
            Hadoop config parameters

        Returns
        -------
        HDFSFile file-like instance
        """

        return HDFSFile(
            self,
            path,
            mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            **kwargs,
        )
Ejemplo n.º 5
0
    return 'hdfs'


_HADOOP_CONF_DIR_ENV_VAR_NAME: str = 'HADOOP_CONF_DIR'

# check if running on Linux cluster or local Mac
_ON_LINUX_CLUSTER: bool = sys.platform.startswith('linux')

# detect & set up HDFS client
if _HADOOP_HOME:
    os.environ['ARROW_LIBHDFS_DIR'] = \
        str(Path(_HADOOP_HOME).resolve(strict=True) / 'lib' / 'native')

    try:
        HDFS_CLIENT = HadoopFileSystem()

        try:
            _LOGGER.debug(msg=(msg := 'Testing HDFS...'))

            if HDFS_CLIENT.isdir(path='/'):
                _ON_LINUX_CLUSTER_WITH_HDFS: bool = True
                _LOGGER.debug(msg=f'{msg} done!')

            else:
                _ON_LINUX_CLUSTER_WITH_HDFS: bool = False
                _LOGGER.debug(msg=f'{msg} UNAVAILABLE')

        except Exception:  # pylint: disable=broad-except
            HDFS_CLIENT = None
            _ON_LINUX_CLUSTER_WITH_HDFS: bool = False
Ejemplo n.º 6
0
class PyArrowHDFS(AbstractFileSystem):
    """Adapted version of Arrow's HadoopFileSystem

    This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which
    passes on all calls to the underlying class.
    """
    def __init__(self,
                 host="default",
                 port=0,
                 user=None,
                 kerb_ticket=None,
                 driver='libhdfs',
                 extra_conf=None):
        """

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        driver: 'libhdfs' or 'libhdfs3'
            Binary driver; libhdfs if the JNI library and default
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        # The wrapped pyarrow filesystem (note: the attribute name shadows
        # the `driver` string parameter and the forwarded 'driver' property).
        self.driver = HadoopFileSystem(host=host,
                                       port=port,
                                       user=user,
                                       kerb_ticket=kerb_ticket,
                                       driver=driver,
                                       extra_conf=extra_conf)

    def _open(self,
              path,
              mode='rb',
              block_size=None,
              autocommit=True,
              **kwargs):
        """

        Parameters
        ----------
        path: str
            Location of file; should start with '/'
        mode: str
        block_size: int
            Hadoop block size, e.g., 2**26
        autocommit: True
            Transactions are not yet implemented for HDFS; errors if not True
        kwargs: dict or None
            Hadoop config parameters

        Returns
        -------
        arrow HdfsFile file-like instance
        """
        if not autocommit:
            # Transactional writes are not implemented for HDFS.
            raise NotImplementedError
        return self.driver.open(path, mode, block_size, **kwargs)

    def __getattr__(self, item):
        # Only invoked when normal attribute lookup fails: forward the known
        # pyarrow API surface to the wrapped filesystem.
        if item in [
                'chmod', 'chown', 'df', 'disk_usage', 'download', 'driver',
                'exists', 'extra_conf', 'get_capacity', 'get_space_used',
                'host', 'info', 'is_open', 'isdir', 'isfile', 'kerb_ticket',
                'ls', 'mkdir', 'mv', 'port', 'read_parquet', 'rm', 'stat',
                'upload', 'user', 'walk'
        ]:
            return getattr(self.driver, item)
        # BUGFIX: the original fell off the end here, implicitly returning
        # None for every unknown attribute — which broke hasattr() and
        # normal AttributeError semantics.  Raise as __getattr__ should.
        raise AttributeError(item)
Ejemplo n.º 7
0
import pyarrow
import numpy as np
import scipy
import scipy.optimize
from pyarrow.hdfs import HadoopFileSystem

try:
    # Activate JNI load `$HADOOP_HOME/lib/native/libhdfs.so` in runtime.
    # Best-effort: failure just means HDFS is unavailable in this session.
    HadoopFileSystem('foo')
except Exception:
    # BUGFIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; narrow to Exception while keeping the best-effort
    # behaviour.
    pass

# Expected values (one per asset) for the optimisation problem below.
evs = np.array([
    0.01855396, 0.02888079, 0.01484719, 0.01187566, 0.01350127, 0.0152477,
    0.02978069, 0.01184938, 0.0152477, 0.01967369, 0.02334463, -0.00964757,
    -0.0084154, 0.0093229, 0.00074653
])

# Equality-constraint coefficient matrix (2 constraints x 15 variables).
A_eq = np.array([[
    -0.17128674, 0.17588126, -0.21854693, 0.35221215, 0.32877443, 0.35090059,
    -0.28819657, -0.17272982, 0.35090059, 0.32671732, -0.13842946, 0.23981023,
    0.1866889, 0.15406733, 0.24219247
],
                 [
                     0.27321495, -0.28669058, 0.355471, 0.24540659, 0.16261506,
                     0.24417405, -0.20448798, 0.27555701, 0.24417405,
                     0.16159759, -0.19235484, -0.38261073, -0.30371767,
                     -0.25482233, -0.16266994
                 ]])
# Right-hand side of the equality constraints.
b_eq = [0, 0]
Ejemplo n.º 8
0
_HADOOP_CONF_DIR_ENV_VAR_NAME = 'HADOOP_CONF_DIR'

# check if running on Linux cluster or local Mac
_ON_LINUX_CLUSTER = sys.platform.startswith('linux')

# detect & set up HDFS client
if _HADOOP_HOME:
    os.environ['ARROW_LIBHDFS_DIR'] = \
        os.path.join(
            _HADOOP_HOME,
            'lib',
            'native')

    try:
        hdfs_client = HadoopFileSystem()

        try:
            print('Testing HDFS... ', end='')

            if hdfs_client.isdir('/'):
                _ON_LINUX_CLUSTER_WITH_HDFS = True
                print('done!')

            else:
                _ON_LINUX_CLUSTER_WITH_HDFS = False
                print('UNAVAILABLE')

        except:
            hdfs_client = None
            _ON_LINUX_CLUSTER_WITH_HDFS = False