Example #1
import os
import posixpath
import socket

import pytest

# HDFS_URI, NUM_ATR_FS, ROOT_PREFIX and should_skip are module-level names
# defined elsewhere in the source test suite.


@pytest.fixture  # fixture decorator implied by the yield/teardown structure
def hdfs_fs():
    if should_skip(HDFS_URI):
        yield [None] * NUM_ATR_FS
        return

    from pyarrow.fs import HadoopFileSystem

    monkeypatch = pytest.MonkeyPatch()
    hdfs_host = os.environ.get("HDFS_HOST")
    hdfs_port = int(os.environ.get("HDFS_PORT", 9000))

    if hdfs_host is None:
        hdfs_host = socket.gethostbyname(socket.gethostname())

    hdfs = HadoopFileSystem(hdfs_host, hdfs_port)

    def path_to(*args):
        return (
            f"{HDFS_URI}://{hdfs_host}:{hdfs_port}/{posixpath.join(ROOT_PREFIX, *args)}"
        )

    def read(path):
        f = hdfs.open_input_stream(path)
        return f.readall()

    def write(path, body):
        with hdfs.open_output_stream(path) as f:
            f.write(body)

    def mkdirs(path):
        hdfs.create_dir(path, recursive=True)

    yield path_to, read, write, mkdirs, posixpath.join, None
    monkeypatch.undo()
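
A minimal sketch of how a test might consume the helpers this fixture yields (the test name, paths and payload are hypothetical; the unpacking order follows the yield statement above, and the skip branch yields placeholder Nones):

import pytest


def test_hdfs_roundtrip(hdfs_fs):
    path_to, read, write, mkdirs, join, _ = hdfs_fs
    if path_to is None:
        pytest.skip("HDFS not configured")

    mkdirs("/tmp/roundtrip")
    write("/tmp/roundtrip/data.txt", b"hello hdfs")
    assert read("/tmp/roundtrip/data.txt") == b"hello hdfs"
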
Example #2
    def __init__(
        self,
        host="default",
        port=0,
        user=None,
        kerb_ticket=None,
        extra_conf=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        host: str
            Hostname, IP or "default" to try to read from Hadoop config
        port: int
            Port to connect on, or default from Hadoop config if 0
        user: str or None
            If given, connect as this username
        kerb_ticket: str or None
            If given, use this ticket for authentication
        extra_conf: None or dict
            Passed on to HadoopFileSystem
        """
        from pyarrow.fs import HadoopFileSystem

        fs = HadoopFileSystem(
            host=host,
            port=port,
            user=user,
            kerb_ticket=kerb_ticket,
            extra_conf=extra_conf,
        )
        super().__init__(fs=fs, **kwargs)
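
The defaults in this constructor are forwarded straight to pyarrow: host="default" with port=0 makes libhdfs resolve the namenode from the Hadoop configuration (fs.defaultFS in core-site.xml, located via HADOOP_CONF_DIR). A minimal sketch of the equivalent direct calls (the explicit host, port and user are hypothetical):

from pyarrow.fs import HadoopFileSystem

# defer to the Hadoop configuration for the namenode address
fs_default = HadoopFileSystem(host="default", port=0)

# or connect explicitly to a known namenode
fs_explicit = HadoopFileSystem(host="namenode.example.com", port=8020, user="hadoop")
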
Example #3
def test_hdfs_options(hdfs_server):
    from pyarrow.fs import HdfsOptions, HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    options = HdfsOptions()
    assert options.endpoint == ('', 0)
    options.endpoint = ('localhost', 8080)
    assert options.endpoint == ('localhost', 8080)
    with pytest.raises(TypeError):
        options.endpoint = 'localhost:8000'

    assert options.driver == 'libhdfs'
    options.driver = 'libhdfs3'
    assert options.driver == 'libhdfs3'
    with pytest.raises(ValueError):
        options.driver = 'unknown'

    assert options.replication == 3
    options.replication = 2
    assert options.replication == 2

    assert options.user == ''
    options.user = 'libhdfs'
    assert options.user == 'libhdfs'

    assert options.default_block_size == 0
    options.default_block_size = 128*1024**2
    assert options.default_block_size == 128*1024**2

    assert options.buffer_size == 0
    options.buffer_size = 64*1024
    assert options.buffer_size == 64*1024

    options = HdfsOptions.from_uri('hdfs://localhost:8080/?user=test')
    assert options.endpoint == ('hdfs://localhost', 8080)
    assert options.user == 'test'

    host, port, user = hdfs_server
    uri = "hdfs://{}:{}/?user={}".format(host, port, user)
    fs = HadoopFileSystem(uri)
    assert fs.get_target_stats(FileSelector('/'))
Example #4
def hdfs(request, hdfs_connection):
    request.config.pyarrow.requires('hdfs')
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    from pyarrow.fs import HadoopFileSystem

    host, port, user = hdfs_connection
    fs = HadoopFileSystem(host, port=port, user=user)

    return dict(
        fs=fs,
        pathfn=lambda p: p,
        allow_move_dir=True,
        allow_append_to_file=True,
    )
Example #5
def hdfs(request, hdfs_server):
    request.config.pyarrow.requires('hdfs')
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    from pyarrow.fs import HdfsOptions, HadoopFileSystem

    host, port, user = hdfs_server
    options = HdfsOptions(endpoint=(host, port), user=user)

    fs = HadoopFileSystem(options)

    return dict(
        fs=fs,
        pathfn=lambda p: p,
        allow_copy_file=False,
        allow_move_dir=True,
        allow_append_to_file=True,
    )
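
Both fixtures return a small capability dictionary rather than a bare filesystem, so shared tests can adapt to what each backend supports (Example #5, for instance, disables file copying). A hypothetical consumer, assuming the pyarrow.fs API, might look like this:

import pytest

from pyarrow.fs import FileType


def test_move_dir(hdfs):
    if not hdfs['allow_move_dir']:
        pytest.skip("backend cannot move directories")

    fs, pathfn = hdfs['fs'], hdfs['pathfn']
    src, dst = pathfn('/tmp/move-src'), pathfn('/tmp/move-dst')
    fs.create_dir(src, recursive=True)
    fs.move(src, dst)
    assert fs.get_file_info(dst).type == FileType.Directory
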
Example #6
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64 * 1024
    default_block_size = 128 * 1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host,
                             port,
                             user='libhdfs',
                             replication=replication,
                             buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'libhdfs', replication, buffer_size,
                   default_block_size))
    hdfs3 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication, buffer_size,
                   default_block_size))
    hdfs4 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication + 1, buffer_size,
                   default_block_size))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    # the remaining connections just need a consistent, non-default username
    hdfs7 = HadoopFileSystem(host, port, user='localuser')
    hdfs8 = HadoopFileSystem(host,
                             port,
                             user='localuser',
                             kerb_ticket="cache_path")
    hdfs9 = HadoopFileSystem(host,
                             port,
                             user='localuser',
                             kerb_ticket=pathlib.Path("cache_path"))
    hdfs10 = HadoopFileSystem(host,
                              port,
                              user='localuser',
                              kerb_ticket="cache_path2")
    hdfs11 = HadoopFileSystem(host,
                              port,
                              user='localuser',
                              kerb_ticket="cache_path",
                              extra_conf={'hdfs_token': 'abcd'})

    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs8
    assert hdfs8 == hdfs9
    assert hdfs10 != hdfs9
    assert hdfs11 != hdfs8

    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    for fs in [
            hdfs1, hdfs2, hdfs3, hdfs4, hdfs5, hdfs6, hdfs7, hdfs8, hdfs9,
            hdfs10, hdfs11
    ]:
        assert pickle.loads(pickle.dumps(fs)) == fs

    host, port, user = hdfs_connection

    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))

    hdfs = HadoopFileSystem.from_uri("hdfs://{}:{}/?user={}".format(
        host, port, user))
    assert hdfs.get_file_info(FileSelector('/'))
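
Beyond construction and equality, the filesystem these tests build supports the generic pyarrow.fs I/O API; a small round trip might look like this (the namenode address and paths are hypothetical):

from pyarrow.fs import HadoopFileSystem

fs = HadoopFileSystem("namenode.example.com", 8020, user="hadoop")

fs.create_dir("/tmp/demo", recursive=True)
with fs.open_output_stream("/tmp/demo/hello.txt") as f:
    f.write(b"hello from pyarrow")
with fs.open_input_stream("/tmp/demo/hello.txt") as f:
    assert f.readall() == b"hello from pyarrow"
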
Example #7
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64*1024
    default_block_size = 128*1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host, port, user='libhdfs',
                             replication=replication, buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'libhdfs', replication, buffer_size, default_block_size
    ))
    hdfs3 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication, buffer_size, default_block_size
    ))
    hdfs4 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication + 1, buffer_size, default_block_size
    ))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    hdfs7 = HadoopFileSystem(host, port, user='localuser')  # any non-default user

    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    assert pickle.loads(pickle.dumps(hdfs1)) == hdfs1

    host, port, user = hdfs_connection

    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))

    hdfs = HadoopFileSystem.from_uri(
        "hdfs://{}:{}/?user={}".format(host, port, user)
    )
    assert hdfs.get_file_info(FileSelector('/'))
Example #8
    def _get_client(self):
        # lazily create and cache the HDFS client on first use; note that
        # pyarrow.fs.HadoopFileSystem requires a host (see Example #6), so a
        # no-argument call like this presumes a client with its own defaults
        if self._client is None:
            self._client = HadoopFileSystem()
        return self._client
Example #9
    def _get_client(self):
        # lazily create and cache the client; host="default" lets libhdfs
        # read fs.defaultFS from the Hadoop configuration (core-site.xml)
        if self._client is None:
            self._client = HadoopFileSystem(host="default")
        return self._client
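
A self-contained sketch of the lazy-initialization pattern behind these last two snippets (the class name, read helper and paths are hypothetical; assumes pyarrow.fs as in Example #9):

from pyarrow.fs import HadoopFileSystem


class HdfsStore:
    """Creates the HDFS client only when it is first needed."""

    def __init__(self):
        self._client = None

    def _get_client(self):
        if self._client is None:
            # "default" resolves the namenode from the Hadoop configuration
            self._client = HadoopFileSystem(host="default")
        return self._client

    def read(self, path):
        with self._get_client().open_input_stream(path) as f:
            return f.readall()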