def hdfs_fs():
    if should_skip(HDFS_URI):
        yield [None] * NUM_ATR_FS
        return

    from pyarrow.fs import HadoopFileSystem

    monkeypatch = pytest.MonkeyPatch()
    hdfs_host = os.environ.get("HDFS_HOST")
    hdfs_port = int(os.environ.get("HDFS_PORT", 9000))
    if hdfs_host is None:
        hdfs_host = socket.gethostbyname(socket.gethostname())
    hdfs = HadoopFileSystem(hdfs_host, hdfs_port)

    def path_to(*args):
        return (
            f"{HDFS_URI}://{hdfs_host}:{hdfs_port}/{posixpath.join(ROOT_PREFIX, *args)}"
        )

    def read(path):
        f = hdfs.open_input_stream(path)
        return f.readall()

    def write(path, body):
        with hdfs.open_output_stream(path) as f:
            f.write(body)

    def mkdirs(path):
        hdfs.create_dir(path, recursive=True)

    yield path_to, read, write, mkdirs, posixpath.join, None
    monkeypatch.undo()
def __init__( self, host="default", port=0, user=None, kerb_ticket=None, extra_conf=None, **kwargs, ): """ Parameters ---------- host: str Hostname, IP or "default" to try to read from Hadoop config port: int Port to connect on, or default from Hadoop config if 0 user: str or None If given, connect as this username kerb_ticket: str or None If given, use this ticket for authentication extra_conf: None or dict Passed on to HadoopFileSystem """ from pyarrow.fs import HadoopFileSystem fs = HadoopFileSystem( host=host, port=port, user=user, kerb_ticket=kerb_ticket, extra_conf=extra_conf, ) super().__init__(fs=fs, **kwargs)
def test_hdfs_options(hdfs_server):
    from pyarrow.fs import HdfsOptions, HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    options = HdfsOptions()
    assert options.endpoint == ('', 0)
    options.endpoint = ('localhost', 8080)
    assert options.endpoint == ('localhost', 8080)
    with pytest.raises(TypeError):
        options.endpoint = 'localhost:8000'

    assert options.driver == 'libhdfs'
    options.driver = 'libhdfs3'
    assert options.driver == 'libhdfs3'
    with pytest.raises(ValueError):
        options.driver = 'unknown'

    assert options.replication == 3
    options.replication = 2
    assert options.replication == 2

    assert options.user == ''
    options.user = 'libhdfs'
    assert options.user == 'libhdfs'

    assert options.default_block_size == 0
    options.default_block_size = 128*1024**2
    assert options.default_block_size == 128*1024**2

    assert options.buffer_size == 0
    options.buffer_size = 64*1024
    assert options.buffer_size == 64*1024

    options = HdfsOptions.from_uri('hdfs://localhost:8080/?user=test')
    assert options.endpoint == ('hdfs://localhost', 8080)
    assert options.user == 'test'

    host, port, user = hdfs_server
    uri = "hdfs://{}:{}/?user={}".format(host, port, user)
    fs = HadoopFileSystem(uri)
    assert fs.get_target_stats(FileSelector('/'))
def hdfs(request, hdfs_connection):
    request.config.pyarrow.requires('hdfs')
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    from pyarrow.fs import HadoopFileSystem

    host, port, user = hdfs_connection
    fs = HadoopFileSystem(host, port=port, user=user)
    return dict(
        fs=fs,
        pathfn=lambda p: p,
        allow_move_dir=True,
        allow_append_to_file=True,
    )
def hdfs(request, hdfs_server):
    request.config.pyarrow.requires('hdfs')
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    from pyarrow.fs import HdfsOptions, HadoopFileSystem

    host, port, user = hdfs_server
    options = HdfsOptions(endpoint=(host, port), user=user)
    fs = HadoopFileSystem(options)
    return dict(
        fs=fs,
        pathfn=lambda p: p,
        allow_copy_file=False,
        allow_move_dir=True,
        allow_append_to_file=True,
    )
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64 * 1024
    default_block_size = 128 * 1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host, port, user='libhdfs',
                             replication=replication,
                             buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'libhdfs', replication, buffer_size,
                   default_block_size))
    hdfs3 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication, buffer_size,
                   default_block_size))
    hdfs4 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication + 1, buffer_size,
                   default_block_size))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    hdfs7 = HadoopFileSystem(host, port, user='******')
    hdfs8 = HadoopFileSystem(host, port, user='******',
                             kerb_ticket="cache_path")
    hdfs9 = HadoopFileSystem(host, port, user='******',
                             kerb_ticket=pathlib.Path("cache_path"))
    hdfs10 = HadoopFileSystem(host, port, user='******',
                              kerb_ticket="cache_path2")
    hdfs11 = HadoopFileSystem(host, port, user='******',
                              kerb_ticket="cache_path",
                              extra_conf={'hdfs_token': 'abcd'})

    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs8
    assert hdfs8 == hdfs9
    assert hdfs10 != hdfs9
    assert hdfs11 != hdfs8

    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    for fs in [hdfs1, hdfs2, hdfs3, hdfs4, hdfs5, hdfs6, hdfs7, hdfs8,
               hdfs9, hdfs10, hdfs11]:
        assert pickle.loads(pickle.dumps(fs)) == fs

    host, port, user = hdfs_connection
    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))

    hdfs = HadoopFileSystem.from_uri(
        "hdfs://{}:{}/?user={}".format(host, port, user))
    assert hdfs.get_file_info(FileSelector('/'))
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64*1024
    default_block_size = 128*1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host, port, user='libhdfs',
                             replication=replication,
                             buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'libhdfs', replication, buffer_size, default_block_size
    ))
    hdfs3 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication, buffer_size, default_block_size
    ))
    hdfs4 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication + 1, buffer_size, default_block_size
    ))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    hdfs7 = HadoopFileSystem(host, port, user='******')

    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4

    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    assert pickle.loads(pickle.dumps(hdfs1)) == hdfs1

    host, port, user = hdfs_connection
    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))

    hdfs = HadoopFileSystem.from_uri(
        "hdfs://{}:{}/?user={}".format(host, port, user)
    )
    assert hdfs.get_file_info(FileSelector('/'))
def _get_client(self):
    if self._client is None:
        self._client = HadoopFileSystem()
    return self._client
def _get_client(self): if self._client is None: self._client = HadoopFileSystem(host="default") return self._client