def check_driver(cls):
    if not pa.have_libhdfs():
        message = 'No libhdfs available on system'
        if os.environ.get('PYARROW_HDFS_TEST_LIBHDFS_REQUIRE'):
            pytest.fail(message)
        else:
            pytest.skip(message)
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')
    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64 * 1024
    default_block_size = 128 * 1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host, port, user='libhdfs',
                             replication=replication, buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'libhdfs', replication, buffer_size, default_block_size))
    hdfs3 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication, buffer_size, default_block_size))
    hdfs4 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication + 1, buffer_size, default_block_size))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    hdfs7 = HadoopFileSystem(host, port, user='******')

    # equality reflects all connection options, not just host/port
    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5

    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    assert pickle.loads(pickle.dumps(hdfs1)) == hdfs1

    host, port, user = hdfs_connection
    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))

    hdfs = HadoopFileSystem.from_uri(
        "hdfs://{}:{}/?user={}".format(host, port, user))
    assert hdfs.get_file_info(FileSelector('/'))
def hdfs(request, hdfs_connection):
    request.config.pyarrow.requires('hdfs')
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    from pyarrow.fs import HadoopFileSystem

    host, port, user = hdfs_connection
    fs = HadoopFileSystem(host, port=port, user=user)
    return dict(
        fs=fs,
        pathfn=lambda p: p,
        allow_move_dir=True,
        allow_append_to_file=True,
    )
def hdfs(request, hdfs_server):
    request.config.pyarrow.requires('hdfs')
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    from pyarrow.fs import HdfsOptions, HadoopFileSystem

    host, port, user = hdfs_server
    options = HdfsOptions(endpoint=(host, port), user=user)
    fs = HadoopFileSystem(options)
    return dict(
        fs=fs,
        pathfn=lambda p: p,
        allow_copy_file=False,
        allow_move_dir=True,
        allow_append_to_file=True,
    )
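# Minimal sketch (not part of the original suite) of how a test might consume
# the fixture dicts returned above. The test name and the '/tmp/...' path are
# illustrative assumptions; only documented pyarrow.fs FileSystem calls
# (create_dir, get_file_info, delete_dir) are used.
def test_fixture_smoke(hdfs):
    from pyarrow.fs import FileSelector

    fs = hdfs['fs']
    path = hdfs['pathfn']('/tmp/pyarrow-fixture-smoke')

    fs.create_dir(path)  # recursive mkdir on HDFS
    infos = fs.get_file_info(FileSelector('/tmp'))
    assert any(info.path == path for info in infos)
    fs.delete_dir(path)  # clean up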
def test_hdfs_options(hdfs_server):
    from pyarrow.fs import HdfsOptions, HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    options = HdfsOptions()
    assert options.endpoint == ('', 0)
    options.endpoint = ('localhost', 8080)
    assert options.endpoint == ('localhost', 8080)
    with pytest.raises(TypeError):
        options.endpoint = 'localhost:8000'

    assert options.driver == 'libhdfs'
    options.driver = 'libhdfs3'
    assert options.driver == 'libhdfs3'
    with pytest.raises(ValueError):
        options.driver = 'unknown'

    assert options.replication == 3
    options.replication = 2
    assert options.replication == 2

    assert options.user == ''
    options.user = 'libhdfs'
    assert options.user == 'libhdfs'

    assert options.default_block_size == 0
    options.default_block_size = 128 * 1024**2
    assert options.default_block_size == 128 * 1024**2

    assert options.buffer_size == 0
    options.buffer_size = 64 * 1024
    assert options.buffer_size == 64 * 1024

    options = HdfsOptions.from_uri('hdfs://localhost:8080/?user=test')
    assert options.endpoint == ('hdfs://localhost', 8080)
    assert options.user == 'test'

    host, port, user = hdfs_server
    uri = "hdfs://{}:{}/?user={}".format(host, port, user)
    fs = HadoopFileSystem(uri)
    assert fs.get_target_stats(FileSelector('/'))
def check_driver(cls):
    if not pa.have_libhdfs():
        pytest.skip('No libhdfs available on system')
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')
    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64 * 1024
    default_block_size = 128 * 1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host, port, user='libhdfs',
                             replication=replication, buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'libhdfs', replication, buffer_size, default_block_size))
    hdfs3 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication, buffer_size, default_block_size))
    hdfs4 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication + 1, buffer_size, default_block_size))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    hdfs7 = HadoopFileSystem(host, port, user='******')
    hdfs8 = HadoopFileSystem(host, port, user='******',
                             kerb_ticket="cache_path")
    hdfs9 = HadoopFileSystem(host, port, user='******',
                             kerb_ticket=pathlib.Path("cache_path"))
    hdfs10 = HadoopFileSystem(host, port, user='******',
                              kerb_ticket="cache_path2")
    hdfs11 = HadoopFileSystem(host, port, user='******',
                              kerb_ticket="cache_path",
                              extra_conf={'hdfs_token': 'abcd'})

    # equality reflects all connection options: user, replication,
    # kerb_ticket (string or pathlib.Path) and extra_conf
    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5
    assert hdfs7 != hdfs8
    assert hdfs8 == hdfs9
    assert hdfs10 != hdfs9
    assert hdfs11 != hdfs8

    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    for fs in [hdfs1, hdfs2, hdfs3, hdfs4, hdfs5, hdfs6, hdfs7, hdfs8,
               hdfs9, hdfs10, hdfs11]:
        assert pickle.loads(pickle.dumps(fs)) == fs

    host, port, user = hdfs_connection
    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))

    hdfs = HadoopFileSystem.from_uri(
        "hdfs://{}:{}/?user={}".format(host, port, user))
    assert hdfs.get_file_info(FileSelector('/'))
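# Minimal sketch (assumption: a namenode reachable through `hdfs_connection`
# and write access under '/tmp'). It exercises one of the filesystems built
# above with a small write/read roundtrip using documented pyarrow.fs stream
# APIs; the path below is an illustrative placeholder, not from the original.
def _hdfs_roundtrip_example(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem

    host, port, user = hdfs_connection
    fs = HadoopFileSystem(host, port, user=user)

    base_dir = '/tmp/pyarrow-roundtrip'
    path = base_dir + '/data.bin'
    fs.create_dir(base_dir)
    with fs.open_output_stream(path) as out:
        out.write(b'hello hdfs')
    with fs.open_input_stream(path) as src:
        assert src.read() == b'hello hdfs'
    fs.delete_dir(base_dir)  # clean up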
def check_driver(cls):
    if not pa.have_libhdfs():
        pytest.fail('No libhdfs available on system')
class MockFS(base.TestCase):
    """Mock FileSystem"""

    def test_io_path_string(self):
        """Test string and equality"""
        self.assertEqual(Path("foo/bar"), "foo/bar")
        self.assertEqual(Path("foo", "bar"), "foo/bar")
        self.assertEqual(Path("hdfs://root", "foo/bar"), "hdfs://root/foo/bar")

    def test_io_path_parent(self):
        """Test parent method"""
        path_localfs = Path("foo", "bar")
        path_hdfs = Path("hdfs://root", "foo", "bar")
        self.assertEqual(path_localfs.parent, "foo")
        self.assertEqual(path_hdfs.parent, "hdfs://root/foo")

    def test_io_path_local(self):
        """Test path write -> read on local file system"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = os.path.join(tmpdirname, "test.txt")
            # write a temporary file to local FS
            with Path(path).open("w") as file:
                file.write("test_local")
            # check for contents on local FS
            with Path(path).open() as file:
                self.assertEqual(file.read(), "test_local")

    @skipUnless(pyarrow.have_libhdfs(), "Test Skipped! No LibHDFS found")
    def test_io_path_hdfs(self):
        """Test path write / read on hdfs"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = os.path.join(tmpdirname, "test.txt")
            # write a temporary file to hdfs
            with Path("hdfs://" + path).open("w") as file:
                file.write("test_hdfs")
            # check for contents on hdfs
            with Path("hdfs://" + path).open() as file:
                self.assertEqual(file.read(), "test_hdfs")

    @skipUnless(pyarrow.have_libhdfs(), "Test Skipped! No LibHDFS found")
    def test_io_copy_file_local_to_hdfs(self):
        """Test path to copy file to hdfs"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = Path(tmpdirname, "test.txt")
            # create a local file in a temp folder
            with path.open("w") as file:
                file.write("test_file_local_to_hdfs")
            # copy local file to hdfs
            path.copy_file("hdfs://" + tmpdirname)
            # check for contents on hdfs
            with Path("hdfs://", path).open() as file:
                self.assertEqual(file.read(), "test_file_local_to_hdfs")

    @skipUnless(pyarrow.have_libhdfs(), "Test Skipped! No LibHDFS found")
    def test_io_copy_file_hdfs_to_local(self):
        """Test path to copy file from hdfs"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            local_path = Path(tmpdirname, "test.txt")
            hdfs_path = Path("hdfs://" + str(local_path))
            # create a hdfs temp folder with a file
            with hdfs_path.open("w") as file:
                file.write("test_file_hdfs_to_local")
            # copy hdfs file to local FS
            hdfs_path.copy_file(local_path.parent.path)
            # check for contents on local FS
            with local_path.open() as file:
                self.assertEqual(file.read(), "test_file_hdfs_to_local")

    @skipUnless(pyarrow.have_libhdfs(), "Test Skipped! No LibHDFS found")
    def test_io_copy_dir_local_to_hdfs(self):
        """Test path to copy directory contents to hdfs"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = Path(tmpdirname, "test.txt")
            # create a local temp folder with a file
            with path.open("w") as file:
                file.write("test_dir_local_to_hdfs")
            # copy local dir to hdfs
            path.parent.copy_dir("hdfs://" + path.parent.path)
            # check for contents on hdfs
            with Path("hdfs://", path).open() as file:
                self.assertEqual(file.read(), "test_dir_local_to_hdfs")

    # @TODO Akbar to find a solution for hadoop glob patterns
    # def test_io_copy_dir_hdfs_to_local(self):
    #     """Test path to copy directory contents from hdfs"""
    #     with tempfile.TemporaryDirectory() as tmpdirname:
    #         local_path = Path(tmpdirname, "test.txt")
    #         hdfs_path = Path("hdfs://" + str(local_path))
    #         # create a hdfs temp folder with a file
    #         with hdfs_path.open("w") as file:
    #             file.write("test_dir_hdfs_to_local")
    #         # copy hdfs dir to local
    #         hdfs_path.parent.copy_dir(local_path.parent.path)
    #         # check for contents on local
    #         with local_path.open() as file:
    #             self.assertEqual(file.read(), "test_dir_hdfs_to_local")

    def test_io_path_local_delete_file(self):
        """Test file delete on local file system"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = os.path.join(tmpdirname, "test.txt")
            # write a temporary file to local FS
            with Path(path).open("w") as file:
                file.write("test_local")
            # check the file exists
            self.assertTrue(Path(path).exists())
            # delete it and check it is gone
            Path(path).delete()
            self.assertFalse(Path(path).exists())

    def test_io_path_local_delete_dir(self):
        """Test directory delete on local file system"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            model_id = "12345"
            path = os.path.join(tmpdirname, model_id)
            Path(path).mkdir()
            # write a temporary file to local FS
            with Path(path + "/test.txt").open("w") as file:
                file.write("test_local")
            # check the parent directory exists
            self.assertTrue(Path(path).parent.exists())
            # delete the directory and check it is gone
            Path(path).delete_dir()
            self.assertFalse(Path(path).exists())