Example 1
def check_driver(cls):
    if not pa.have_libhdfs():
        message = 'No libhdfs available on system'
        if os.environ.get('PYARROW_HDFS_TEST_LIBHDFS_REQUIRE'):
            pytest.fail(message)
        else:
            pytest.skip(message)
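The PYARROW_HDFS_TEST_LIBHDFS_REQUIRE environment variable turns the usual skip into a hard failure, which is useful on CI hosts where libhdfs is expected to be installed. Below is a minimal sketch of the same gate wired up as a standalone pytest fixture; the fixture name and the autouse placement are assumptions for illustration, not part of the original suite.

import os

import pyarrow as pa
import pytest


@pytest.fixture(autouse=True)
def require_libhdfs():
    # Skip by default when libhdfs is missing; fail hard when the
    # environment variable insists it must be present.
    if not pa.have_libhdfs():
        message = 'No libhdfs available on system'
        if os.environ.get('PYARROW_HDFS_TEST_LIBHDFS_REQUIRE'):
            pytest.fail(message)
        else:
            pytest.skip(message)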
Example 2
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64 * 1024
    default_block_size = 128 * 1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host,
                             port,
                             user='libhdfs',
                             replication=replication,
                             buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'libhdfs', replication, buffer_size,
                   default_block_size))
    hdfs3 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication, buffer_size,
                   default_block_size))
    hdfs4 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication + 1, buffer_size,
                   default_block_size))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    hdfs7 = HadoopFileSystem(host, port, user='******')

    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5
    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    assert pickle.loads(pickle.dumps(hdfs1)) == hdfs1

    host, port, user = hdfs_connection

    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))

    hdfs = HadoopFileSystem.from_uri("hdfs://{}:{}/?user={}".format(
        host, port, user))
    assert hdfs.get_file_info(FileSelector('/'))
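Once constructed, a HadoopFileSystem behaves like any other pyarrow.fs filesystem, exposing the standard open_output_stream/open_input_stream methods. A short round-trip sketch, assuming a reachable cluster at the hypothetical namenode:8020 endpoint:

from pyarrow.fs import HadoopFileSystem

fs = HadoopFileSystem.from_uri('hdfs://namenode:8020/?user=hdfs')

# Write a small file and read it back through the same handle.
with fs.open_output_stream('/tmp/example.txt') as out:
    out.write(b'hello hdfs')
with fs.open_input_stream('/tmp/example.txt') as src:
    assert src.read() == b'hello hdfs'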
Example 3
def hdfs(request, hdfs_connection):
    request.config.pyarrow.requires('hdfs')
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    from pyarrow.fs import HadoopFileSystem

    host, port, user = hdfs_connection
    fs = HadoopFileSystem(host, port=port, user=user)

    return dict(
        fs=fs,
        pathfn=lambda p: p,
        allow_move_dir=True,
        allow_append_to_file=True,
    )
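The returned dict acts as a small capability descriptor: downstream tests pull the filesystem out of fs, normalize paths through pathfn, and consult the allow_* flags before exercising optional operations. A hypothetical consumer, sketched against the pyarrow.fs.FileSystem API (the test name and path are illustrative):

import pytest


def test_move_dir(hdfs):
    if not hdfs['allow_move_dir']:
        pytest.skip('filesystem does not support moving directories')
    fs = hdfs['fs']
    path = hdfs['pathfn']('move-dir-example')
    fs.create_dir(path)
    fs.move(path, path + '-moved')
    fs.delete_dir(path + '-moved')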
Example 4
def hdfs(request, hdfs_server):
    request.config.pyarrow.requires('hdfs')
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    from pyarrow.fs import HdfsOptions, HadoopFileSystem

    host, port, user = hdfs_server
    options = HdfsOptions(endpoint=(host, port), user=user)

    fs = HadoopFileSystem(options)

    return dict(
        fs=fs,
        pathfn=lambda p: p,
        allow_copy_file=False,
        allow_move_dir=True,
        allow_append_to_file=True,
    )
Example 5
def test_hdfs_options(hdfs_server):
    from pyarrow.fs import HdfsOptions, HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    options = HdfsOptions()
    assert options.endpoint == ('', 0)
    options.endpoint = ('localhost', 8080)
    assert options.endpoint == ('localhost', 8080)
    with pytest.raises(TypeError):
        options.endpoint = 'localhost:8000'

    assert options.driver == 'libhdfs'
    options.driver = 'libhdfs3'
    assert options.driver == 'libhdfs3'
    with pytest.raises(ValueError):
        options.driver = 'unknown'

    assert options.replication == 3
    options.replication = 2
    assert options.replication == 2

    assert options.user == ''
    options.user = 'libhdfs'
    assert options.user == 'libhdfs'

    assert options.default_block_size == 0
    options.default_block_size = 128*1024**2
    assert options.default_block_size == 128*1024**2

    assert options.buffer_size == 0
    options.buffer_size = 64*1024
    assert options.buffer_size == 64*1024

    options = HdfsOptions.from_uri('hdfs://localhost:8080/?user=test')
    assert options.endpoint == ('hdfs://localhost', 8080)
    assert options.user == 'test'

    host, port, user = hdfs_server
    uri = "hdfs://{}:{}/?user={}".format(host, port, user)
    fs = HadoopFileSystem(uri)
    assert fs.get_target_stats(FileSelector('/'))
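HdfsOptions and get_target_stats belong to the experimental pre-1.0 pyarrow.fs API; in current releases the options class is gone (the parameters are passed to HadoopFileSystem directly, as in Example 2) and get_target_stats became get_file_info. A rough modern equivalent of the final block, assuming the same hdfs_server fixture:

from pyarrow.fs import FileSelector, HadoopFileSystem

host, port, user = hdfs_server
fs = HadoopFileSystem.from_uri(
    "hdfs://{}:{}/?user={}".format(host, port, user))
assert fs.get_file_info(FileSelector('/'))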
Example 6
def check_driver(cls):
    if not pa.have_libhdfs():
        pytest.skip('No libhdfs available on system')
Example 7
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64 * 1024
    default_block_size = 128 * 1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host,
                             port,
                             user='libhdfs',
                             replication=replication,
                             buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'libhdfs', replication, buffer_size,
                   default_block_size))
    hdfs3 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication, buffer_size,
                   default_block_size))
    hdfs4 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication + 1, buffer_size,
                   default_block_size))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    hdfs7 = HadoopFileSystem(host, port, user='******')
    hdfs8 = HadoopFileSystem(host,
                             port,
                             user='******',
                             kerb_ticket="cache_path")
    hdfs9 = HadoopFileSystem(host,
                             port,
                             user='******',
                             kerb_ticket=pathlib.Path("cache_path"))
    hdfs10 = HadoopFileSystem(host,
                              port,
                              user='******',
                              kerb_ticket="cache_path2")
    hdfs11 = HadoopFileSystem(host,
                              port,
                              user='******',
                              kerb_ticket="cache_path",
                              extra_conf={'hdfs_token': 'abcd'})

    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5
    assert hdfs7 != hdfs8
    assert hdfs8 == hdfs9
    assert hdfs10 != hdfs9
    assert hdfs11 != hdfs8

    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    for fs in [
            hdfs1, hdfs2, hdfs3, hdfs4, hdfs5, hdfs6, hdfs7, hdfs8, hdfs9,
            hdfs10, hdfs11
    ]:
        assert pickle.loads(pickle.dumps(fs)) == fs

    host, port, user = hdfs_connection

    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))

    hdfs = HadoopFileSystem.from_uri("hdfs://{}:{}/?user={}".format(
        host, port, user))
    assert hdfs.get_file_info(FileSelector('/'))
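The pickle round-trip asserts matter because pyarrow filesystems are routinely shipped to worker processes (e.g. by Dask or multiprocessing) and must reconstruct with identical configuration. A sketch of that pattern, with hypothetical connection parameters:

import pickle
from concurrent.futures import ProcessPoolExecutor

from pyarrow.fs import FileSelector, HadoopFileSystem


def list_root(serialized_fs):
    # Rebuild the filesystem inside the worker from its pickled form.
    fs = pickle.loads(serialized_fs)
    return [info.path for info in fs.get_file_info(FileSelector('/'))]


if __name__ == '__main__':
    fs = HadoopFileSystem('namenode', 8020, user='hdfs')
    with ProcessPoolExecutor() as pool:
        print(pool.submit(list_root, pickle.dumps(fs)).result())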
Example 8
def check_driver(cls):
    if not pa.have_libhdfs():
        pytest.fail('No libhdfs available on system')
Example 9
class MockFS(base.TestCase):
    """Mock FileSystem"""
    def test_io_path_string(self):
        """Test string and equality"""
        self.assertEqual(Path("foo/bar"), "foo/bar")
        self.assertEqual(Path("foo", "bar"), "foo/bar")
        self.assertEqual(Path("hdfs://root", "foo/bar"), "hdfs://root/foo/bar")

    def test_io_path_parent(self):
        """Test parent method"""
        path_localfs = Path("foo", "bar")
        path_hdfs = Path("hdfs://root", "foo", "bar")
        self.assertEqual(path_localfs.parent, "foo")
        self.assertEqual(path_hdfs.parent, "hdfs://root/foo")

    def test_io_path_local(self):
        """Test path write -> read on local file system"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = os.path.join(tmpdirname, "test.txt")
            # write a temporary file to local FS
            with Path(path).open("w") as file:
                file.write("test_local")
            # check for contents on local FS
            with Path(path).open() as file:
                self.assertEqual(file.read(), "test_local")

    @skipUnless(pyarrow.have_libhdfs(), "Test Skipped! No LibHDFS found")
    def test_io_path_hdfs(self):
        """Test path write / read on hdfs"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = os.path.join(tmpdirname, "test.txt")
            # write a temporary file to hdfs
            with Path("hdfs://" + path).open("w") as file:
                file.write("test_hdfs")
            # check for contents on hdfs
            with Path("hdfs://" + path).open() as file:
                self.assertEqual(file.read(), "test_hdfs")

    @skipUnless(pyarrow.have_libhdfs(), "Test Skipped! No LibHDFS found")
    def test_io_copy_file_local_to_hdfs(self):
        """Test path to copy file to hdfs"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = Path(tmpdirname, "test.txt")
            # create a local file in a temp folder
            with path.open("w") as file:
                file.write("test_file_local_to_hdfs")
            # copy the local file to hdfs
            path.copy_file("hdfs://" + tmpdirname)
            # check for contents on hdfs
            with Path("hdfs://", path).open() as file:
                self.assertEqual(file.read(), "test_file_local_to_hdfs")

    @skipUnless(pyarrow.have_libhdfs(), "Test Skipped! No LibHDFS found")
    def test_io_copy_file_hdfs_to_local(self):
        """Test path to copy file to hdfs"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            local_path = Path(tmpdirname, "test.txt")
            hdfs_path = Path("hdfs://" + str(local_path))
            # create a hdfs temp folder with a file
            with hdfs_path.open("w") as file:
                file.write("test_file_hdfs_to_local")
            # copy hdfs to local
            hdfs_path.copy_file(local_path.parent.path)
            # check for contents on local FS
            with local_path.open() as file:
                self.assertEqual(file.read(), "test_file_hdfs_to_local")

    @skipUnless(pyarrow.have_libhdfs(), "Test Skipped! No LibHDFS found")
    def test_io_copy_dir_local_to_hdfs(self):
        """Test path to move content to hdfs"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = Path(tmpdirname, "test.txt")
            # create a local temp folder with a file
            with path.open("w") as file:
                file.write("test_dir_local_to_hdfs")
            # copy the local dir to hdfs (after the with block, so the
            # write has been flushed to disk)
            path.parent.copy_dir("hdfs://" + path.parent.path)
            # check for contents on hdfs
            with Path("hdfs://", path).open() as file:
                self.assertEqual(file.read(), "test_dir_local_to_hdfs")

    # @TODO Akbar to find a solution for hadoop glob patterns
    # def test_io_copy_dir_hdfs_to_local(self):
    #     """Test path to move content from hdfs"""
    #     with tempfile.TemporaryDirectory() as tmpdirname:
    #         local_path = Path(tmpdirname, "test.txt")
    #         hdfs_path = Path("hdfs://" + str(local_path))
    #         # create a hdfs temp folder with a file
    #         with hdfs_path.open("w") as file:
    #             file.write("test_dir_hdfs_to_local")
    #             # copy hdfs to local
    #             hdfs_path.parent.copy_dir(local_path.parent.path)
    #         # check for contents on local
    #         with local_path.open() as file:
    #             self.assertEqual(file.read(), "test_dir_hdfs_to_local")

    def test_io_path_local_delete_file(self):
        """Test path write -> read on local file system"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = os.path.join(tmpdirname, "test.txt")
            # write a temporary file to local FS
            with Path(path).open("w") as file:
                file.write("test_local")
            # check the file exists
            self.assertTrue(Path(path).exists())
            # delete the file and confirm it is gone
            Path(path).delete()
            self.assertFalse(Path(path).exists())

    def test_io_path_local_delete_dir(self):
        """Test path write -> read on local file system"""
        with tempfile.TemporaryDirectory() as tmpdirname:
            model_id = "12345"
            path = os.path.join(tmpdirname, model_id)
            Path(path).mkdir()
            # write a temporary file to local FS
            with Path(path + "/test.txt").open("w") as file:
                file.write("test_local")
            # check the parent directory exists
            self.assertTrue(Path(path).parent.exists())
            # delete the directory and confirm it is gone
            Path(path).delete_dir()
            self.assertFalse(Path(path).exists())
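These tests exercise a project-specific Path wrapper rather than pathlib.Path: it compares equal to plain strings, joins segments with '/', and dispatches open/copy/delete to a local or HDFS backend. The string semantics asserted by test_io_path_string and test_io_path_parent could be implemented roughly as follows (an illustrative sketch only; the real class's I/O dispatch through pyarrow is not shown in this snippet):

class Path(str):
    """Illustrative sketch: a str subclass joining segments with '/'."""

    def __new__(cls, *parts):
        return super().__new__(
            cls, '/'.join(str(p).rstrip('/') for p in parts))

    @property
    def parent(self):
        # Everything before the last path component.
        return Path(self.rsplit('/', 1)[0])


assert Path('foo', 'bar') == 'foo/bar'
assert Path('hdfs://root', 'foo/bar') == 'hdfs://root/foo/bar'
assert Path('hdfs://root', 'foo', 'bar').parent == 'hdfs://root/foo'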