def _test_default_hdfs_service(mock_hadoop_home_directory, env_var):
    # Point the environment variable at the mocked Hadoop home, then trigger env var evaluation
    os.environ[env_var] = mock_hadoop_home_directory
    suj = HdfsNamenodeResolver()
    assert env_var == suj._hadoop_env
    assert mock_hadoop_home_directory == suj._hadoop_path
    # List of namenodes returned nominally
    nameservice, namenodes = suj.resolve_default_hdfs_service()
    assert HC.WARP_TURTLE == nameservice
    assert HC.WARP_TURTLE_NN2 == namenodes[0]
    assert HC.WARP_TURTLE_NN1 == namenodes[1]
    # Exception raised for badly defined nameservice (XML issue)
    with pytest.raises(RuntimeError):
        suj.resolve_hdfs_name_service('foobar')
    # None for nonexistent nameservice (intentional design)
    assert suj.resolve_hdfs_name_service('nonexistent') is None
def _test_default_hdfs_service(self, env_var):
    os.environ[env_var] = self._tmp_dir
    # Trigger env var evaluation
    suj = HdfsNamenodeResolver()
    self.assertEqual(env_var, suj._hadoop_env)
    self.assertEqual(self._tmp_dir, suj._hadoop_path)
    # List of namenodes returned nominally
    nameservice, namenodes = suj.resolve_default_hdfs_service()
    self.assertEqual(HC.WARP_TURTLE, nameservice)
    self.assertEqual(HC.WARP_TURTLE_NN2, namenodes[0])
    self.assertEqual(HC.WARP_TURTLE_NN1, namenodes[1])
    # Exception raised for badly defined nameservice (XML issue)
    with self.assertRaises(RuntimeError):
        suj.resolve_hdfs_name_service('foobar')
    # None for nonexistent nameservice (intentional design)
    self.assertIsNone(suj.resolve_hdfs_name_service('nonexistent'))
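# The two test helpers above exercise HdfsNamenodeResolver's environment-driven setup:
# an environment variable is pointed at a Hadoop install/config directory, and the
# resolver records which variable it honored (_hadoop_env) and the directory it
# resolved (_hadoop_path). The sketch below is a minimal, self-contained illustration
# of that "first defined environment variable wins" pattern; the variable names,
# lookup order, and example path are assumptions for illustration only, not the
# resolver's actual implementation.
import os


def _first_hadoop_env(candidates=('HADOOP_HOME', 'HADOOP_PREFIX', 'HADOOP_INSTALL')):
    """Return (env_var_name, path) for the first candidate that is set, else (None, None)."""
    for name in candidates:
        path = os.environ.get(name)
        if path:
            return name, path
    return None, None


if __name__ == '__main__':
    os.environ['HADOOP_HOME'] = '/tmp/mock_hadoop_home'  # hypothetical path
    print(_first_hadoop_env())  # -> ('HADOOP_HOME', '/tmp/mock_hadoop_home')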
def __init__(self, dataset_url, key=None, secret=None, endpoint=None, proxy=None, proxy_port=None,
             hadoop_configuration=None, connector=HdfsConnector, hdfs_driver='libhdfs3'):
    """
    Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
    instantiate a pyarrow filesystem.

    Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

    1. If no ``scheme``, no longer supported, so raise an exception!
    2. If ``scheme`` is ``file``, use local filesystem path.
    3. If ``scheme`` is ``hdfs``:
       a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
          1. If that doesn't work, try connecting directly to namenode ``hostname:port``.
       b. If no host, connect to the default name node.
    4. If ``scheme`` is ``s3``, use s3fs. The user must manually install s3fs before using s3.
    5. Fail otherwise.

    :param dataset_url: The hdfs URL or absolute path to the dataset
    :param key: access key of obs
    :param secret: secret key of obs
    :param endpoint: endpoint of obs
    :param proxy: proxy
    :param proxy_port: proxy_port
    :param hadoop_configuration: an optional hadoop configuration
    :param connector: the HDFS connector object to use (ONLY override for testing purposes)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    """
    # Cache both the original URL and the resolved, urlparsed dataset_url
    self._dataset_url = dataset_url
    self._parsed_dataset_url = None
    # Cache the instantiated filesystem object
    self._filesystem = None

    if isinstance(self._dataset_url, six.string_types):
        self._parsed_dataset_url = urlparse(self._dataset_url)
    else:
        self._parsed_dataset_url = self._dataset_url

    if not self._parsed_dataset_url.scheme:
        # Case 1
        raise ValueError('ERROR! A scheme-less dataset url ({}) is no longer supported. '
                         'Please prepend "file://" for local filesystem.'.format(self._dataset_url))

    elif self._parsed_dataset_url.scheme == 'file':
        # Case 2: definitely local
        self._filesystem = pyarrow.localfs

    elif self._parsed_dataset_url.scheme == 'hdfs':

        if hdfs_driver == 'libhdfs3':
            # libhdfs3 does not do any namenode resolution itself so we do it manually. This is not
            # necessary if using libhdfs

            # Obtain singleton and force hadoop config evaluation
            namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

            # Since we can't tell for sure, first treat the URL as though it references a name service
            if self._parsed_dataset_url.netloc:
                # Case 3a: Use the portion of netloc before any port, which doesn't get lowercased
                nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                namenodes = namenode_resolver.resolve_hdfs_name_service(nameservice)
                if namenodes:
                    self._filesystem = connector.connect_to_either_namenode(namenodes)
                if self._filesystem is None:
                    # Case 3a1: That didn't work; try the URL as a namenode host
                    self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url)
            else:
                # Case 3b: No netloc, so let's try to connect to default namenode
                # HdfsNamenodeResolver will raise exception if it fails to connect.
                nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                filesystem = connector.connect_to_either_namenode(namenodes)
                if filesystem is not None:
                    # Properly replace the parsed dataset URL once default namenode is confirmed
                    self._parsed_dataset_url = urlparse(
                        'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                    self._filesystem = filesystem
        else:
            self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, hdfs_driver)

    elif self._parsed_dataset_url.scheme == 's3a':
        # Case 4
        # S3 support requires s3fs to be installed
        try:
            import s3fs
        except ImportError:
            raise ValueError('Must have s3fs installed in order to use datasets on s3. '
                             'Please install s3fs and try again.')

        if not self._parsed_dataset_url.netloc:
            raise ValueError('URLs must be of the form s3a://bucket/path')

        if key is None or secret is None or endpoint is None:
            raise ValueError('key, secret, endpoint should not be None')

        http_proxy = 'http://' + proxy + ':' + str(proxy_port) if (
            proxy is not None and proxy_port is not None) else None
        https_proxy = 'https://' + proxy + ':' + str(proxy_port) if (
            proxy is not None and proxy_port is not None) else None
        config_kwargs = {'proxies': {'http': http_proxy, 'https': https_proxy}} if (
            http_proxy is not None) else None
        fs = s3fs.S3FileSystem(key=key, secret=secret,
                               client_kwargs={'endpoint_url': endpoint},
                               config_kwargs=config_kwargs)
        self._filesystem = pyarrow.filesystem.S3FSWrapper(fs)

    else:
        # Case 5
        raise ValueError('Unsupported scheme in dataset url {}. '
                         'Currently, only "file", "hdfs", and "s3a" are supported.'
                         .format(self._parsed_dataset_url.scheme))
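# The "s3a" branch above only builds a proxies dict for s3fs when both `proxy` and
# `proxy_port` are given. Below is a minimal, self-contained sketch of that decision
# in isolation (the helper name and the example host/port are hypothetical,
# illustration only); the real constructor additionally passes key/secret/endpoint
# to s3fs.S3FileSystem.
def _build_s3_config_kwargs(proxy=None, proxy_port=None):
    """Mirror the proxy handling above: return s3fs/botocore config kwargs or None."""
    if proxy is None or proxy_port is None:
        return None
    return {'proxies': {'http': 'http://{}:{}'.format(proxy, proxy_port),
                        'https': 'https://{}:{}'.format(proxy, proxy_port)}}


# No proxy configured -> None; proxy configured -> proxies dict
assert _build_s3_config_kwargs() is None
assert _build_s3_config_kwargs('proxy.example.com', 8080) == {
    'proxies': {'http': 'http://proxy.example.com:8080',
                'https': 'https://proxy.example.com:8080'}}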
def __init__(self, dataset_url, hadoop_configuration=None, connector=HdfsConnector, hdfs_driver='libhdfs3'):
    """
    Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
    instantiate a pyarrow filesystem.

    Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

    1. If no ``scheme``, no longer supported, so raise an exception!
    2. If ``scheme`` is ``file``, use local filesystem path.
    3. If ``scheme`` is ``hdfs``:
       a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
       b. If no host, connect to the default name node.
    4. Next, try connecting directly to namenode ``hostname:port``.
    5. Fail otherwise.

    :param dataset_url: The hdfs URL or absolute path to the dataset
    :param hadoop_configuration: an optional hadoop configuration
    :param connector: the HDFS connector object to use (ONLY override for testing purposes)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    """
    # Cache both the original URL and the resolved, urlparsed dataset_url
    self._dataset_url = dataset_url
    self._parsed_dataset_url = None
    # Cache the instantiated filesystem object
    self._filesystem = None

    if isinstance(self._dataset_url, six.string_types):
        self._parsed_dataset_url = urlparse(self._dataset_url)
    else:
        self._parsed_dataset_url = self._dataset_url

    if not self._parsed_dataset_url.scheme:
        # Case 1
        raise ValueError('ERROR! A scheme-less dataset url ({}) is no longer supported. '
                         'Please prepend "file://" for local filesystem.'.format(self._dataset_url))

    elif self._parsed_dataset_url.scheme == 'file':
        # Case 2: definitely local
        self._filesystem = pyarrow.localfs

    elif self._parsed_dataset_url.scheme == 'hdfs':

        if hdfs_driver == 'libhdfs3':
            # libhdfs3 does not do any namenode resolution itself so we do it manually. This is not
            # necessary if using libhdfs

            # Obtain singleton and force hadoop config evaluation
            namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

            # Since we can't tell for sure, first treat the URL as though it references a name service
            if self._parsed_dataset_url.netloc:
                # Case 3a: Use the portion of netloc before any port, which doesn't get lowercased
                nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                namenodes = namenode_resolver.resolve_hdfs_name_service(nameservice)
                if namenodes:
                    self._filesystem = connector.connect_to_either_namenode(namenodes)
                if self._filesystem is None:
                    # Case 4: That didn't work; try the URL as a namenode host
                    self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url)
            else:
                # Case 3b: No netloc, so let's try to connect to default namenode
                # HdfsNamenodeResolver will raise exception if it fails to connect.
                nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                filesystem = connector.connect_to_either_namenode(namenodes)
                if filesystem is not None:
                    # Properly replace the parsed dataset URL once default namenode is confirmed
                    self._parsed_dataset_url = urlparse(
                        'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                    self._filesystem = filesystem
        else:
            self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, hdfs_driver)

    else:
        # Case 5
        raise ValueError('Unsupported scheme in dataset url {}. '
                         'Currently, only "file" and "hdfs" are supported.'.format(self._parsed_dataset_url.scheme))
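# The URL interpretation order documented in the constructors above can be illustrated
# with plain urlparse, independent of any HDFS connectivity. The classifier below is a
# simplified, illustrative sketch (its name and return labels are hypothetical); the
# real resolver additionally attempts namenode connections before settling on a case.
from six.moves.urllib.parse import urlparse


def classify_dataset_url(dataset_url):
    """Return a rough label for which resolution path a dataset URL would take."""
    parsed = urlparse(dataset_url)
    if not parsed.scheme:
        return 'error: scheme-less URL is not supported'
    if parsed.scheme == 'file':
        return 'local filesystem'
    if parsed.scheme == 'hdfs':
        if parsed.netloc:
            return 'hdfs: try netloc {!r} as a nameservice, then as a namenode host'.format(parsed.netloc)
        return 'hdfs: no host, resolve the default nameservice'
    return 'unsupported scheme {!r}'.format(parsed.scheme)


print(classify_dataset_url('hdfs:///some/path'))        # default nameservice
print(classify_dataset_url('hdfs://nameservice1/a/b'))  # nameservice first, then namenode host
print(classify_dataset_url('file:///tmp/dataset'))      # local filesystem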
def __init__(self, dataset_url, hadoop_configuration=None, connector=HdfsConnector,
             hdfs_driver='libhdfs3', user=None, storage_options=None):
    """
    Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
    instantiate a pyarrow filesystem.

    Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

    1. If no ``scheme``, no longer supported, so raise an exception!
    2. If ``scheme`` is ``file``, use local filesystem path.
    3. If ``scheme`` is ``hdfs``:
       a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
          1. If that doesn't work, try connecting directly to namenode ``hostname:port``.
       b. If no host, connect to the default name node.
    4. If ``scheme`` is ``s3``, use s3fs. The user must manually install s3fs before using s3.
    5. If ``scheme`` is ``gs`` or ``gcs``, use gcsfs. The user must manually install gcsfs before using GCS.
    6. Fail otherwise.

    :param dataset_url: The hdfs URL or absolute path to the dataset
    :param hadoop_configuration: an optional hadoop configuration
    :param connector: the HDFS connector object to use (ONLY override for testing purposes)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    :param user: String denoting username when connecting to HDFS. None implies login user.
    :param storage_options: Dict of kwargs forwarded to ``fsspec`` to initialize the filesystem.
    """
    # Cache both the original URL and the resolved, urlparsed dataset_url
    self._dataset_url = dataset_url
    self._parsed_dataset_url = None
    # Cache the instantiated filesystem object
    self._filesystem = None

    if isinstance(self._dataset_url, six.string_types):
        self._parsed_dataset_url = urlparse(self._dataset_url)
    else:
        self._parsed_dataset_url = self._dataset_url

    if not self._parsed_dataset_url.scheme:
        # Case 1
        raise ValueError('ERROR! A scheme-less dataset url ({}) is no longer supported. '
                         'Please prepend "file://" for local filesystem.'.format(self._dataset_url))

    elif self._parsed_dataset_url.scheme == 'file':
        # Case 2: definitely local
        self._filesystem = pyarrow.localfs
        self._filesystem_factory = lambda: pyarrow.localfs

    elif self._parsed_dataset_url.scheme == 'hdfs':

        if hdfs_driver == 'libhdfs3':
            # libhdfs3 does not do any namenode resolution itself so we do it manually. This is not
            # necessary if using libhdfs

            # Obtain singleton and force hadoop config evaluation
            namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

            # Since we can't tell for sure, first treat the URL as though it references a name service
            if self._parsed_dataset_url.netloc:
                # Case 3a: Use the portion of netloc before any port, which doesn't get lowercased
                nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                namenodes = namenode_resolver.resolve_hdfs_name_service(nameservice)
                if namenodes:
                    self._filesystem = connector.connect_to_either_namenode(namenodes, user=user)
                    self._filesystem_factory = lambda: connector.connect_to_either_namenode(namenodes, user=user)
                if self._filesystem is None:
                    # Case 3a1: That didn't work; try the URL as a namenode host
                    self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, user=user)
                    self._filesystem_factory = \
                        lambda url=self._dataset_url, user=user: \
                        connector.hdfs_connect_namenode(urlparse(url), user=user)
            else:
                # Case 3b: No netloc, so let's try to connect to default namenode
                # HdfsNamenodeResolver will raise exception if it fails to connect.
                nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                filesystem = connector.connect_to_either_namenode(namenodes, user=user)
                self._filesystem_factory = lambda: connector.connect_to_either_namenode(namenodes, user=user)
                if filesystem is not None:
                    # Properly replace the parsed dataset URL once default namenode is confirmed
                    self._parsed_dataset_url = urlparse(
                        'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                    self._filesystem = filesystem
        else:
            self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, hdfs_driver, user=user)
            self._filesystem_factory = \
                lambda url=self._dataset_url, user=user: \
                connector.hdfs_connect_namenode(urlparse(url), hdfs_driver, user=user)

    else:
        # Fallback to fsspec to handle any other schemes
        if not self._parsed_dataset_url.netloc:
            raise ValueError('URLs must be of the form {}://bucket/path'.format(self._parsed_dataset_url.scheme))

        storage_options = storage_options or {}
        protocol = self._parsed_dataset_url.scheme
        self._filesystem = fsspec.filesystem(protocol, **storage_options)
        self._filesystem_factory = lambda: fsspec.filesystem(protocol, **storage_options)
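# The fsspec fallback above simply forwards the URL scheme and ``storage_options`` to
# ``fsspec.filesystem``. The snippet below is a minimal, self-contained illustration
# using fsspec's in-memory filesystem, so it needs no cloud credentials; for a real
# bucket one would pass e.g. protocol "s3" or "gs" plus the corresponding storage
# options (an assumption that s3fs/gcsfs is installed for that backend).
import fsspec

storage_options = {}  # backend-specific kwargs, e.g. credentials (assumption: depends on the backend)
fs = fsspec.filesystem('memory', **storage_options)
fs.pipe('/demo/hello.txt', b'hello world')  # write a small object
print(fs.cat('/demo/hello.txt'))            # -> b'hello world'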
class HdfsNamenodeResolverTest(unittest.TestCase):
    def setUp(self):
        """Initializes a mock hadoop config and a namenode resolver instance, for convenience."""
        self._hadoop_configuration = MockHadoopConfiguration()
        self.suj = HdfsNamenodeResolver(self._hadoop_configuration)

    def test_default_hdfs_service_errors(self):
        """Check error cases with connecting to default namenode"""
        # No default yields RuntimeError
        with self.assertRaises(RuntimeError):
            self.suj.resolve_default_hdfs_service()
        # Bad default FS yields IOError
        self._hadoop_configuration.set('fs.defaultFS', 'invalidFS')
        with self.assertRaises(IOError):
            self.suj.resolve_default_hdfs_service()
        # Random FS host yields IOError
        self._hadoop_configuration.set('fs.defaultFS', 'hdfs://random')
        with self.assertRaises(IOError):
            self.suj.resolve_default_hdfs_service()
        # Valid FS host with no namenode defined yields IOError
        self._hadoop_configuration.set('fs.defaultFS', HC.FS_WARP_TURTLE)
        with self.assertRaises(IOError):
            self.suj.resolve_default_hdfs_service()

    def test_default_hdfs_service_typical(self):
        """Check typical cases resolving default namenode"""
        # One nn
        self._hadoop_configuration.set('fs.defaultFS', HC.FS_WARP_TURTLE)
        self._hadoop_configuration.set('dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), 'nn1')
        self._hadoop_configuration.set('dfs.namenode.rpc-address.{}.nn1'.format(HC.WARP_TURTLE), HC.WARP_TURTLE_NN1)
        nameservice, namenodes = self.suj.resolve_default_hdfs_service()
        self.assertEqual(HC.WARP_TURTLE, nameservice)
        self.assertEqual(HC.WARP_TURTLE_NN1, namenodes[0])
        # Second of two nns, when the first is undefined
        self._hadoop_configuration.set('dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), 'nn2,nn1')
        with self.assertRaises(RuntimeError):
            self.suj.resolve_default_hdfs_service()
        # Two valid and defined nns
        self._hadoop_configuration.set('dfs.namenode.rpc-address.{}.nn2'.format(HC.WARP_TURTLE), HC.WARP_TURTLE_NN2)
        nameservice, namenodes = self.suj.resolve_default_hdfs_service()
        self.assertEqual(HC.WARP_TURTLE, nameservice)
        self.assertEqual(HC.WARP_TURTLE_NN2, namenodes[0])
        self.assertEqual(HC.WARP_TURTLE_NN1, namenodes[1])

    def test_resolve_hdfs_name_service(self):
        """Check edge cases with resolving a nameservice"""
        # Most cases already covered by test_default_hdfs_service_typical above...
        # Empty config or no namespace yields None
        self.assertIsNone(HdfsNamenodeResolver({}).resolve_hdfs_name_service(''))
        self.assertIsNone(self.suj.resolve_hdfs_name_service(''))
        # Test a single undefined namenode case, as well as an unconventional multi-NN case;
        # both result in an exception raised
        self._hadoop_configuration.set('fs.defaultFS', HC.FS_WARP_TURTLE)
        self._hadoop_configuration.set('dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), 'nn1')
        with self.assertRaises(RuntimeError):
            self.suj.resolve_hdfs_name_service(HC.WARP_TURTLE)
        # Test multiple undefined NNs, which will also throw HdfsConnectError
        nns = 'nn1,nn2,nn3,nn4,nn5,nn6,nn7,nn8'
        self._hadoop_configuration.set('dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), nns)
        with self.assertRaises(RuntimeError):
            self.suj.resolve_hdfs_name_service(HC.WARP_TURTLE)
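# The mock configuration used in the tests above drives namenode resolution through the
# standard Hadoop HA keys: ``fs.defaultFS`` names the default nameservice,
# ``dfs.ha.namenodes.<nameservice>`` lists namenode ids, and
# ``dfs.namenode.rpc-address.<nameservice>.<id>`` maps each id to host:port. Below is a
# minimal, self-contained sketch of that lookup over a plain dict; the function name and
# sample values are illustrative only, and it omits the IOError/RuntimeError handling
# the tests exercise.
def resolve_namenodes(conf, nameservice):
    """Return the list of host:port namenode addresses configured for a nameservice."""
    ids = conf.get('dfs.ha.namenodes.{}'.format(nameservice), '')
    return [conf['dfs.namenode.rpc-address.{}.{}'.format(nameservice, nn_id)]
            for nn_id in ids.split(',') if nn_id]


sample_conf = {
    'fs.defaultFS': 'hdfs://warp-turtle',
    'dfs.ha.namenodes.warp-turtle': 'nn2,nn1',
    'dfs.namenode.rpc-address.warp-turtle.nn1': 'some.host.name:8020',
    'dfs.namenode.rpc-address.warp-turtle.nn2': 'other.host.name:8020',
}
print(resolve_namenodes(sample_conf, 'warp-turtle'))
# -> ['other.host.name:8020', 'some.host.name:8020']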