def _get_fs_and_protocol(self):
    """Resolve an fsspec filesystem and protocol from ``self.prefix_path``."""
    import fsspec
    from fsspec.core import split_protocol
    from fsspec.utils import update_storage_options

    storage_options = self.storage_options or {}
    protocol, path = split_protocol(self.prefix_path)
    # Look up the filesystem class for the URL protocol, merge kwargs inferred from
    # the URL with the user-supplied storage options, and instantiate the filesystem.
    cls = fsspec.get_filesystem_class(protocol)
    options = cls._get_kwargs_from_urls(self.prefix_path)
    update_storage_options(options, storage_options)
    fs = cls(**options)
    return fs, protocol
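
# --- Usage sketch (illustrative, not part of the original module) ---
# The same fsspec resolution pattern can be exercised standalone. The helper name, the
# s3 URL, and the {'anon': True} option below are hypothetical; any fsspec-supported
# protocol works, provided its backend (e.g. s3fs) is installed.
def _example_fs_from_url(url, storage_options=None):
    import fsspec
    from fsspec.core import split_protocol
    from fsspec.utils import update_storage_options

    protocol, _path = split_protocol(url)
    cls = fsspec.get_filesystem_class(protocol)
    options = cls._get_kwargs_from_urls(url)
    update_storage_options(options, storage_options or {})
    return cls(**options), protocol

# fs, protocol = _example_fs_from_url('s3://my-bucket/prefix', {'anon': True})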
def _open_dataset(self):
    """Open the zarr store at ``self.urlpath`` as an xarray Dataset."""
    import xarray as xr
    import fsspec
    assert fsspec.__version__ >= "0.3.6", "zarr plugin requires fsspec >= 0.3.6"
    from fsspec import filesystem, get_mapper
    from fsspec.utils import update_storage_options, infer_storage_options

    storage_options = infer_storage_options(self.urlpath)
    update_storage_options(storage_options, self.storage_options)
    self._fs = filesystem(storage_options['protocol'])

    if storage_options['protocol'] != 'file':
        # Remote store: hand xarray an fsspec mapper over the URL.
        self._mapper = get_mapper(self.urlpath)
        self._ds = xr.open_zarr(self._mapper, **self.kwargs)
    else:
        # Local store: xarray/zarr can open the path directly.
        self._ds = xr.open_zarr(self.urlpath, **self.kwargs)
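
# --- Usage sketch (illustrative, not from the original plugin) ---
# A standalone version of the protocol check above: remote URLs go through an fsspec
# mapper, local paths go straight to xarray. The function name and the gs:// URL in the
# commented call are assumptions for the example; ``gcsfs`` would need to be installed.
def _example_open_zarr(urlpath, storage_options=None, **kwargs):
    import xarray as xr
    import fsspec
    from fsspec.utils import infer_storage_options

    protocol = infer_storage_options(urlpath)['protocol']
    if protocol != 'file':
        mapper = fsspec.get_mapper(urlpath, **(storage_options or {}))
        return xr.open_zarr(mapper, **kwargs)
    return xr.open_zarr(urlpath, **kwargs)

# ds = _example_open_zarr('gs://some-bucket/store.zarr', storage_options={'token': 'anon'})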
def __init__(self, dataset_url, hadoop_configuration=None, connector=HdfsConnector,
             hdfs_driver='libhdfs3', user=None, storage_options=None):
    """
    Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
    instantiate a pyarrow filesystem.

    Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

    1. If no ``scheme``, no longer supported, so raise an exception!
    2. If ``scheme`` is ``file``, use local filesystem path.
    3. If ``scheme`` is ``hdfs``:
       a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
          1. If that doesn't work, try connecting directly to namenode ``hostname:port``.
       b. If no host, connect to the default name node.
    4. If ``scheme`` is ``s3``, use s3fs. The user must manually install s3fs before using s3.
    5. If ``scheme`` is ``gs`` or ``gcs``, use gcsfs. The user must manually install gcsfs before using GCS.
    6. Fail otherwise.

    :param dataset_url: The hdfs URL or absolute path to the dataset
    :param hadoop_configuration: an optional hadoop configuration
    :param connector: the HDFS connector object to use (ONLY override for testing purposes)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    :param user: String denoting username when connecting to HDFS. None implies login user.
    :param storage_options: Dict of kwargs forwarded to ``fsspec`` to initialize the filesystem.
    """
    # Cache both the original URL and the resolved, urlparsed dataset_url
    self._dataset_url = dataset_url
    self._parsed_dataset_url = None
    # Cache the instantiated filesystem object
    self._filesystem = None

    if isinstance(self._dataset_url, six.string_types):
        self._parsed_dataset_url = urlparse(self._dataset_url)
    else:
        self._parsed_dataset_url = self._dataset_url

    if not self._parsed_dataset_url.scheme:
        # Case 1
        raise ValueError(
            'ERROR! A scheme-less dataset url ({}) is no longer supported. '
            'Please prepend "file://" for local filesystem.'.format(self._dataset_url))

    elif self._parsed_dataset_url.scheme == 'file':
        # Case 2: definitely local
        self._filesystem = pyarrow.localfs
        self._filesystem_factory = lambda: pyarrow.localfs

    elif self._parsed_dataset_url.scheme == 'hdfs':

        if hdfs_driver == 'libhdfs3':
            # libhdfs3 does not do any namenode resolution itself, so we do it manually.
            # This is not necessary if using libhdfs.

            # Obtain singleton and force hadoop config evaluation
            namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

            # Since we can't tell for sure, first treat the URL as though it references a name service
            if self._parsed_dataset_url.netloc:
                # Case 3a: Use the portion of netloc before any port, which doesn't get lowercased
                nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                namenodes = namenode_resolver.resolve_hdfs_name_service(nameservice)
                if namenodes:
                    self._filesystem = connector.connect_to_either_namenode(namenodes, user=user)
                    self._filesystem_factory = lambda: connector.connect_to_either_namenode(namenodes, user=user)
                if self._filesystem is None:
                    # Case 3a1: That didn't work; try the URL as a namenode host
                    self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, user=user)
                    self._filesystem_factory = \
                        lambda url=self._dataset_url, user=user: \
                        connector.hdfs_connect_namenode(urlparse(url), user=user)
            else:
                # Case 3b: No netloc, so let's try to connect to the default namenode.
                # HdfsNamenodeResolver will raise an exception if it fails to connect.
                nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                filesystem = connector.connect_to_either_namenode(namenodes, user=user)
                self._filesystem_factory = lambda: connector.connect_to_either_namenode(namenodes, user=user)
                if filesystem is not None:
                    # Properly replace the parsed dataset URL once default namenode is confirmed
                    self._parsed_dataset_url = urlparse(
                        'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                    self._filesystem = filesystem
        else:
            self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, hdfs_driver, user=user)
            self._filesystem_factory = \
                lambda url=self._dataset_url, user=user: \
                connector.hdfs_connect_namenode(urlparse(url), hdfs_driver, user=user)

    else:
        # Fallback to fsspec to handle any other schemes
        if not self._parsed_dataset_url.netloc:
            raise ValueError('URLs must be of the form {}://bucket/path'.format(self._parsed_dataset_url.scheme))

        storage_options = storage_options or {}
        protocol = self._parsed_dataset_url.scheme
        cls = fsspec.get_filesystem_class(protocol)
        options = cls._get_kwargs_from_urls(self._dataset_url)
        update_storage_options(options, storage_options)
        self._filesystem = cls(**options)
        self._filesystem_factory = lambda: cls(**options)  # pylint: disable=unnecessary-lambda
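
# --- Usage sketch (illustrative) ---
# This ``__init__`` appears to belong to a filesystem-resolver class; the class name
# ``FilesystemResolver`` and the attributes read below are assumptions based on the code
# above, not confirmed by this excerpt.
#
#   local = FilesystemResolver('file:///tmp/my_dataset')                       # Case 2: pyarrow.localfs
#   hdfs = FilesystemResolver('hdfs://namenode:8020/datasets/x', user='etl')   # Case 3: HDFS connector
#   s3 = FilesystemResolver('s3://my-bucket/datasets/x', storage_options={'anon': True})  # fsspec fallback
#
#   fs = s3._filesystem                  # filesystem instantiated eagerly in __init__
#   fs_again = s3._filesystem_factory()  # zero-argument factory set alongside it
#
# Caching a zero-argument factory next to the live filesystem is a deliberate design choice:
# later code can re-create an equivalent filesystem without re-running the URL parsing and
# namenode-resolution logic above.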