Example #1
 def _get_fs_and_protocol(self):
     # Requires module-level imports such as:
     #   import fsspec
     #   from fsspec.core import split_protocol
     #   from fsspec.utils import update_storage_options
     storage_options = self.storage_options or {}
     protocol, path = split_protocol(self.prefix_path)
     # Look up the filesystem class registered for the URL's protocol
     cls = fsspec.get_filesystem_class(protocol)
     # Merge kwargs inferred from the URL with the explicitly supplied storage options
     options = cls._get_kwargs_from_urls(self.prefix_path)
     update_storage_options(options, storage_options)
     fs = cls(**options)
     return fs, protocol
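
As a quick illustration of what the helpers above do, the lines below walk through the same resolution step by step for an assumed s3 URL; the bucket name and the anon option are placeholders, and the s3fs backend must be installed for this to run.

import fsspec
from fsspec.core import split_protocol
from fsspec.utils import update_storage_options

protocol, path = split_protocol("s3://my-bucket/data")      # ("s3", "my-bucket/data")
cls = fsspec.get_filesystem_class(protocol)                 # s3fs.S3FileSystem, if s3fs is installed
options = cls._get_kwargs_from_urls("s3://my-bucket/data")  # kwargs inferred from the URL itself
update_storage_options(options, {"anon": True})             # merge in user-supplied options
fs = cls(**options)                                         # ready-to-use filesystem instance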
Example #2
    def _open_dataset(self):
        import xarray as xr
        import fsspec
        from packaging.version import Version

        # Compare versions properly rather than as raw strings
        assert Version(fsspec.__version__) >= Version("0.3.6"), \
            "zarr plugin requires fsspec >= 0.3.6"
        from fsspec import filesystem, get_mapper
        from fsspec.utils import infer_storage_options, update_storage_options

        storage_options = infer_storage_options(self.urlpath)
        update_storage_options(storage_options, self.storage_options)
        self._fs = filesystem(storage_options['protocol'])
        if storage_options['protocol'] != 'file':
            # Remote store: wrap the URL in an fsspec key-value mapper for zarr
            self._mapper = get_mapper(self.urlpath)
            self._ds = xr.open_zarr(self._mapper, **self.kwargs)
        else:
            # Local store: zarr can open the path directly
            self._ds = xr.open_zarr(self.urlpath, **self.kwargs)
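
The remote branch above hands xarray an fsspec mapper rather than a plain path. A minimal sketch of that pattern in isolation is shown below; the store URL and the anon/consolidated values are illustrative assumptions, and s3fs plus a zarr-capable xarray install are required.

import fsspec
import xarray as xr

# Hypothetical store; extra kwargs (here anon=True) are forwarded by get_mapper
# to the underlying filesystem (s3fs for an s3:// URL).
url = "s3://some-public-bucket/dataset.zarr"
mapper = fsspec.get_mapper(url, anon=True)
ds = xr.open_zarr(mapper, consolidated=True)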
Example #3
    def __init__(self,
                 dataset_url,
                 hadoop_configuration=None,
                 connector=HdfsConnector,
                 hdfs_driver='libhdfs3',
                 user=None,
                 storage_options=None):
        """
        Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
        instantiate a pyarrow filesystem.

        Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

        1. If there is no ``scheme``, the URL is no longer supported, so raise an exception.
        2. If ``scheme`` is ``file``, use the local filesystem path.
        3. If ``scheme`` is ``hdfs``:
           a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
              1. If that doesn't work, try connecting directly to namenode ``hostname:port``.
           b. If no host is given, connect to the default name node.
        4. If ``scheme`` is ``s3``, use s3fs. The user must manually install s3fs before using s3.
        5. If ``scheme`` is ``gs`` or ``gcs``, use gcsfs. The user must manually install gcsfs before using GCS.
        6. Fail otherwise.

        :param dataset_url: The hdfs URL or absolute path to the dataset
        :param hadoop_configuration: an optional hadoop configuration
        :param connector: the HDFS connector object to use (ONLY override for testing purposes)
        :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices
            are libhdfs (java through JNI) or libhdfs3 (C++).
        :param user: String denoting username when connecting to HDFS. None implies login user.
        :param storage_options: Dict of kwargs forwarded to ``fsspec`` to initialize the filesystem.
        """
        # Cache both the original URL and the resolved, urlparsed dataset_url
        self._dataset_url = dataset_url
        self._parsed_dataset_url = None
        # Cache the instantiated filesystem object
        self._filesystem = None

        if isinstance(self._dataset_url, six.string_types):
            self._parsed_dataset_url = urlparse(self._dataset_url)
        else:
            self._parsed_dataset_url = self._dataset_url

        if not self._parsed_dataset_url.scheme:
            # Case 1
            raise ValueError(
                'ERROR! A scheme-less dataset url ({}) is no longer supported. '
                'Please prepend "file://" for local filesystem.'.format(
                    self._dataset_url))

        elif self._parsed_dataset_url.scheme == 'file':
            # Case 2: definitely local
            self._filesystem = pyarrow.localfs
            self._filesystem_factory = lambda: pyarrow.localfs

        elif self._parsed_dataset_url.scheme == 'hdfs':

            if hdfs_driver == 'libhdfs3':
                # libhdfs3 does not do any namenode resolution itself so we do it manually. This is not necessary
                # if using libhdfs

                # Obtain singleton and force hadoop config evaluation
                namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

                # Since we can't tell for sure, first treat the URL as though it references a name service
                if self._parsed_dataset_url.netloc:
                    # Case 3a: Use the portion of netloc before any port, which doesn't get lowercased
                    nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                    namenodes = namenode_resolver.resolve_hdfs_name_service(
                        nameservice)
                    if namenodes:
                        self._filesystem = connector.connect_to_either_namenode(
                            namenodes, user=user)
                        self._filesystem_factory = lambda: connector.connect_to_either_namenode(
                            namenodes, user=user)
                    if self._filesystem is None:
                        # Case 3a1: That didn't work; try the URL as a namenode host
                        self._filesystem = connector.hdfs_connect_namenode(
                            self._parsed_dataset_url, user=user)
                        self._filesystem_factory = \
                            lambda url=self._dataset_url, user=user: \
                            connector.hdfs_connect_namenode(urlparse(url), user=user)
                else:
                    # Case 3b: No netloc, so let's try to connect to default namenode
                    # HdfsNamenodeResolver will raise exception if it fails to connect.
                    nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                    filesystem = connector.connect_to_either_namenode(
                        namenodes, user=user)
                    self._filesystem_factory = lambda: connector.connect_to_either_namenode(
                        namenodes, user=user)
                    if filesystem is not None:
                        # Properly replace the parsed dataset URL once default namenode is confirmed
                        self._parsed_dataset_url = urlparse(
                            'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                        self._filesystem = filesystem
            else:
                self._filesystem = connector.hdfs_connect_namenode(
                    self._parsed_dataset_url, hdfs_driver, user=user)
                self._filesystem_factory = \
                    lambda url=self._dataset_url, user=user: \
                    connector.hdfs_connect_namenode(urlparse(url), hdfs_driver, user=user)
        else:
            # Fallback to fsspec to handle any other schemes
            if not self._parsed_dataset_url.netloc:
                raise ValueError(
                    'URLs must be of the form {}://bucket/path'.format(
                        self._parsed_dataset_url.scheme))

            storage_options = storage_options or {}
            protocol = self._parsed_dataset_url.scheme
            cls = fsspec.get_filesystem_class(protocol)
            options = cls._get_kwargs_from_urls(self._dataset_url)
            update_storage_options(options, storage_options)
            self._filesystem = cls(**options)
            self._filesystem_factory = lambda: cls(**options)  # pylint: disable=unnecessary-lambda
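
To tie the docstring's resolution order above to something runnable, here is a heavily simplified, standalone sketch of the same dispatch. It is not the implementation above: the HDFS branch is stubbed out instead of going through the namenode connector, and the function name, error messages, and option handling are assumptions.

from urllib.parse import urlparse

import fsspec
from fsspec.utils import update_storage_options


def pick_filesystem(dataset_url, storage_options=None):
    parsed = urlparse(dataset_url)
    if not parsed.scheme:
        # Case 1: scheme-less URLs are rejected
        raise ValueError('Scheme-less dataset url is not supported: {}'.format(dataset_url))
    if parsed.scheme == 'file':
        # Case 2: local filesystem
        return fsspec.filesystem('file')
    if parsed.scheme == 'hdfs':
        # Case 3: namenode / name-service resolution is handled by the connector
        # machinery in the snippet above and is not reproduced here.
        raise NotImplementedError('Use the HDFS connector path shown above')
    if not parsed.netloc:
        raise ValueError('URLs must be of the form {}://bucket/path'.format(parsed.scheme))
    # Cases 4-6: any other scheme (s3, gs/gcs, ...) is delegated to fsspec; an
    # unknown protocol makes get_filesystem_class raise, covering "fail otherwise".
    cls = fsspec.get_filesystem_class(parsed.scheme)
    options = cls._get_kwargs_from_urls(dataset_url)
    update_storage_options(options, storage_options or {})
    return cls(**options)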