def _test_default_hdfs_service(mock_hadoop_home_directory, env_var):
    # Point the environment variable at the mocked Hadoop home, then trigger env var evaluation
    os.environ[env_var] = mock_hadoop_home_directory
    suj = HdfsNamenodeResolver()
    assert env_var == suj._hadoop_env
    assert mock_hadoop_home_directory == suj._hadoop_path
    # List of namenodes returned nominally
    nameservice, namenodes = suj.resolve_default_hdfs_service()
    assert HC.WARP_TURTLE == nameservice
    assert HC.WARP_TURTLE_NN2 == namenodes[0]
    assert HC.WARP_TURTLE_NN1 == namenodes[1]
    # Exception raised for badly defined nameservice (XML issue)
    with pytest.raises(RuntimeError):
        suj.resolve_hdfs_name_service('foobar')
    # None for nonexistent nameservice (intentional design)
    assert suj.resolve_hdfs_name_service('nonexistent') is None
def _test_default_hdfs_service(self, env_var):
    os.environ[env_var] = self._tmp_dir
    # Trigger env var evaluation
    suj = HdfsNamenodeResolver()
    self.assertEqual(env_var, suj._hadoop_env)
    self.assertEqual(self._tmp_dir, suj._hadoop_path)
    # List of namenodes returned nominally
    nameservice, namenodes = suj.resolve_default_hdfs_service()
    self.assertEqual(HC.WARP_TURTLE, nameservice)
    self.assertEqual(HC.WARP_TURTLE_NN2, namenodes[0])
    self.assertEqual(HC.WARP_TURTLE_NN1, namenodes[1])
    # Exception raised for badly defined nameservice (XML issue)
    with self.assertRaises(RuntimeError):
        suj.resolve_hdfs_name_service('foobar')
    # None for nonexistent nameservice (intentional design)
    self.assertIsNone(suj.resolve_hdfs_name_service('nonexistent'))
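# The two test helpers above exercise HdfsNamenodeResolver's environment-driven setup:
# an environment variable is pointed at a Hadoop install/config directory, and the
# resolver records which variable it honored (_hadoop_env) and the directory it
# resolved (_hadoop_path). The sketch below is a minimal, self-contained illustration
# of that "first defined environment variable wins" pattern; the variable names,
# lookup order, and example path are assumptions for illustration only, not the
# resolver's actual implementation.
import os


def _first_hadoop_env(candidates=('HADOOP_HOME', 'HADOOP_PREFIX', 'HADOOP_INSTALL')):
    """Return (env_var_name, path) for the first candidate that is set, else (None, None)."""
    for name in candidates:
        path = os.environ.get(name)
        if path:
            return name, path
    return None, None


if __name__ == '__main__':
    os.environ['HADOOP_HOME'] = '/tmp/mock_hadoop_home'  # hypothetical path
    print(_first_hadoop_env())  # -> ('HADOOP_HOME', '/tmp/mock_hadoop_home')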
def __init__(self, dataset_url, key=None, secret=None, endpoint=None, proxy=None, proxy_port=None,
             hadoop_configuration=None, connector=HdfsConnector, hdfs_driver='libhdfs3'):
    """
    Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
    instantiate a pyarrow filesystem.

    Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

    1. If no ``scheme``, no longer supported, so raise an exception!
    2. If ``scheme`` is ``file``, use local filesystem path.
    3. If ``scheme`` is ``hdfs``:
       a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
          1. If that doesn't work, try connecting directly to namenode ``hostname:port``.
       b. If no host, connect to the default name node.
    4. If ``scheme`` is ``s3``, use s3fs. The user must manually install s3fs before using s3.
    5. Fail otherwise.

    :param dataset_url: The hdfs URL or absolute path to the dataset
    :param key: access key of obs
    :param secret: secret key of obs
    :param endpoint: endpoint of obs
    :param proxy: proxy
    :param proxy_port: proxy_port
    :param hadoop_configuration: an optional hadoop configuration
    :param connector: the HDFS connector object to use (ONLY override for testing purposes)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    """
    # Cache both the original URL and the resolved, urlparsed dataset_url
    self._dataset_url = dataset_url
    self._parsed_dataset_url = None
    # Cache the instantiated filesystem object
    self._filesystem = None

    if isinstance(self._dataset_url, six.string_types):
        self._parsed_dataset_url = urlparse(self._dataset_url)
    else:
        self._parsed_dataset_url = self._dataset_url

    if not self._parsed_dataset_url.scheme:
        # Case 1
        raise ValueError('ERROR! A scheme-less dataset url ({}) is no longer supported. '
                         'Please prepend "file://" for local filesystem.'.format(self._dataset_url))

    elif self._parsed_dataset_url.scheme == 'file':
        # Case 2: definitely local
        self._filesystem = pyarrow.localfs

    elif self._parsed_dataset_url.scheme == 'hdfs':

        if hdfs_driver == 'libhdfs3':
            # libhdfs3 does not do any namenode resolution itself so we do it manually. This is not
            # necessary if using libhdfs

            # Obtain singleton and force hadoop config evaluation
            namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

            # Since we can't tell for sure, first treat the URL as though it references a name service
            if self._parsed_dataset_url.netloc:
                # Case 3a: Use the portion of netloc before any port, which doesn't get lowercased
                nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                namenodes = namenode_resolver.resolve_hdfs_name_service(nameservice)
                if namenodes:
                    self._filesystem = connector.connect_to_either_namenode(namenodes)
                if self._filesystem is None:
                    # Case 3a1: That didn't work; try the URL as a namenode host
                    self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url)
            else:
                # Case 3b: No netloc, so let's try to connect to default namenode
                # HdfsNamenodeResolver will raise exception if it fails to connect.
                nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                filesystem = connector.connect_to_either_namenode(namenodes)
                if filesystem is not None:
                    # Properly replace the parsed dataset URL once default namenode is confirmed
                    self._parsed_dataset_url = urlparse(
                        'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                    self._filesystem = filesystem
        else:
            self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, hdfs_driver)

    elif self._parsed_dataset_url.scheme == 's3a':
        # Case 4
        # S3 support requires s3fs to be installed
        try:
            import s3fs
        except ImportError:
            raise ValueError('Must have s3fs installed in order to use datasets on s3. '
                             'Please install s3fs and try again.')

        if not self._parsed_dataset_url.netloc:
            raise ValueError('URLs must be of the form s3a://bucket/path')

        if key is None or secret is None or endpoint is None:
            raise ValueError('key, secret, endpoint should not be None')

        http_proxy = 'http://' + proxy + ':' + str(proxy_port) if (
            proxy is not None and proxy_port is not None) else None
        https_proxy = 'https://' + proxy + ':' + str(proxy_port) if (
            proxy is not None and proxy_port is not None) else None
        config_kwargs = {'proxies': {'http': http_proxy, 'https': https_proxy}} if (
            http_proxy is not None) else None
        fs = s3fs.S3FileSystem(key=key, secret=secret,
                               client_kwargs={'endpoint_url': endpoint},
                               config_kwargs=config_kwargs)
        self._filesystem = pyarrow.filesystem.S3FSWrapper(fs)

    else:
        # Case 5
        raise ValueError('Unsupported scheme in dataset url {}. '
                         'Currently, only "file", "hdfs", and "s3a" are supported.'
                         .format(self._parsed_dataset_url.scheme))
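# The "s3a" branch above only builds a proxies dict for s3fs when both `proxy` and
# `proxy_port` are given. Below is a minimal, self-contained sketch of that decision
# in isolation (the helper name and the example host/port are hypothetical,
# illustration only); the real constructor additionally passes key/secret/endpoint
# to s3fs.S3FileSystem.
def _build_s3_config_kwargs(proxy=None, proxy_port=None):
    """Mirror the proxy handling above: return s3fs/botocore config kwargs or None."""
    if proxy is None or proxy_port is None:
        return None
    return {'proxies': {'http': 'http://{}:{}'.format(proxy, proxy_port),
                        'https': 'https://{}:{}'.format(proxy, proxy_port)}}


# No proxy configured -> None; proxy configured -> proxies dict
assert _build_s3_config_kwargs() is None
assert _build_s3_config_kwargs('proxy.example.com', 8080) == {
    'proxies': {'http': 'http://proxy.example.com:8080',
                'https': 'https://proxy.example.com:8080'}}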
def __init__(self, dataset_url, hadoop_configuration=None, connector=HdfsConnector, hdfs_driver='libhdfs3'):
    """
    Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
    instantiate a pyarrow filesystem.

    Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

    1. If no ``scheme``, no longer supported, so raise an exception!
    2. If ``scheme`` is ``file``, use local filesystem path.
    3. If ``scheme`` is ``hdfs``:
       a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
       b. If no host, connect to the default name node.
    4. Next, try connecting directly to namenode ``hostname:port``.
    5. Fail otherwise.

    :param dataset_url: The hdfs URL or absolute path to the dataset
    :param hadoop_configuration: an optional hadoop configuration
    :param connector: the HDFS connector object to use (ONLY override for testing purposes)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    """
    # Cache both the original URL and the resolved, urlparsed dataset_url
    self._dataset_url = dataset_url
    self._parsed_dataset_url = None
    # Cache the instantiated filesystem object
    self._filesystem = None

    if isinstance(self._dataset_url, six.string_types):
        self._parsed_dataset_url = urlparse(self._dataset_url)
    else:
        self._parsed_dataset_url = self._dataset_url

    if not self._parsed_dataset_url.scheme:
        # Case 1
        raise ValueError('ERROR! A scheme-less dataset url ({}) is no longer supported. '
                         'Please prepend "file://" for local filesystem.'.format(self._dataset_url))

    elif self._parsed_dataset_url.scheme == 'file':
        # Case 2: definitely local
        self._filesystem = pyarrow.localfs

    elif self._parsed_dataset_url.scheme == 'hdfs':

        if hdfs_driver == 'libhdfs3':
            # libhdfs3 does not do any namenode resolution itself so we do it manually. This is not
            # necessary if using libhdfs

            # Obtain singleton and force hadoop config evaluation
            namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

            # Since we can't tell for sure, first treat the URL as though it references a name service
            if self._parsed_dataset_url.netloc:
                # Case 3a: Use the portion of netloc before any port, which doesn't get lowercased
                nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                namenodes = namenode_resolver.resolve_hdfs_name_service(nameservice)
                if namenodes:
                    self._filesystem = connector.connect_to_either_namenode(namenodes)
                if self._filesystem is None:
                    # Case 4: That didn't work; try the URL as a namenode host
                    self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url)
            else:
                # Case 3b: No netloc, so let's try to connect to default namenode
                # HdfsNamenodeResolver will raise exception if it fails to connect.
                nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                filesystem = connector.connect_to_either_namenode(namenodes)
                if filesystem is not None:
                    # Properly replace the parsed dataset URL once default namenode is confirmed
                    self._parsed_dataset_url = urlparse(
                        'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                    self._filesystem = filesystem
        else:
            self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, hdfs_driver)

    else:
        # Case 5
        raise ValueError('Unsupported scheme in dataset url {}. '
                         'Currently, only "file" and "hdfs" are supported.'.format(self._parsed_dataset_url.scheme))
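# The URL interpretation order documented in the constructors above can be illustrated
# with plain urlparse, independent of any HDFS connectivity. The classifier below is a
# simplified, illustrative sketch (its name and return labels are hypothetical); the
# real resolver additionally attempts namenode connections before settling on a case.
from six.moves.urllib.parse import urlparse


def classify_dataset_url(dataset_url):
    """Return a rough label for which resolution path a dataset URL would take."""
    parsed = urlparse(dataset_url)
    if not parsed.scheme:
        return 'error: scheme-less URL is not supported'
    if parsed.scheme == 'file':
        return 'local filesystem'
    if parsed.scheme == 'hdfs':
        if parsed.netloc:
            return 'hdfs: try netloc {!r} as a nameservice, then as a namenode host'.format(parsed.netloc)
        return 'hdfs: no host, resolve the default nameservice'
    return 'unsupported scheme {!r}'.format(parsed.scheme)


print(classify_dataset_url('hdfs:///some/path'))        # default nameservice
print(classify_dataset_url('hdfs://nameservice1/a/b'))  # nameservice first, then namenode host
print(classify_dataset_url('file:///tmp/dataset'))      # local filesystem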
def __init__(self, dataset_url, hadoop_configuration=None, connector=HdfsConnector,
             hdfs_driver='libhdfs3', user=None, storage_options=None):
    """
    Given a dataset URL and an optional hadoop configuration, parse and interpret the URL to
    instantiate a pyarrow filesystem.

    Interpretation of the URL ``scheme://hostname:port/path`` occurs in the following order:

    1. If no ``scheme``, no longer supported, so raise an exception!
    2. If ``scheme`` is ``file``, use local filesystem path.
    3. If ``scheme`` is ``hdfs``:
       a. Try the ``hostname`` as a namespace and attempt to connect to a name node.
          1. If that doesn't work, try connecting directly to namenode ``hostname:port``.
       b. If no host, connect to the default name node.
    4. If ``scheme`` is ``s3``, use s3fs. The user must manually install s3fs before using s3.
    5. If ``scheme`` is ``gs`` or ``gcs``, use gcsfs. The user must manually install gcsfs before using GCS.
    6. Fail otherwise.

    :param dataset_url: The hdfs URL or absolute path to the dataset
    :param hadoop_configuration: an optional hadoop configuration
    :param connector: the HDFS connector object to use (ONLY override for testing purposes)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs).
        Current choices are libhdfs (java through JNI) or libhdfs3 (C++)
    :param user: String denoting username when connecting to HDFS. None implies login user.
    :param storage_options: Dict of kwargs forwarded to ``fsspec`` to initialize the filesystem.
    """
    # Cache both the original URL and the resolved, urlparsed dataset_url
    self._dataset_url = dataset_url
    self._parsed_dataset_url = None
    # Cache the instantiated filesystem object
    self._filesystem = None

    if isinstance(self._dataset_url, six.string_types):
        self._parsed_dataset_url = urlparse(self._dataset_url)
    else:
        self._parsed_dataset_url = self._dataset_url

    if not self._parsed_dataset_url.scheme:
        # Case 1
        raise ValueError('ERROR! A scheme-less dataset url ({}) is no longer supported. '
                         'Please prepend "file://" for local filesystem.'.format(self._dataset_url))

    elif self._parsed_dataset_url.scheme == 'file':
        # Case 2: definitely local
        self._filesystem = pyarrow.localfs
        self._filesystem_factory = lambda: pyarrow.localfs

    elif self._parsed_dataset_url.scheme == 'hdfs':

        if hdfs_driver == 'libhdfs3':
            # libhdfs3 does not do any namenode resolution itself so we do it manually. This is not
            # necessary if using libhdfs

            # Obtain singleton and force hadoop config evaluation
            namenode_resolver = HdfsNamenodeResolver(hadoop_configuration)

            # Since we can't tell for sure, first treat the URL as though it references a name service
            if self._parsed_dataset_url.netloc:
                # Case 3a: Use the portion of netloc before any port, which doesn't get lowercased
                nameservice = self._parsed_dataset_url.netloc.split(':')[0]
                namenodes = namenode_resolver.resolve_hdfs_name_service(nameservice)
                if namenodes:
                    self._filesystem = connector.connect_to_either_namenode(namenodes, user=user)
                    self._filesystem_factory = lambda: connector.connect_to_either_namenode(namenodes, user=user)
                if self._filesystem is None:
                    # Case 3a1: That didn't work; try the URL as a namenode host
                    self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, user=user)
                    self._filesystem_factory = \
                        lambda url=self._dataset_url, user=user: \
                        connector.hdfs_connect_namenode(urlparse(url), user=user)
            else:
                # Case 3b: No netloc, so let's try to connect to default namenode
                # HdfsNamenodeResolver will raise exception if it fails to connect.
                nameservice, namenodes = namenode_resolver.resolve_default_hdfs_service()
                filesystem = connector.connect_to_either_namenode(namenodes, user=user)
                self._filesystem_factory = lambda: connector.connect_to_either_namenode(namenodes, user=user)
                if filesystem is not None:
                    # Properly replace the parsed dataset URL once default namenode is confirmed
                    self._parsed_dataset_url = urlparse(
                        'hdfs://{}{}'.format(nameservice, self._parsed_dataset_url.path))
                    self._filesystem = filesystem
        else:
            self._filesystem = connector.hdfs_connect_namenode(self._parsed_dataset_url, hdfs_driver, user=user)
            self._filesystem_factory = \
                lambda url=self._dataset_url, user=user: \
                connector.hdfs_connect_namenode(urlparse(url), hdfs_driver, user=user)

    else:
        # Fallback to fsspec to handle any other schemes
        if not self._parsed_dataset_url.netloc:
            raise ValueError('URLs must be of the form {}://bucket/path'.format(self._parsed_dataset_url.scheme))

        storage_options = storage_options or {}
        protocol = self._parsed_dataset_url.scheme
        self._filesystem = fsspec.filesystem(protocol, **storage_options)
        self._filesystem_factory = lambda: fsspec.filesystem(protocol, **storage_options)
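# The fsspec fallback above simply forwards the URL scheme and ``storage_options`` to
# ``fsspec.filesystem``. The snippet below is a minimal, self-contained illustration
# using fsspec's in-memory filesystem, so it needs no cloud credentials; for a real
# bucket one would pass e.g. protocol "s3" or "gs" plus the corresponding storage
# options (an assumption that s3fs/gcsfs is installed for that backend).
import fsspec

storage_options = {}  # backend-specific kwargs, e.g. credentials (assumption: depends on the backend)
fs = fsspec.filesystem('memory', **storage_options)
fs.pipe('/demo/hello.txt', b'hello world')  # write a small object
print(fs.cat('/demo/hello.txt'))            # -> b'hello world'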
class HdfsNamenodeResolverTest(unittest.TestCase):
    def setUp(self):
        """Initializes a mock hadoop config and a namenode resolver instance, for convenience."""
        self._hadoop_configuration = MockHadoopConfiguration()
        self.suj = HdfsNamenodeResolver(self._hadoop_configuration)

    def test_default_hdfs_service_errors(self):
        """Check error cases with connecting to default namenode"""
        # No default yields RuntimeError
        with self.assertRaises(RuntimeError):
            self.suj.resolve_default_hdfs_service()
        # Bad default FS yields IOError
        self._hadoop_configuration.set('fs.defaultFS', 'invalidFS')
        with self.assertRaises(IOError):
            self.suj.resolve_default_hdfs_service()
        # Random FS host yields IOError
        self._hadoop_configuration.set('fs.defaultFS', 'hdfs://random')
        with self.assertRaises(IOError):
            self.suj.resolve_default_hdfs_service()
        # Valid FS host with no namenode defined yields IOError
        self._hadoop_configuration.set('fs.defaultFS', HC.FS_WARP_TURTLE)
        with self.assertRaises(IOError):
            self.suj.resolve_default_hdfs_service()

    def test_default_hdfs_service_typical(self):
        """Check typical cases resolving default namenode"""
        # One nn
        self._hadoop_configuration.set('fs.defaultFS', HC.FS_WARP_TURTLE)
        self._hadoop_configuration.set('dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), 'nn1')
        self._hadoop_configuration.set('dfs.namenode.rpc-address.{}.nn1'.format(HC.WARP_TURTLE), HC.WARP_TURTLE_NN1)
        nameservice, namenodes = self.suj.resolve_default_hdfs_service()
        self.assertEqual(HC.WARP_TURTLE, nameservice)
        self.assertEqual(HC.WARP_TURTLE_NN1, namenodes[0])
        # Second of two nns, when the first is undefined
        self._hadoop_configuration.set('dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), 'nn2,nn1')
        with self.assertRaises(RuntimeError):
            self.suj.resolve_default_hdfs_service()
        # Two valid and defined nns
        self._hadoop_configuration.set('dfs.namenode.rpc-address.{}.nn2'.format(HC.WARP_TURTLE), HC.WARP_TURTLE_NN2)
        nameservice, namenodes = self.suj.resolve_default_hdfs_service()
        self.assertEqual(HC.WARP_TURTLE, nameservice)
        self.assertEqual(HC.WARP_TURTLE_NN2, namenodes[0])
        self.assertEqual(HC.WARP_TURTLE_NN1, namenodes[1])

    def test_resolve_hdfs_name_service(self):
        """Check edge cases with resolving a nameservice"""
        # Most cases already covered by test_default_hdfs_service_typical above...
        # Empty config or no namespace yields None
        self.assertIsNone(HdfsNamenodeResolver({}).resolve_hdfs_name_service(''))
        self.assertIsNone(self.suj.resolve_hdfs_name_service(''))
        # Test a single undefined namenode case, as well as an unconventional multi-NN case;
        # both result in an exception raised
        self._hadoop_configuration.set('fs.defaultFS', HC.FS_WARP_TURTLE)
        self._hadoop_configuration.set('dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), 'nn1')
        with self.assertRaises(RuntimeError):
            self.suj.resolve_hdfs_name_service(HC.WARP_TURTLE)
        # Test multiple undefined NNs, which will also throw HdfsConnectError
        nns = 'nn1,nn2,nn3,nn4,nn5,nn6,nn7,nn8'
        self._hadoop_configuration.set('dfs.ha.namenodes.{}'.format(HC.WARP_TURTLE), nns)
        with self.assertRaises(RuntimeError):
            self.suj.resolve_hdfs_name_service(HC.WARP_TURTLE)
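# The mock configuration used in the tests above drives namenode resolution through the
# standard Hadoop HA keys: ``fs.defaultFS`` names the default nameservice,
# ``dfs.ha.namenodes.<nameservice>`` lists namenode ids, and
# ``dfs.namenode.rpc-address.<nameservice>.<id>`` maps each id to host:port. Below is a
# minimal, self-contained sketch of that lookup over a plain dict; the function name and
# sample values are illustrative only, and it omits the IOError/RuntimeError handling
# the tests exercise.
def resolve_namenodes(conf, nameservice):
    """Return the list of host:port namenode addresses configured for a nameservice."""
    ids = conf.get('dfs.ha.namenodes.{}'.format(nameservice), '')
    return [conf['dfs.namenode.rpc-address.{}.{}'.format(nameservice, nn_id)]
            for nn_id in ids.split(',') if nn_id]


sample_conf = {
    'fs.defaultFS': 'hdfs://warp-turtle',
    'dfs.ha.namenodes.warp-turtle': 'nn2,nn1',
    'dfs.namenode.rpc-address.warp-turtle.nn1': 'some.host.name:8020',
    'dfs.namenode.rpc-address.warp-turtle.nn2': 'other.host.name:8020',
}
print(resolve_namenodes(sample_conf, 'warp-turtle'))
# -> ['other.host.name:8020', 'some.host.name:8020']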