Ejemplo n.º 1
0
def load(data, udf, data_dir, overwrite):
    """Populate the test environment with Ibis data and, optionally, UDFs.

    Parameters
    ----------
    data : bool
        When truthy, the test data under ``data_dir`` is loaded; otherwise
        the data load is skipped and a message is logged.
    udf : bool
        When truthy, the test UDFs are built and uploaded unless they are
        already loaded and ``overwrite`` is falsy.
    data_dir : path-like
        Location of the test data; converted with ``str`` before use.
    overwrite : bool
        Forwarded to the data loader, and also decides whether UDFs that
        are already loaded get rebuilt and re-uploaded.

    Raises
    ------
    IbisError
        If the HDFS write check fails, or if ``udf`` is requested but the
        UDF build check fails.
    """
    client = make_ibis_client(ENV)

    # Fail fast: both checks run before any loading or building starts.
    if not can_write_to_hdfs(client):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf:
        if not can_build_udfs():
            raise IbisError(
                'Build environment does not support building UDFs')

    # Step 1: test data.
    if not data:
        logger.info('Skipping Ibis test data load (--no-data)')
    else:
        load_impala_data(client, str(data_dir), overwrite)

    # Step 2: UDFs.
    if not udf:
        logger.info('Skipping UDF build/load (--no-udf)')
        return

    udfs_present = is_udf_loaded(client)
    logger.info('Attempting to build and load test UDFs')

    if udfs_present and not overwrite:
        logger.info('UDFs already loaded and not overwriting; moving on')
        return

    if udfs_present:
        logger.info('UDFs already loaded; attempting to overwrite')

    logger.info('Building UDFs')
    build_udfs()
    logger.info('Uploading UDFs')
    upload_udfs(client)
Ejemplo n.º 2
0
    def _find_backend(self) -> BaseBackend:
        """Resolve the one backend to use for this object.

        The candidates come from ``self._find_backends()``. With exactly
        one candidate, that candidate is returned. With none, the
        configured ``config.options.default_backend`` is returned instead.

        Raises
        ------
        IbisError
            If there are no candidates and no default backend is set.
        ValueError
            If more than one candidate backend is found.
        """
        found = self._find_backends()

        if found:
            if len(found) > 1:
                raise ValueError('Multiple backends found')
            return found[0]

        # No backend referenced: fall back to the configured default.
        fallback = config.options.default_backend
        if fallback is not None:
            return fallback
        raise IbisError(
            'Expression depends on no backends, and found no default')
Ejemplo n.º 3
0
def hdfs_connect(host='localhost',
                 port=50070,
                 protocol='webhdfs',
                 use_https='default',
                 auth_mechanism='NOSASL',
                 verify=True,
                 session=None,
                 **kwds):
    """Build a WebHDFS client for an HDFS NameNode.

    Parameters
    ----------
    host : str
        Host name of the HDFS NameNode.
    port : int
        NameNode's WebHDFS port.
    protocol : str
        The protocol used to communicate with HDFS. The only valid value is
        ``'webhdfs'``. The argument is not read by this function.
    use_https : bool or ``'default'``
        Connect to WebHDFS with HTTPS when truthy, otherwise plain HTTP.
        With ``'default'``, HTTPS is used for the ``GSSAPI`` and ``LDAP``
        mechanisms and HTTP for every other mechanism.
    auth_mechanism : str
        Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify : bool
        Set to :data:`False` to turn off verifying SSL certificates. The
        value is assigned to the session's ``verify`` attribute, including
        on a session supplied by the caller.
    session : Optional[requests.Session]
        A custom :class:`requests.Session` object. A new session is created
        when this is ``None``.

    Notes
    -----
    Other keywords are forwarded to HDFS library classes. For the
    ``GSSAPI``/``LDAP`` mechanisms, ``mutual_auth`` defaults to
    ``'OPTIONAL'`` unless given explicitly.

    Returns
    -------
    WebHDFS

    Raises
    ------
    IbisError
        If a ``GSSAPI``/``LDAP`` mechanism is requested but
        ``requests_kerberos`` cannot be imported.

    """
    import requests

    http_session = requests.Session() if session is None else session
    http_session.verify = verify

    kerberized = auth_mechanism in ('GSSAPI', 'LDAP')

    # An explicit use_https wins; 'default' follows the auth mechanism.
    if use_https == 'default':
        scheme = 'https' if kerberized else 'http'
    elif use_https:
        scheme = 'https'
    else:
        scheme = 'http'

    if not kerberized:
        from hdfs.client import InsecureClient

        url = '{}://{}:{}'.format(scheme, host, port)
        return WebHDFS(InsecureClient(url, session=http_session, **kwds))

    # Check for the optional Kerberos dependency up front so that a missing
    # package is reported with installation instructions.
    try:
        import requests_kerberos  # noqa: F401
    except ImportError:
        raise IbisError(
            "Unable to import requests-kerberos, which is required for "
            "Kerberos HDFS support. Install it by executing `pip install "
            "requests-kerberos` or `pip install hdfs[kerberos]`.")
    from hdfs.ext.kerberos import KerberosClient

    url = '{}://{}:{}'.format(scheme, host, port)
    kwds.setdefault('mutual_auth', 'OPTIONAL')
    return WebHDFS(KerberosClient(url, session=http_session, **kwds))
Ejemplo n.º 4
0
 def _check_connected(self):
     """Raise ``IbisError`` unless ``self.is_connected`` is truthy."""
     if self.is_connected:
         return
     message = ('Please first connect to a Kudu cluster '
                'with client.kudu.connect')
     raise IbisError(message)