Example #1
def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    print(str(ENV))

    con = make_ibis_client()

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data and (overwrite or not is_data_loaded(con)):
        tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')
        try:
            if not data_dir:
                print('Did not specify a local dir with the test data, so '
                      'downloading it from S3')
                data_dir = dnload_ibis_test_data_from_s3(tmp_dir)
            upload_ibis_test_data_to_hdfs(con, data_dir)
            create_test_database(con)
            parquet_tables = create_parquet_tables(con)
            avro_tables = create_avro_tables(con)
            for table in parquet_tables + avro_tables:
                print('Computing stats for {0}'.format(table.op().name))
                table.compute_stats()
        finally:
            shutil.rmtree(tmp_dir)

    # build and upload the UDFs
    if udf and (overwrite or not is_udf_loaded(con)):
        build_udfs()
        upload_udfs(con)
Example #2
def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    con = make_ibis_client(ENV)

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data:
        load_impala_data(con, str(data_dir), overwrite)
    else:
        logger.info('Skipping Ibis test data load (--no-data)')

    # build and upload the UDFs
    if udf:
        already_loaded = is_udf_loaded(con)
        logger.info('Attempting to build and load test UDFs')
        if already_loaded and not overwrite:
            logger.info('UDFs already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                logger.info('UDFs already loaded; attempting to overwrite')
            logger.info('Building UDFs')
            build_udfs()
            logger.info('Uploading UDFs')
            upload_udfs(con)
    else:
        logger.info('Skipping UDF build/load (--no-udf)')
Example #3
def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    print(str(ENV))

    con = make_ibis_client()

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data:
        already_loaded = is_data_loaded(con)
        print('Attempting to load Ibis test data (--data)')
        if already_loaded and not overwrite:
            print('Data is already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                print('Data is already loaded; attempting to overwrite')
            tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')
            try:
                if not data_dir:
                    print('Did not specify a local dir with the test data, so '
                          'downloading it from S3')
                    data_dir = dnload_ibis_test_data_from_s3(tmp_dir)
                print('Uploading to HDFS')
                upload_ibis_test_data_to_hdfs(con, data_dir)
                print('Creating Ibis test data database')
                create_test_database(con)
                parquet_tables = create_parquet_tables(con)
                avro_tables = create_avro_tables(con)
                for table in parquet_tables + avro_tables:
                    print('Computing stats for {0}'.format(table.op().name))
                    table.compute_stats()

                # sqlite database
                sqlite_src = osp.join(data_dir, 'ibis_testing.db')
                shutil.copy(sqlite_src, '.')
            finally:
                shutil.rmtree(tmp_dir)
    else:
        print('Skipping Ibis test data load (--no-data)')

    # build and upload the UDFs
    if udf:
        already_loaded = is_udf_loaded(con)
        print('Attempting to build and load test UDFs')
        if already_loaded and not overwrite:
            print('UDFs already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                print('UDFs already loaded; attempting to overwrite')
            print('Building UDFs')
            build_udfs()
            print('Uploading UDFs')
            upload_udfs(con)
    else:
        print('Skipping UDF build/load (--no-udf)')
Example #4
def create(create_tarball, push_to_s3):
    """Create Ibis test data"""
    print(str(ENV))

    con = make_ibis_client()

    # verify some assumptions before proceeding
    if push_to_s3 and not create_tarball:
        raise IbisError(
            "Must specify --create-tarball if specifying --push-to-s3")
    if os.path.exists(IBIS_TEST_DATA_LOCAL_DIR):
        raise IbisError(
            'Local dir {} already exists; please remove it first'.format(
                IBIS_TEST_DATA_LOCAL_DIR))
    if not con.exists_database('tpch'):
        raise IbisError('`tpch` database does not exist')
    if not con.hdfs.exists('/test-warehouse/tpch.region_avro'):
        raise IbisError(
            'HDFS dir /test-warehouse/tpch.region_avro does not exist')

    # generate tmp identifiers
    tmp_db_hdfs_path = os.path.join(ENV.tmp_dir, guid())
    tmp_db = guid()
    os.mkdir(IBIS_TEST_DATA_LOCAL_DIR)
    try:
        # create the tmp data locally
        con.create_database(tmp_db, path=tmp_db_hdfs_path)
        print('Created database {} at {}'.format(tmp_db, tmp_db_hdfs_path))

        # create the local data set
        scrape_parquet_files(tmp_db, con)
        download_parquet_files(con, tmp_db_hdfs_path)
        download_avro_files(con)
        generate_csv_files()

        # Only populate SQLite here
        engines = [get_sqlite_engine()]
        load_sql_databases(con, engines)
    finally:
        con.drop_database(tmp_db, force=True)
        assert not con.hdfs.exists(tmp_db_hdfs_path)

    if create_tarball:
        check_call('tar -zc {} > {}'.format(IBIS_TEST_DATA_LOCAL_DIR,
                                            TARBALL_NAME),
                   shell=True)

    if push_to_s3:
        import boto
        s3_conn = boto.connect_s3(IBIS_TEST_AWS_KEY_ID, IBIS_TEST_AWS_SECRET)
        bucket = s3_conn.get_bucket(IBIS_TEST_DATA_S3_BUCKET)
        # copy_tarball_to_versioned_backup(bucket)
        key = bucket.new_key(IBIS_TEST_DATA_TARBALL)
        print('Upload tarball to S3')
        key.set_contents_from_filename(TARBALL_NAME, replace=True)
Example #5
def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    print(str(ENV))

    con = make_ibis_client()

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data:
        tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')
        try:
            if not data_dir:
                # TODO(wesm): do not download if already downloaded
                print('Did not specify a local dir with the test data, so '
                      'downloading it from S3')
                data_dir = dnload_ibis_test_data_from_s3(tmp_dir)
            load_impala_data(con, data_dir, overwrite)

            # sqlite database
            print('Setting up SQLite')
            sqlite_src = os.path.join(data_dir, 'ibis_testing.db')
            shutil.copy(sqlite_src, '.')

            print('Loading SQL engines')
            # SQL engines
            engines = [get_postgres_engine()]
            load_sql_databases(con, engines)
        finally:
            shutil.rmtree(tmp_dir)
    else:
        print('Skipping Ibis test data load (--no-data)')

    # build and upload the UDFs
    if udf:
        already_loaded = is_udf_loaded(con)
        print('Attempting to build and load test UDFs')
        if already_loaded and not overwrite:
            print('UDFs already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                print('UDFs already loaded; attempting to overwrite')
            print('Building UDFs')
            build_udfs()
            print('Uploading UDFs')
            upload_udfs(con)
    else:
        print('Skipping UDF build/load (--no-udf)')
Example #6
def create(create_tarball, push_to_s3):
    """Create Ibis test data"""
    print(str(ENV))

    con = make_ibis_client()

    # verify some assumptions before proceeding
    if push_to_s3 and not create_tarball:
        raise IbisError(
            "Must specify --create-tarball if specifying --push-to-s3")
    if osp.exists(IBIS_TEST_DATA_LOCAL_DIR):
        raise IbisError(
            'Local dir {0} already exists; please remove it first'.format(
                IBIS_TEST_DATA_LOCAL_DIR))
    if not con.exists_database('tpch'):
        raise IbisError('`tpch` database does not exist')
    if not con.hdfs.exists('/test-warehouse/tpch.region_avro'):
        raise IbisError(
            'HDFS dir /test-warehouse/tpch.region_avro does not exist')

    # generate tmp identifiers
    tmp_db_hdfs_path = pjoin(ENV.tmp_dir, guid())
    tmp_db = guid()
    os.mkdir(IBIS_TEST_DATA_LOCAL_DIR)
    try:
        # create the tmp data locally
        con.create_database(tmp_db, path=tmp_db_hdfs_path)
        print('Created database {0} at {1}'.format(tmp_db, tmp_db_hdfs_path))

        # create the local data set
        scrape_parquet_files(con)
        download_parquet_files(con, tmp_db_hdfs_path)
        download_avro_files(con)
        generate_csv_files()
    finally:
        con.drop_database(tmp_db, force=True)
        assert not con.hdfs.exists(tmp_db_hdfs_path)

    if create_tarball:
        # create (not extract) the tarball from the local data directory
        check_call('tar -zc {0} > {1}'.format(IBIS_TEST_DATA_LOCAL_DIR,
                                              IBIS_TEST_DATA_TARBALL),
                   shell=True)

    if push_to_s3:
        from boto.s3 import connect_to_region
        s3_conn = connect_to_region('us-west-2')
        bucket = s3_conn.get_bucket(IBIS_TEST_DATA_S3_BUCKET)
        copy_tarball_to_versioned_backup(bucket)
        key = bucket.new_key(IBIS_TEST_DATA_TARBALL)
        print('Upload tarball to S3')
        key.set_contents_from_filename(IBIS_TEST_DATA_TARBALL, replace=False)
Example #7
def hdfs_connect(host='localhost',
                 port=50070,
                 protocol='webhdfs',
                 use_https='default',
                 auth_mechanism='NOSASL',
                 verify=True,
                 **kwds):
    """
    Connect to HDFS

    Parameters
    ----------
    host : string, Host name of the HDFS NameNode
    port : int, NameNode's WebHDFS port (default 50070)
    protocol : {'webhdfs'}
    use_https : boolean or 'default', default 'default'
        Connect to WebHDFS over HTTPS, otherwise plain HTTP. When left at
        'default', HTTPS is used for secure authentication (GSSAPI/LDAP)
        and plain HTTP otherwise.
    auth_mechanism : string, Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify : boolean, Set to False to turn off verifying SSL certificates.
        (default True)

    Other keywords are forwarded to hdfs library classes

    Returns
    -------
    client : WebHDFS
    """
    import requests
    session = kwds.setdefault('session', requests.Session())
    session.verify = verify
    if auth_mechanism in ['GSSAPI', 'LDAP']:
        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'
        try:
            import requests_kerberos
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`.")
        from hdfs.ext.kerberos import KerberosClient
        # note SSL
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        hdfs_client = InsecureClient(url, **kwds)
    return WebHDFS(hdfs_client)
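
A minimal usage sketch for the function above. It assumes an older Ibis release that still exposes hdfs_connect at the top level and a NameNode at the placeholder address; neither client constructor contacts the cluster until a request is made.

import ibis

# Plain HTTP, NOSASL (non-secure cluster); host/port are placeholder assumptions.
hdfs = ibis.hdfs_connect(host='localhost', port=50070)

# Kerberos-secured cluster over HTTPS; requires `pip install requests-kerberos`.
# secure_hdfs = ibis.hdfs_connect(host='namenode.example.com', port=50070,
#                                 auth_mechanism='GSSAPI', use_https=True)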
Example #8
    def schema(self):
        """
        Get the schema for this table (if one is known)

        Returns
        -------
        schema : Schema
        """
        if not self._is_materialized():
            raise IbisError('Table operation is not yet materialized')
        return self.op().get_schema()
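
A small self-contained sketch of the happy path, assuming only that the ibis library is installed: an unbound table built with ibis.table() has a known schema, so schema() returns it instead of raising IbisError.

import ibis

# Unbound table expression with an explicit schema; no backend connection needed.
t = ibis.table([('key', 'string'), ('value', 'double')], name='t')
print(t.schema())  # backends raise IbisError here when the op is not materialized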
Example #9
def timedelta(days=None,
              hours=None,
              minutes=None,
              seconds=None,
              milliseconds=None,
              microseconds=None,
              nanoseconds=None,
              weeks=None):
    """
    Generic API for creating a fixed size timedelta

    Parameters
    ----------
    days : int, default None
    weeks : int, default None
    hours : int, default None
    minutes : int, default None
    seconds : int, default None
    milliseconds : int, default None
    microseconds : int, default None
    nanoseconds : int, default None

    Notes
    -----
    For potentially non-fixed-length timedeltas (like year, month, etc.), use
    the corresponding named API (e.g. ibis.month).

    Returns
    -------
    delta : TimeIncrement (Timedelta)
    """
    out = {'result': None}

    def _apply(klass, n):
        if not n:
            return
        offset = klass(n)
        delta = out['result']
        out['result'] = delta + offset if delta else offset

    _apply(Week, weeks)
    _apply(Day, days)
    _apply(Hour, hours)
    _apply(Minute, minutes)
    _apply(Second, seconds)
    _apply(Millisecond, milliseconds)
    _apply(Microsecond, microseconds)
    _apply(Nanosecond, nanoseconds)

    result = out['result']
    if not result:
        raise IbisError('Must pass some offset parameter')

    return result
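
A standalone sketch of the same accumulation pattern, using datetime.timedelta in place of the Ibis offset classes (a substitution made purely for illustration):

from datetime import timedelta

def make_delta(days=None, hours=None, minutes=None, seconds=None):
    result = None
    for unit, n in (('days', days), ('hours', hours),
                    ('minutes', minutes), ('seconds', seconds)):
        if not n:
            continue  # skip units that were not passed (or passed as 0)
        offset = timedelta(**{unit: n})
        result = result + offset if result else offset
    if not result:
        raise ValueError('Must pass some offset parameter')
    return result

print(make_delta(days=1, hours=2))  # 1 day, 2:00:00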
Example #10
def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    print(str(ENV))

    con = make_ibis_client()

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data:
        tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')
        try:
            load_impala_data(con, data_dir, overwrite)
        finally:
            shutil.rmtree(tmp_dir)
    else:
        print('Skipping Ibis test data load (--no-data)')

    # build and upload the UDFs
    if udf:
        already_loaded = is_udf_loaded(con)
        print('Attempting to build and load test UDFs')
        if already_loaded and not overwrite:
            print('UDFs already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                print('UDFs already loaded; attempting to overwrite')
            print('Building UDFs')
            build_udfs()
            print('Uploading UDFs')
            upload_udfs(con)
    else:
        print('Skipping UDF build/load (--no-udf)')
Example #11
def hdfs_connect(host='localhost',
                 port=50070,
                 protocol='webhdfs',
                 use_kerberos=False,
                 verify=True,
                 **kwds):
    """
    Connect to HDFS

    Parameters
    ----------
    host : string
    port : int, default 50070 (webhdfs default)
    protocol : {'webhdfs'}
    use_kerberos : boolean, default False
    verify : boolean, default True
        Set to False to turn off verifying SSL certificates

    Other keywords are forwarded to hdfs library classes

    Returns
    -------
    client : ibis HDFS client
    """
    if use_kerberos:
        try:
            import requests_kerberos
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`.")
        from hdfs.ext.kerberos import KerberosClient
        url = 'https://{0}:{1}'.format(host, port)  # note SSL
        hdfs_client = KerberosClient(url,
                                     mutual_auth='OPTIONAL',
                                     verify=verify,
                                     **kwds)
    else:
        from hdfs.client import InsecureClient
        url = 'http://{0}:{1}'.format(host, port)
        hdfs_client = InsecureClient(url, verify=verify, **kwds)
    return WebHDFS(hdfs_client)
Example #12
    def convert(self, n, from_unit, to_unit):
        i = self.ranks[from_unit]
        j = self.ranks[to_unit]

        if i == j:
            return n

        factors = self.conv_factors[min(i, j) + 1: max(i, j) + 1]
        factor = 1
        for x in factors:
            factor *= x

        if j < i:
            if n % factor:
                raise IbisError('{0} is not a multiple of {1}'.format(n,
                                                                      factor))
            return n // factor  # exact integer division; remainder checked above
        else:
            return n * factor
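
The method above walks the conversion factors between two unit ranks and either multiplies (to a finer unit) or divides exactly (to a coarser unit). A standalone sketch with hypothetical rank and factor tables for seconds/milliseconds/microseconds:

# Hypothetical tables: ranks order units coarse -> fine; conv_factors[i] is the
# factor from rank i-1 to rank i.
ranks = {'s': 0, 'ms': 1, 'us': 2}
conv_factors = [1, 1000, 1000]

def convert(n, from_unit, to_unit):
    i, j = ranks[from_unit], ranks[to_unit]
    if i == j:
        return n
    factor = 1
    for x in conv_factors[min(i, j) + 1: max(i, j) + 1]:
        factor *= x
    if j < i:  # converting to a coarser unit must divide evenly
        if n % factor:
            raise ValueError('{0} is not a multiple of {1}'.format(n, factor))
        return n // factor
    return n * factor

print(convert(3, 's', 'us'))     # 3000000
print(convert(5000, 'ms', 's'))  # 5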
Example #13
    def _check_connected(self):
        if not self.is_connected:
            raise IbisError('Please first connect to a Kudu cluster '
                            'with client.kudu.connect')
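
A self-contained sketch of the same guard pattern with a hypothetical client class (the class and method names are illustrative, not Ibis API):

class FakeKuduClient:
    """Illustrative stand-in for the real Kudu client wrapper."""

    def __init__(self):
        self.is_connected = False

    def _check_connected(self):
        if not self.is_connected:
            raise RuntimeError('Please first connect to a Kudu cluster '
                               'with client.kudu.connect')

    def list_tables(self):
        self._check_connected()  # guard every method that needs a live connection
        return []

client = FakeKuduClient()
try:
    client.list_tables()
except RuntimeError as err:
    print(err)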
Example #14
def hdfs_connect(
    host='localhost',
    port=50070,
    protocol='webhdfs',
    use_https='default',
    auth_mechanism='NOSASL',
    verify=True,
    session=None,
    **kwds,
):
    """Connect to HDFS.

    Parameters
    ----------
    host : str
        Host name of the HDFS NameNode
    port : int
        NameNode's WebHDFS port
    protocol : str
        The protocol used to communicate with HDFS. The only valid value is
        ``'webhdfs'``.
    use_https : bool
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
        authentication, the default for this is True, otherwise False.
    auth_mechanism : str
        Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify : bool
        Set to :data:`False` to turn off verifying SSL certificates.
    session : Optional[requests.Session]
        A custom :class:`requests.Session` object.

    Notes
    -----
    Other keywords are forwarded to HDFS library classes.

    Returns
    -------
    WebHDFS

    """
    import requests

    if session is None:
        session = requests.Session()
    session.verify = verify
    if auth_mechanism in ('GSSAPI', 'LDAP'):
        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'
        try:
            import requests_kerberos  # noqa: F401
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`."
            )
        from hdfs.ext.kerberos import KerberosClient

        # note SSL
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, session=session, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient

        url = '{}://{}:{}'.format(prefix, host, port)
        hdfs_client = InsecureClient(url, session=session, **kwds)
    return WebHDFS(hdfs_client)
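
A usage sketch for this variant, which accepts an explicit requests.Session. It assumes the hdfs_connect shown above is importable (its module path varies across Ibis versions); the NameNode address and the custom header are placeholders.

import requests

session = requests.Session()
session.headers['User-Agent'] = 'ibis-test-data-admin'  # illustrative only

client = hdfs_connect(
    host='namenode.example.com',
    port=50070,
    auth_mechanism='NOSASL',
    verify=True,
    session=session,
)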