def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    print(str(ENV))
    con = make_ibis_client()

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data and (overwrite or not is_data_loaded(con)):
        # create the temp dir before entering the try block so the finally
        # clause always has a valid path to remove
        tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')
        try:
            if not data_dir:
                print('Did not specify a local dir with the test data, so '
                      'downloading it from S3')
                data_dir = dnload_ibis_test_data_from_s3(tmp_dir)
            upload_ibis_test_data_to_hdfs(con, data_dir)
            create_test_database(con)
            parquet_tables = create_parquet_tables(con)
            avro_tables = create_avro_tables(con)
            for table in parquet_tables + avro_tables:
                print('Computing stats for {0}'.format(table.op().name))
                table.compute_stats()
        finally:
            shutil.rmtree(tmp_dir)

    # build and upload the UDFs
    if udf and (overwrite or not is_udf_loaded(con)):
        build_udfs()
        upload_udfs(con)
def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    con = make_ibis_client(ENV)

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data:
        load_impala_data(con, str(data_dir), overwrite)
    else:
        logger.info('Skipping Ibis test data load (--no-data)')

    # build and upload the UDFs
    if udf:
        already_loaded = is_udf_loaded(con)
        logger.info('Attempting to build and load test UDFs')
        if already_loaded and not overwrite:
            logger.info('UDFs already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                logger.info('UDFs already loaded; attempting to overwrite')
            logger.info('Building UDFs')
            build_udfs()
            logger.info('Uploading UDFs')
            upload_udfs(con)
    else:
        logger.info('Skipping UDF build/load (--no-udf)')
def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    print(str(ENV))
    con = make_ibis_client()

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data:
        already_loaded = is_data_loaded(con)
        print('Attempting to load Ibis test data (--data)')
        if already_loaded and not overwrite:
            print('Data is already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                print('Data is already loaded; attempting to overwrite')
            tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')
            try:
                if not data_dir:
                    print('Did not specify a local dir with the test data, so '
                          'downloading it from S3')
                    data_dir = dnload_ibis_test_data_from_s3(tmp_dir)
                print('Uploading to HDFS')
                upload_ibis_test_data_to_hdfs(con, data_dir)
                print('Creating Ibis test data database')
                create_test_database(con)
                parquet_tables = create_parquet_tables(con)
                avro_tables = create_avro_tables(con)
                for table in parquet_tables + avro_tables:
                    print('Computing stats for {0}'.format(table.op().name))
                    table.compute_stats()

                # sqlite database
                sqlite_src = osp.join(data_dir, 'ibis_testing.db')
                shutil.copy(sqlite_src, '.')
            finally:
                shutil.rmtree(tmp_dir)
    else:
        print('Skipping Ibis test data load (--no-data)')

    # build and upload the UDFs
    if udf:
        already_loaded = is_udf_loaded(con)
        print('Attempting to build and load test UDFs')
        if already_loaded and not overwrite:
            print('UDFs already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                print('UDFs already loaded; attempting to overwrite')
            print('Building UDFs')
            build_udfs()
            print('Uploading UDFs')
            upload_udfs(con)
    else:
        print('Skipping UDF build/load (--no-udf)')
def create(create_tarball, push_to_s3):
    """Create Ibis test data"""
    print(str(ENV))
    con = make_ibis_client()

    # verify some assumptions before proceeding
    if push_to_s3 and not create_tarball:
        raise IbisError(
            "Must specify --create-tarball if specifying --push-to-s3")
    if os.path.exists(IBIS_TEST_DATA_LOCAL_DIR):
        raise IbisError(
            'Local dir {} already exists; please remove it first'.format(
                IBIS_TEST_DATA_LOCAL_DIR))
    if not con.exists_database('tpch'):
        raise IbisError('`tpch` database does not exist')
    if not con.hdfs.exists('/test-warehouse/tpch.region_avro'):
        raise IbisError(
            'HDFS dir /test-warehouse/tpch.region_avro does not exist')

    # generate tmp identifiers
    tmp_db_hdfs_path = os.path.join(ENV.tmp_dir, guid())
    tmp_db = guid()
    os.mkdir(IBIS_TEST_DATA_LOCAL_DIR)
    try:
        # create the tmp data locally
        con.create_database(tmp_db, path=tmp_db_hdfs_path)
        print('Created database {} at {}'.format(tmp_db, tmp_db_hdfs_path))

        # create the local data set
        scrape_parquet_files(tmp_db, con)
        download_parquet_files(con, tmp_db_hdfs_path)
        download_avro_files(con)
        generate_csv_files()

        # Only populate SQLite here
        engines = [get_sqlite_engine()]
        load_sql_databases(con, engines)
    finally:
        con.drop_database(tmp_db, force=True)
        assert not con.hdfs.exists(tmp_db_hdfs_path)

    if create_tarball:
        check_call('tar -zc {} > {}'.format(IBIS_TEST_DATA_LOCAL_DIR,
                                            TARBALL_NAME),
                   shell=True)

    if push_to_s3:
        import boto
        s3_conn = boto.connect_s3(IBIS_TEST_AWS_KEY_ID,
                                  IBIS_TEST_AWS_SECRET)
        bucket = s3_conn.get_bucket(IBIS_TEST_DATA_S3_BUCKET)
        # copy_tarball_to_versioned_backup(bucket)
        key = bucket.new_key(IBIS_TEST_DATA_TARBALL)
        print('Upload tarball to S3')
        key.set_contents_from_filename(TARBALL_NAME, replace=True)
def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    print(str(ENV))
    con = make_ibis_client()

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data:
        tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')

        if not data_dir:
            # TODO(wesm): do not download if already downloaded
            print('Did not specify a local dir with the test data, so '
                  'downloading it from S3')
            data_dir = dnload_ibis_test_data_from_s3(tmp_dir)

        try:
            load_impala_data(con, data_dir, overwrite)

            # sqlite database
            print('Setting up SQLite')
            sqlite_src = os.path.join(data_dir, 'ibis_testing.db')
            shutil.copy(sqlite_src, '.')

            print('Loading SQL engines')
            # SQL engines
            engines = [get_postgres_engine()]
            load_sql_databases(con, engines)
        finally:
            shutil.rmtree(tmp_dir)
    else:
        print('Skipping Ibis test data load (--no-data)')

    # build and upload the UDFs
    if udf:
        already_loaded = is_udf_loaded(con)
        print('Attempting to build and load test UDFs')
        if already_loaded and not overwrite:
            print('UDFs already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                print('UDFs already loaded; attempting to overwrite')
            print('Building UDFs')
            build_udfs()
            print('Uploading UDFs')
            upload_udfs(con)
    else:
        print('Skipping UDF build/load (--no-udf)')
def create(create_tarball, push_to_s3):
    """Create Ibis test data"""
    print(str(ENV))
    con = make_ibis_client()

    # verify some assumptions before proceeding
    if push_to_s3 and not create_tarball:
        raise IbisError(
            "Must specify --create-tarball if specifying --push-to-s3")
    if osp.exists(IBIS_TEST_DATA_LOCAL_DIR):
        raise IbisError(
            'Local dir {0} already exists; please remove it first'.format(
                IBIS_TEST_DATA_LOCAL_DIR))
    if not con.exists_database('tpch'):
        raise IbisError('`tpch` database does not exist')
    if not con.hdfs.exists('/test-warehouse/tpch.region_avro'):
        raise IbisError(
            'HDFS dir /test-warehouse/tpch.region_avro does not exist')

    # generate tmp identifiers
    tmp_db_hdfs_path = pjoin(ENV.tmp_dir, guid())
    tmp_db = guid()
    os.mkdir(IBIS_TEST_DATA_LOCAL_DIR)
    try:
        # create the tmp data locally
        con.create_database(tmp_db, path=tmp_db_hdfs_path)
        print('Created database {0} at {1}'.format(tmp_db, tmp_db_hdfs_path))

        # create the local data set
        scrape_parquet_files(con)
        download_parquet_files(con, tmp_db_hdfs_path)
        download_avro_files(con)
        generate_csv_files()
    finally:
        con.drop_database(tmp_db, force=True)
        assert not con.hdfs.exists(tmp_db_hdfs_path)

    if create_tarball:
        # create (not extract) the tarball from the local data directory
        check_call('tar -czf {0} {1}'.format(IBIS_TEST_DATA_TARBALL,
                                             IBIS_TEST_DATA_LOCAL_DIR),
                   shell=True)

    if push_to_s3:
        from boto.s3 import connect_to_region
        s3_conn = connect_to_region('us-west-2')
        bucket = s3_conn.get_bucket(IBIS_TEST_DATA_S3_BUCKET)
        copy_tarball_to_versioned_backup(bucket)
        key = bucket.new_key(IBIS_TEST_DATA_TARBALL)
        print('Upload tarball to S3')
        key.set_contents_from_filename(IBIS_TEST_DATA_TARBALL, replace=False)
def hdfs_connect(host='localhost', port=50070, protocol='webhdfs',
                 use_https='default', auth_mechanism='NOSASL',
                 verify=True, **kwds):
    """
    Connect to HDFS

    Parameters
    ----------
    host : string
        Host name of the HDFS NameNode
    port : int
        NameNode's WebHDFS port (default 50070)
    protocol : {'webhdfs'}
    use_https : boolean, default 'default'
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
        authentication, the default for this is True, otherwise False
    auth_mechanism : string
        Set to NOSASL or PLAIN for non-secure clusters. Set to GSSAPI or LDAP
        for Kerberos-secured clusters.
    verify : boolean, default True
        Set to False to turn off verifying SSL certificates

    Other keywords are forwarded to hdfs library classes

    Returns
    -------
    client : WebHDFS
    """
    import requests
    session = kwds.setdefault('session', requests.Session())
    session.verify = verify
    if auth_mechanism in ['GSSAPI', 'LDAP']:
        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'
        try:
            import requests_kerberos
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`.")
        from hdfs.ext.kerberos import KerberosClient
        # note SSL
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        hdfs_client = InsecureClient(url, **kwds)
    return WebHDFS(hdfs_client)
def schema(self):
    """
    Get the schema for this table (if one is known)

    Returns
    -------
    schema : Schema
    """
    if not self._is_materialized():
        raise IbisError('Table operation is not yet materialized')
    return self.op().get_schema()
def timedelta(days=None, hours=None, minutes=None, seconds=None,
              milliseconds=None, microseconds=None, nanoseconds=None,
              weeks=None):
    """
    Generic API for creating a fixed size timedelta

    Parameters
    ----------
    days : int, default None
    weeks : int, default None
    hours : int, default None
    minutes : int, default None
    seconds : int, default None
    milliseconds : int, default None
    microseconds : int, default None
    nanoseconds : int, default None

    Notes
    -----
    For potentially non-fixed-length timedeltas (like year, month, etc.), use
    the corresponding named API (e.g. ibis.month).

    Returns
    -------
    delta : TimeIncrement (Timedelta)
    """
    out = {'result': None}

    def _apply(klass, n):
        if not n:
            return
        offset = klass(n)
        delta = out['result']
        out['result'] = delta + offset if delta else offset

    _apply(Week, weeks)
    _apply(Day, days)
    _apply(Hour, hours)
    _apply(Minute, minutes)
    _apply(Second, seconds)
    _apply(Millisecond, milliseconds)
    _apply(Microsecond, microseconds)
    _apply(Nanosecond, nanoseconds)

    result = out['result']
    if not result:
        raise IbisError('Must pass some offset parameter')

    return result
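# Usage sketch for the timedelta factory above (hypothetical values; assumes
# the Week/Day/Hour offset classes compose via ``+`` as in ``_apply``):
#
#   delta = timedelta(days=2, hours=3)  # equivalent to Day(2) + Hour(3)
#   week = timedelta(weeks=1)           # a single Week(1) offset
#   timedelta()                         # raises IbisError: no offset given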
def load(data, udf, data_dir, overwrite):
    """Load Ibis test data and build/upload UDFs"""
    print(str(ENV))
    con = make_ibis_client()

    # validate our environment before performing possibly expensive operations
    if not can_write_to_hdfs(con):
        raise IbisError('Failed to write to HDFS; check your settings')
    if udf and not can_build_udfs():
        raise IbisError('Build environment does not support building UDFs')

    # load the data files
    if data:
        tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')
        try:
            load_impala_data(con, data_dir, overwrite)
        finally:
            shutil.rmtree(tmp_dir)
    else:
        print('Skipping Ibis test data load (--no-data)')

    # build and upload the UDFs
    if udf:
        already_loaded = is_udf_loaded(con)
        print('Attempting to build and load test UDFs')
        if already_loaded and not overwrite:
            print('UDFs already loaded and not overwriting; moving on')
        else:
            if already_loaded:
                print('UDFs already loaded; attempting to overwrite')
            print('Building UDFs')
            build_udfs()
            print('Uploading UDFs')
            upload_udfs(con)
    else:
        print('Skipping UDF build/load (--no-udf)')
def hdfs_connect(host='localhost', port=50070, protocol='webhdfs',
                 use_kerberos=False, verify=True, **kwds):
    """
    Connect to HDFS

    Parameters
    ----------
    host : string
    port : int, default 50070 (webhdfs default)
    protocol : {'webhdfs'}
    use_kerberos : boolean, default False
    verify : boolean, default True
        Set to False to turn off verifying SSL certificates

    Other keywords are forwarded to hdfs library classes

    Returns
    -------
    client : ibis HDFS client
    """
    if use_kerberos:
        try:
            import requests_kerberos
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`.")
        from hdfs.ext.kerberos import KerberosClient
        url = 'https://{0}:{1}'.format(host, port)  # note SSL
        hdfs_client = KerberosClient(url, mutual_auth='OPTIONAL',
                                     verify=verify, **kwds)
    else:
        from hdfs.client import InsecureClient
        url = 'http://{0}:{1}'.format(host, port)
        hdfs_client = InsecureClient(url, verify=verify, **kwds)
    return WebHDFS(hdfs_client)
def convert(self, n, from_unit, to_unit):
    i = self.ranks[from_unit]
    j = self.ranks[to_unit]

    if i == j:
        return n

    # multiply together the per-step conversion factors between the two ranks
    factors = self.conv_factors[min(i, j) + 1: max(i, j) + 1]
    factor = 1
    for x in factors:
        factor *= x

    if j < i:
        # converting to a coarser unit: the value must divide evenly
        if n % factor:
            raise IbisError('{0} is not a multiple of {1}'.format(n, factor))
        return n / factor
    else:
        # converting to a finer unit
        return n * factor
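# Minimal standalone sketch of the rank/conversion-factor scheme that the
# ``convert`` method above relies on. The unit names, ranks, and factors here
# are hypothetical illustrations, not taken from the original class.
class _UnitConverterSketch:
    # ranks order units from coarse to fine; conv_factors[i] is how many
    # rank-i units fit in one unit of rank i - 1
    ranks = {'hour': 0, 'minute': 1, 'second': 2}
    conv_factors = [1, 60, 60]

    def convert(self, n, from_unit, to_unit):
        i = self.ranks[from_unit]
        j = self.ranks[to_unit]
        if i == j:
            return n
        factor = 1
        for x in self.conv_factors[min(i, j) + 1: max(i, j) + 1]:
            factor *= x
        if j < i:
            # to a coarser unit: the value must be an exact multiple
            if n % factor:
                raise ValueError('{0} is not a multiple of {1}'.format(n, factor))
            return n // factor
        return n * factor


# _UnitConverterSketch().convert(2, 'hour', 'second')     -> 7200
# _UnitConverterSketch().convert(7200, 'second', 'hour')  -> 2
# _UnitConverterSketch().convert(90, 'second', 'minute')  -> raises ValueError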
def _check_connected(self):
    if not self.is_connected:
        raise IbisError('Please first connect to a Kudu cluster '
                        'with client.kudu.connect')
def hdfs_connect(
    host='localhost',
    port=50070,
    protocol='webhdfs',
    use_https='default',
    auth_mechanism='NOSASL',
    verify=True,
    session=None,
    **kwds,
):
    """Connect to HDFS.

    Parameters
    ----------
    host : str
        Host name of the HDFS NameNode
    port : int
        NameNode's WebHDFS port
    protocol : str
        The protocol used to communicate with HDFS. The only valid value is
        ``'webhdfs'``.
    use_https : bool
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
        authentication, the default for this is True, otherwise False.
    auth_mechanism : str
        Set to NOSASL or PLAIN for non-secure clusters. Set to GSSAPI or LDAP
        for Kerberos-secured clusters.
    verify : bool
        Set to :data:`False` to turn off verifying SSL certificates.
    session : Optional[requests.Session]
        A custom :class:`requests.Session` object.

    Notes
    -----
    Other keywords are forwarded to HDFS library classes.

    Returns
    -------
    WebHDFS
    """
    import requests

    if session is None:
        session = requests.Session()
    session.verify = verify

    if auth_mechanism in ('GSSAPI', 'LDAP'):
        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'
        try:
            import requests_kerberos  # noqa: F401
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`."
            )
        from hdfs.ext.kerberos import KerberosClient

        # note SSL
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, session=session, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient

        url = '{}://{}:{}'.format(prefix, host, port)
        hdfs_client = InsecureClient(url, session=session, **kwds)
    return WebHDFS(hdfs_client)
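# Usage sketch for hdfs_connect above. The host names and ports are
# hypothetical, and a reachable WebHDFS endpoint is assumed:
#
#   # plain HTTP against an unsecured cluster
#   hdfs = hdfs_connect(host='namenode.example.com', port=50070)
#
#   # Kerberos-secured cluster over HTTPS (requires requests-kerberos)
#   hdfs = hdfs_connect(host='namenode.example.com', port=50470,
#                       auth_mechanism='GSSAPI', use_https=True)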