def __getitem__(self, obj):
    """'Indexing' functionality for the BigDataFrame

    Given a single object or list, the BDF will interpret it as a relational
    projection (i.e., a selection of columns).

    Given a tuple of length 2, the first element will be interpreted for row
    selection (i.e., predicate/filter/WHERE clause), while the second element
    will be interpreted as a projection.
    """
    # other select/filter fns should be implemented with this one
    if isinstance(obj, tuple) and len(obj) == 2:
        alias = _random_id('inline_', 4)
        table_ref = InlineView(self._query_ast.to_sql(), alias)
        (limit_elt, where) = self._query_ast._filter(obj[0])
        select_list = self._query_ast._projection(obj[1])
        return BigDataFrame(self._ic,
                            SelectStmt(select_list, table_ref, where=where,
                                       limit=limit_elt))
    elif isinstance(obj, list):
        alias = _random_id('inline_', 4)
        table_ref = InlineView(self._query_ast.to_sql(), alias)
        select_list = self._query_ast._projection(obj)
        return BigDataFrame(self._ic, SelectStmt(select_list, table_ref))
    else:
        # single object, possibly a slice; wrap in list and get projection
        return self[[obj]]
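# Hypothetical usage of the indexing forms above; `bdf` and the exact row
# predicate syntax are illustrative and depend on the surrounding BDF
# expression API, not confirmed by this snippet alone:
cols = bdf[['a', 'b']]        # projection only -> SELECT a, b
page = bdf[:100, ['a', 'b']]  # assumption: _filter maps a slice to a LIMIT,
                              # per the (limit_elt, where) pair it returns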
def schema(self):
    if self._schema is None:
        table_ref = InlineView(self._query_ast.to_sql(),
                               _random_id('inline_', 4))
        self._schema = _get_table_schema_hack(self._ic._cursor,
                                              table_ref.to_sql())
    return self._schema
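# A minimal sketch of the kind of probe _get_table_schema_hack plausibly
# performs (an assumption -- the real helper may differ): run a zero-row
# query against the inline view and read (name, type_code) pairs off
# cursor.description, whose first two DB-API fields are name and type_code.
def _get_table_schema_sketch(cursor, table_sql):
    cursor.execute('SELECT * FROM %s LIMIT 0' % table_sql)
    return [(col[0], col[1]) for col in cursor.description]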
def from_hdfs(ic, path, schema, table=None, overwrite=False,
              file_format='TEXTFILE', partition_schema=None,
              field_terminator='\t', line_terminator='\n',
              escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    File must be Impala-compatible.
    """
    if partition_schema is not None:
        raise NotImplementedError(
            "Partitions not yet implemented in .from_hdfs()")
    if table is None:
        temp_table = _random_id('tmp_table_', 8)
        table = "%s.%s" % (ic._temp_db, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    create_stmt = _create_table(table_name, schema, path=path,
                                file_format=file_format,
                                field_terminator=field_terminator,
                                line_terminator=line_terminator,
                                escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    return from_sql_table(ic, table_name.to_sql())
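# Hypothetical usage (the HDFS path and column types are placeholders).
# The (name, type) pair shape assumed here matches the zip(columns, types)
# schema that from_pandas builds below:
bdf = from_hdfs(ic, '/user/alice/data',
                schema=[('id', 'INT'), ('name', 'STRING')])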
def from_sql_query(ic, query, alias=None):
    """Create a BDF from a SQL query executed by Impala"""
    query_alias = alias if alias else _random_id('inline_', 4)
    table_ref = InlineView(query, query_alias)
    schema = _get_table_schema_hack(ic._cursor, table_ref.to_sql())
    select_list = tuple([SelectItem(expr=Literal(col))
                         for (col, ty) in schema])
    return BigDataFrame(ic, SelectStmt(select_list, table_ref))
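# Hypothetical usage: expose an arbitrary query as a lazy BDF without
# materializing it (database/table names are placeholders):
bdf = from_sql_query(ic, 'SELECT a, b FROM mydb.t WHERE a > 0')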
def store(self, path=None, table=None, file_format='TEXTFILE',
          field_terminator='\t', line_terminator='\n', escape_char='\\',
          overwrite=False):
    """Materialize the results and store them in HDFS.

    Functions as an EXTERNAL table. Implemented through a
    `CREATE TABLE AS SELECT`.
    """
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (self._temp_db, temp_table)
    if path is None:
        path = os.path.join(self._temp_dir, temp_table)
    table_name = _to_TableName(table)
    return self._store(path=path, table_name=table_name,
                       file_format=file_format,
                       field_terminator=field_terminator,
                       line_terminator=line_terminator,
                       escape_char=escape_char, overwrite=overwrite)
def store(self, path=None, table=None, file_format='TEXTFILE',
          field_terminator='\t', line_terminator='\n', escape_char='\\',
          overwrite=False):
    """Materialize the results and store them in HDFS.

    Implemented through a `CREATE TABLE AS SELECT`.
    """
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (self._temp_db, temp_table)
    if path is None:
        path = os.path.join(self._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        self._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    create_stmt = _create_table_as_select(table_name, path=path,
                                          file_format=file_format,
                                          field_terminator=field_terminator,
                                          line_terminator=line_terminator,
                                          escape_char=escape_char)
    query = create_stmt + self.to_sql()
    self._cursor.execute(query)
    return from_sql_table(self._ic, table_name.to_sql())
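# Hypothetical usage: persist the BDF's pending query as a new table via
# CTAS ('mydb.results' is a placeholder name):
stored = bdf.store(table='mydb.results', overwrite=True)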
class ImpalaConnectionTests(unittest.TestCase):
    table_prefix = _random_id(prefix='dbapi20test_')
    tablename = table_prefix + 'contests'

    def setUp(self):
        self.connection = None

    def tearDown(self):
        if self.connection:
            self.connection.close()

    def _execute_queries(self, con):
        ddl = """
            CREATE TABLE {0} (
              f1 INT,
              f2 INT)
            """.format(self.tablename)
        try:
            cur = con.cursor()
            cur.execute(ddl)
            con.commit()
            cur.execute('DROP TABLE {0}'.format(self.tablename))
            con.commit()
        except:
            raise

    def test_impala_nosasl_connect(self):
        self.connection = connect(ENV.host, ENV.port, timeout=5)
        self._execute_queries(self.connection)

    @pytest.mark.skipif(ENV.skip_hive_tests, reason="Skipping hive tests")
    def test_hive_plain_connect(self):
        self.connection = connect(ENV.host, ENV.hive_port,
                                  auth_mechanism="PLAIN", timeout=5,
                                  user=ENV.hive_user, password="******")
        self._execute_queries(self.connection)

    @pytest.mark.skipif(DEFAULT_AUTH, reason=DEFAULT_AUTH_ERROR)
    def test_impala_plain_connect(self):
        self.connection = connect(ENV.host, ENV.port,
                                  auth_mechanism="PLAIN", timeout=5,
                                  user=ENV.hive_user, password="******")
        self._execute_queries(self.connection)

    @pytest.mark.skipif(DEFAULT_AUTH, reason=DEFAULT_AUTH_ERROR)
    def test_hive_nosasl_connect(self):
        self.connection = connect(ENV.host, ENV.hive_port, timeout=5)
        self._execute_queries(self.connection)

    def test_bad_auth(self):
        """Test some simple error messages"""
        try:
            connect(ENV.host, ENV.port, auth_mechanism="foo")
            assert False, "should have got exception"
        except NotSupportedError as e:
            assert 'Unsupported authentication mechanism: FOO' in str(e)
def take(self, n):
    """Return `n` rows as a pandas `DataFrame`

    Distributed and no notion of order, so not guaranteed to be
    reproducible.
    """
    alias = _random_id('inline_', 4)
    table_ref = InlineView(self._query_ast.to_sql(), alias)
    # SELECT alias.*
    select_list = [SelectItem(table_name=TableName(table_ref.name))]
    limit_elt = LimitElement(Literal(n), None)
    ast = SelectStmt(select_list, table_ref, limit=limit_elt)
    bdf = BigDataFrame(self._ic, ast)
    return as_pandas(bdf.__iter__())
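# Hypothetical usage: pull a small, arbitrary (unordered) sample into a
# local pandas DataFrame:
sample = bdf.take(10)   # up to 10 rows, order not reproducible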
def from_pandas(ic, df, table=None, path=None, method='in_query',
                file_format='TEXTFILE', field_terminator='\t',
                line_terminator='\n', escape_char='\\', overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala

    path is the dir, not the filename
    """
    # TODO: this is not atomic
    assert isinstance(ic, ImpalaContext)
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    schema = zip(columns, types)
    create_stmt = _create_table(table_name, schema, path=path,
                                file_format=file_format,
                                field_terminator=field_terminator,
                                line_terminator=line_terminator,
                                escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        query = "INSERT INTO %s VALUES " % table_name.to_sql()
        query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row))
                            for row in df.values])
        ic._cursor.execute(query)
    elif method == 'webhdfs':
        if file_format != 'TEXTFILE':
            raise ValueError("only TEXTFILE format supported for webhdfs")
        if path is None:
            raise ValueError(
                "must supply a path for EXTERNAL table for webhdfs")
        hdfs_client = ic.hdfs_client()
        raw_data = StringIO()
        df.to_csv(raw_data, sep=field_terminator,
                  line_terminator=line_terminator, quoting=csv.QUOTE_NONE,
                  escapechar=escape_char, header=False, index=False)
        hdfs_client.write(os.path.join(path, 'data.txt'),
                          raw_data.getvalue(), overwrite=overwrite)
        raw_data.close()
    else:
        raise ValueError(
            "method must be 'in_query' or 'webhdfs'; got %s" % method)
    return from_sql_table(ic, table_name.to_sql())
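# Hypothetical usage: ship a small local frame into Impala. 'in_query'
# inlines every row into one INSERT statement, so 'webhdfs' is the better
# fit for anything beyond toy sizes:
import pandas as pd
local = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', 'b', 'c']})
bdf = from_pandas(ic, local, method='in_query')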
def __init__(self, temp_dir=None, temp_db=None, nn_host=None,
             webhdfs_port=50070, hdfs_user=None, *args, **kwargs):
    # args and kwargs get passed directly into impala.dbapi.connect()
    suffix = _random_id(length=8)
    self._temp_dir = ('/tmp/impyla-%s' % suffix if temp_dir is None
                      else temp_dir)
    self._temp_db = ('tmp_impyla_%s' % suffix if temp_db is None
                     else temp_db)
    self._conn = connect(*args, **kwargs)
    self._cursor = self._conn.cursor()
    # used for webhdfs cleanup of temp dir; not required
    self._nn_host = nn_host
    self._webhdfs_port = webhdfs_port
    self._hdfs_user = hdfs_user
    self._kerberized = self._conn.kerberized()
    if temp_db is None:
        self._cursor.execute("CREATE DATABASE %s LOCATION '%s'" %
                             (self._temp_db, self._temp_dir))
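# Hypothetical usage (host/port are placeholders): the remaining keyword
# arguments pass straight through to impala.dbapi.connect(), and the
# context owns a scratch database and HDFS dir for temp tables.
ic = ImpalaContext(host='impala-host', port=21050)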
class ImpalaDBAPI20Test(_dbapi20_tests.DatabaseAPI20Test):
    driver = impala.dbapi
    connect_kw_args = connect_kw_args
    table_prefix = _random_id(prefix='dbapi20test_')
    ddl1 = 'create table %sbooze (name string)' % table_prefix
    ddl2 = 'create table %sbarflys (name string)' % table_prefix
    xddl1 = 'drop table %sbooze' % table_prefix
    xddl2 = 'drop table %sbarflys' % table_prefix

    def test_nextset(self):
        pass

    def test_setoutputsize(self):
        pass

    @pytest.mark.skipif(protocol == 'beeswax',
                        reason='Beeswax messes up NULL')
    def test_None(self):
        return super(ImpalaDBAPI20Test, self).test_None()
def tmp_db():
    return _random_id('tmp_impyla_')
from __future__ import absolute_import, print_function

import pytest

import impala.dbapi
from impala.tests.util import ImpylaTestEnv
from impala.util import (_random_id, force_drop_impala_database,
                         force_drop_hive_database)

# must import the module, rather than the class, per comment in module
from impala.tests import _dbapi20_tests

ENV = ImpylaTestEnv()
tmp_db = _random_id('tmp_impyla_dbapi_')
hive = ENV.auth_mech == 'PLAIN'


@pytest.mark.connect
class ImpalaDBAPI20Test(_dbapi20_tests.DatabaseAPI20Test):
    driver = impala.dbapi
    connect_kw_args = {'host': ENV.host, 'port': ENV.port,
                       'auth_mechanism': ENV.auth_mech,
                       'database': tmp_db}
    ddl1 = 'create table {0}booze (name string)'.format(
        _dbapi20_tests.DatabaseAPI20Test.table_prefix)
    ddl2 = 'create table {0}barflys (name string)'.format(
        _dbapi20_tests.DatabaseAPI20Test.table_prefix)
""" from __future__ import absolute_import, print_function import pytest import impala.dbapi from impala.tests.util import ImpylaTestEnv from impala.util import _random_id, force_drop_database # must import the module, rather than the class, per comment in module from impala.tests import _dbapi20_tests ENV = ImpylaTestEnv() tmp_db = _random_id('tmp_impyla_dbapi_') @pytest.mark.connect class ImpalaDBAPI20Test(_dbapi20_tests.DatabaseAPI20Test): driver = impala.dbapi connect_kw_args = {'host': ENV.host, 'port': ENV.port, 'auth_mechanism': ENV.auth_mech, 'database': tmp_db} ddl1 = 'create table {0}booze (name string)'.format( _dbapi20_tests.DatabaseAPI20Test.table_prefix) ddl2 = 'create table {0}barflys (name string)'.format( _dbapi20_tests.DatabaseAPI20Test.table_prefix)
class ImpalaDecimalTests(unittest.TestCase):
    driver = impala.dbapi
    table_prefix = _random_id(prefix='dbapi20test_')
    tablename = table_prefix + 'decimaltests'

    def _connect(self):
        try:
            return self.driver.connect(**connect_kw_args)
        except AttributeError:
            self.fail("No connect method found in self.driver module")

    def setUp(self):
        ddl = """
            CREATE TABLE {0} (
              f1 decimal(10, 2),
              f2 decimal(7, 5),
              f3 decimal(38, 17))
            """.format(self.tablename)
        con = self._connect()
        try:
            cur = con.cursor()
            cur.execute(ddl)
            con.commit()
        except:
            raise
        finally:
            con.close()

    def tearDown(self):
        con = self._connect()
        try:
            cur = con.cursor()
            cur.execute('drop table {0}'.format(self.tablename))
            con.commit()
        except:
            raise
        finally:
            con.close()

    def test_cursor_description_precision_scale(self):
        # According to the DBAPI 2.0, these are the 7 fields of the cursor
        # description:
        # - name
        # - type_code
        # - display_size
        # - internal_size
        # - precision
        # - scale
        # - null_ok
        cases = [(10, 2), (7, 5), (38, 17)]
        con = self._connect()
        try:
            cur = con.cursor()
            cur.execute('select * from {0} limit 0'.format(self.tablename))
            desc = cur.description
            for (ex_p, ex_s), val in zip(cases, desc):
                assert val[4] == ex_p
                assert val[5] == ex_s
            con.commit()
        finally:
            con.close()