Ejemplo n.º 1
0
    def __getitem__(self, obj):
        """Index into the BigDataFrame.

        A single object or list is interpreted as a relational projection
        (i.e., a selection of columns).

        A 2-tuple is interpreted as (row selection, projection): the first
        element supplies the predicate/filter/WHERE clause and the second
        the column projection.
        """
        # All other select/filter helpers should funnel through this method.
        if isinstance(obj, tuple) and len(obj) == 2:
            (predicate, projection) = obj
            view = InlineView(self._query_ast.to_sql(), _random_id('inline_', 4))
            (limit_elt, where) = self._query_ast._filter(predicate)
            select_list = self._query_ast._projection(projection)
            stmt = SelectStmt(select_list, view, where=where, limit=limit_elt)
            return BigDataFrame(self._ic, stmt)
        if isinstance(obj, list):
            view = InlineView(self._query_ast.to_sql(), _random_id('inline_', 4))
            return BigDataFrame(
                self._ic, SelectStmt(self._query_ast._projection(obj), view))
        # Single object, possibly a slice; wrap in a list and project.
        return self[[obj]]
Ejemplo n.º 2
0
    def __getitem__(self, obj):
        """'Indexing' for the BigDataFrame.

        A single object or list selects columns (a relational projection).

        A tuple of length 2 is split into a row selection (predicate /
        filter / WHERE clause) from its first element and a projection
        from its second.
        """
        # Other select/filter functions should be implemented via this one.
        if isinstance(obj, tuple) and len(obj) == 2:
            inline_alias = _random_id('inline_', 4)
            inline = InlineView(self._query_ast.to_sql(), inline_alias)
            limit_elt, where_clause = self._query_ast._filter(obj[0])
            projection = self._query_ast._projection(obj[1])
            new_ast = SelectStmt(projection,
                                 inline,
                                 where=where_clause,
                                 limit=limit_elt)
            return BigDataFrame(self._ic, new_ast)
        elif isinstance(obj, list):
            inline_alias = _random_id('inline_', 4)
            inline = InlineView(self._query_ast.to_sql(), inline_alias)
            projection = self._query_ast._projection(obj)
            return BigDataFrame(self._ic, SelectStmt(projection, inline))
        else:
            # Single object, possibly a slice: delegate to the list case.
            return self[[obj]]
Ejemplo n.º 3
0
Archivo: bdf.py Proyecto: cgc17/impyla
 def schema(self):
     """Return the result schema, computing and caching it on first use."""
     if self._schema is None:
         view = InlineView(self._query_ast.to_sql(), _random_id('inline_', 4))
         self._schema = _get_table_schema_hack(self._ic._cursor, view.to_sql())
     return self._schema
Ejemplo n.º 4
0
def from_hdfs(ic,
              path,
              schema,
              table=None,
              overwrite=False,
              file_format='TEXTFILE',
              partition_schema=None,
              field_terminator='\t',
              line_terminator='\n',
              escape_char='\\'):
    """Create a BDF backed by an external, Impala-compatible file in HDFS."""
    if partition_schema is not None:
        raise NotImplementedError(
            "Partitions not yet implemented in .from_hdfs()")
    if table is None:
        # Invent a table name in the context's scratch database.
        table = "%s.%s" % (ic._temp_db, _random_id('tmp_table_', 8))
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    ic._cursor.execute(
        _create_table(table_name, schema, path=path,
                      file_format=file_format,
                      field_terminator=field_terminator,
                      line_terminator=line_terminator,
                      escape_char=escape_char))
    return from_sql_table(ic, table_name.to_sql())
Ejemplo n.º 5
0
def from_sql_query(ic, query, alias=None):
    """Create a BDF from a SQL query executed by Impala.

    Parameters: `ic` is the ImpalaContext, `query` the SQL text, and
    `alias` an optional inline-view alias (a random one is generated
    when omitted).
    """
    query_alias = alias if alias else _random_id('inline_', 4)
    table_ref = InlineView(query, query_alias)
    schema = _get_table_schema_hack(ic._cursor, table_ref.to_sql())
    # Feed tuple() a generator instead of a throwaway list; `_` marks the
    # column type as deliberately unused.
    select_list = tuple(
        SelectItem(expr=Literal(col)) for (col, _) in schema)
    return BigDataFrame(ic, SelectStmt(select_list, table_ref))
Ejemplo n.º 6
0
    def store(self,
              path=None,
              table=None,
              file_format='TEXTFILE',
              field_terminator='\t',
              line_terminator='\n',
              escape_char='\\',
              overwrite=False):
        """Materialize the results into HDFS; functions as an EXTERNAL table.

        Implemented through a `CREATE TABLE AS SELECT`.
        """
        tmp_name = _random_id('tmp_table_', 8)
        if table is None:
            table = '%s.%s' % (self._temp_db, tmp_name)
        if path is None:
            path = os.path.join(self._temp_dir, tmp_name)
        # Delegate the CTAS mechanics to the shared _store helper.
        return self._store(path=path,
                           table_name=_to_TableName(table),
                           file_format=file_format,
                           field_terminator=field_terminator,
                           line_terminator=line_terminator,
                           escape_char=escape_char,
                           overwrite=overwrite)
Ejemplo n.º 7
0
 def schema(self):
     """Lazily compute and cache the schema of this query's result set."""
     if self._schema is None:
         inline = InlineView(self._query_ast.to_sql(),
                             _random_id('inline_', 4))
         self._schema = _get_table_schema_hack(self._ic._cursor,
                                               inline.to_sql())
     return self._schema
Ejemplo n.º 8
0
    def store(self,
              path=None,
              table=None,
              file_format='TEXTFILE',
              field_terminator='\t',
              line_terminator='\n',
              escape_char='\\',
              overwrite=False):
        """Materialize the results and store them in HDFS.

        Implemented through a `CREATE TABLE AS SELECT`.
        """
        scratch = _random_id('tmp_table_', 8)
        if table is None:
            table = '%s.%s' % (self._temp_db, scratch)
        if path is None:
            path = os.path.join(self._temp_dir, scratch)
        target = _to_TableName(table)
        if overwrite:
            self._cursor.execute("DROP TABLE IF EXISTS %s" %
                                 target.to_sql())
        # Build the CTAS prefix, then append this frame's SELECT.
        ctas_prefix = _create_table_as_select(
            target,
            path=path,
            file_format=file_format,
            field_terminator=field_terminator,
            line_terminator=line_terminator,
            escape_char=escape_char)
        self._cursor.execute(ctas_prefix + self.to_sql())
        return from_sql_table(self._ic, target.to_sql())
Ejemplo n.º 9
0
class ImpalaConnectionTests(unittest.TestCase):
    """Smoke tests for connecting to Impala/Hive under various auth setups."""

    table_prefix = _random_id(prefix='dbapi20test_')
    tablename = table_prefix + 'contests'

    def setUp(self):
        self.connection = None

    def tearDown(self):
        # Close whatever connection the test opened, if any.
        if self.connection:
            self.connection.close()

    def _execute_queries(self, con):
        """Create and drop a small table to prove the connection works."""
        ddl = """
            CREATE TABLE {0} (
              f1 INT,
              f2 INT)
        """.format(self.tablename)
        # The original wrapped this in a bare `try: ... except: raise`,
        # which is a no-op: exceptions propagate identically without it.
        cur = con.cursor()
        cur.execute(ddl)
        con.commit()
        cur.execute('DROP TABLE {0}'.format(self.tablename))
        con.commit()

    def test_impala_nosasl_connect(self):
        self.connection = connect(ENV.host, ENV.port, timeout=5)
        self._execute_queries(self.connection)

    @pytest.mark.skipif(ENV.skip_hive_tests, reason="Skipping hive tests")
    def test_hive_plain_connect(self):
        self.connection = connect(ENV.host, ENV.hive_port,
                                  auth_mechanism="PLAIN",
                                  timeout=5,
                                  user=ENV.hive_user,
                                  password="******")
        self._execute_queries(self.connection)

    @pytest.mark.skipif(DEFAULT_AUTH, reason=DEFAULT_AUTH_ERROR)
    def test_impala_plain_connect(self):
        self.connection = connect(ENV.host, ENV.port, auth_mechanism="PLAIN",
                                  timeout=5,
                                  user=ENV.hive_user,
                                  password="******")
        self._execute_queries(self.connection)

    @pytest.mark.skipif(DEFAULT_AUTH, reason=DEFAULT_AUTH_ERROR)
    def test_hive_nosasl_connect(self):
        self.connection = connect(ENV.host, ENV.hive_port, timeout=5)
        self._execute_queries(self.connection)

    def test_bad_auth(self):
        """Test some simple error messages"""
        try:
            connect(ENV.host, ENV.port, auth_mechanism="foo")
            assert False, "should have got exception"
        except NotSupportedError as e:
            assert 'Unsupported authentication mechanism: FOO' in str(e)
Ejemplo n.º 10
0
def from_sql_query(ic, query, alias=None):
    """Create a BDF from a SQL query executed by Impala.

    `alias` optionally names the inline view; a random alias is generated
    when omitted.
    """
    query_alias = alias if alias else _random_id('inline_', 4)
    table_ref = InlineView(query, query_alias)
    schema = _get_table_schema_hack(ic._cursor, table_ref.to_sql())
    # Generator expression avoids building an intermediate list; `_`
    # marks the unused column-type element of each schema pair.
    select_list = tuple(
        SelectItem(expr=Literal(col)) for (col, _) in schema)
    return BigDataFrame(ic, SelectStmt(select_list, table_ref))
Ejemplo n.º 11
0
class ImpalaConnectionTests(unittest.TestCase):
    """Smoke tests for connecting to Impala/Hive under various auth setups."""

    table_prefix = _random_id(prefix='dbapi20test_')
    tablename = table_prefix + 'contests'

    def setUp(self):
        self.connection = None

    def tearDown(self):
        # Close whatever connection the test opened, if any.
        if self.connection:
            self.connection.close()

    def _execute_queries(self, con):
        """Create and drop a small table to prove the connection works."""
        ddl = """
            CREATE TABLE {0} (
              f1 INT,
              f2 INT)
        """.format(self.tablename)
        # The original wrapped this in a bare `try: ... except: raise`,
        # which is a no-op: exceptions propagate identically without it.
        cur = con.cursor()
        cur.execute(ddl)
        con.commit()
        cur.execute('DROP TABLE {0}'.format(self.tablename))
        con.commit()

    def test_impala_nosasl_connect(self):
        self.connection = connect(ENV.host, ENV.port, timeout=5)
        self._execute_queries(self.connection)

    @pytest.mark.skipif(ENV.skip_hive_tests, reason="Skipping hive tests")
    def test_hive_plain_connect(self):
        self.connection = connect(ENV.host,
                                  ENV.hive_port,
                                  auth_mechanism="PLAIN",
                                  timeout=5,
                                  user=ENV.hive_user,
                                  password="******")
        self._execute_queries(self.connection)

    @pytest.mark.skipif(DEFAULT_AUTH, reason=DEFAULT_AUTH_ERROR)
    def test_impala_plain_connect(self):
        self.connection = connect(ENV.host,
                                  ENV.port,
                                  auth_mechanism="PLAIN",
                                  timeout=5,
                                  user=ENV.hive_user,
                                  password="******")
        self._execute_queries(self.connection)

    @pytest.mark.skipif(DEFAULT_AUTH, reason=DEFAULT_AUTH_ERROR)
    def test_hive_nosasl_connect(self):
        self.connection = connect(ENV.host, ENV.hive_port, timeout=5)
        self._execute_queries(self.connection)
Ejemplo n.º 12
0
    def take(self, n):
        """Return `n` rows as a pandas `DataFrame`.

        Execution is distributed with no notion of order, so the result is
        not guaranteed to be reproducible.
        """
        view = InlineView(self._query_ast.to_sql(), _random_id('inline_', 4))
        # SELECT <alias>.* ... LIMIT n
        star = SelectItem(table_name=TableName(view.name))
        stmt = SelectStmt([star], view, limit=LimitElement(Literal(n), None))
        return as_pandas(iter(BigDataFrame(self._ic, stmt)))
Ejemplo n.º 13
0
def from_pandas(ic, df, table=None, path=None, method='in_query',
                file_format='TEXTFILE', field_terminator='\t',
                line_terminator='\n', escape_char='\\', overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala.

    `path` is the directory for the backing files, not a filename.
    `method` is 'in_query' (inline INSERT ... VALUES) or 'webhdfs'
    (CSV upload over WebHDFS; TEXTFILE only).
    """
    # TODO: this is not atomic
    assert isinstance(ic, ImpalaContext)
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    # BUG FIX: on Python 3, zip() is a one-shot iterator; materialize it so
    # the schema survives being iterated more than once downstream.
    schema = list(zip(columns, types))
    create_stmt = _create_table(table_name, schema, path=path,
                                file_format=file_format,
                                field_terminator=field_terminator,
                                line_terminator=line_terminator,
                                escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        # Inline every row into one INSERT ... VALUES statement.
        query = "INSERT INTO %s VALUES " % table_name.to_sql()
        query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row))
                            for row in df.values])
        ic._cursor.execute(query)
    elif method == 'webhdfs':
        if file_format != 'TEXTFILE':
            raise ValueError("only TEXTFILE format supported for webhdfs")
        if path is None:
            # NOTE(review): unreachable — `path` is always assigned above
            # when None; kept only as a defensive guard.
            raise ValueError(
                "must supply a path for EXTERNAL table for webhdfs")
        hdfs_client = ic.hdfs_client()
        raw_data = StringIO()
        df.to_csv(raw_data, sep=field_terminator,
                  line_terminator=line_terminator, quoting=csv.QUOTE_NONE,
                  escapechar=escape_char, header=False, index=False)
        hdfs_client.write(
            os.path.join(path, 'data.txt'), raw_data.getvalue(),
            overwrite=overwrite)
        raw_data.close()
    else:
        raise ValueError(
            "method must be 'in_query' or 'webhdfs'; got %s" % method)
    return from_sql_table(ic, table_name.to_sql())
Ejemplo n.º 14
0
    def take(self, n):
        """Return `n` rows as a pandas `DataFrame`.

        Distributed execution has no notion of order, so the rows returned
        are not guaranteed to be reproducible.
        """
        inline_alias = _random_id('inline_', 4)
        inline = InlineView(self._query_ast.to_sql(), inline_alias)
        # Build: SELECT <alias>.* FROM (<query>) <alias> LIMIT n
        projection = [SelectItem(table_name=TableName(inline.name))]
        limited = SelectStmt(projection, inline,
                             limit=LimitElement(Literal(n), None))
        frame = BigDataFrame(self._ic, limited)
        return as_pandas(frame.__iter__())
Ejemplo n.º 15
0
Archivo: bdf.py Proyecto: cgc17/impyla
    def store(self, path=None, table=None, file_format='TEXTFILE',
              field_terminator='\t', line_terminator='\n', escape_char='\\',
              overwrite=False):
        """Materialize the results into HDFS; functions as an EXTERNAL table.

        Implemented through a `CREATE TABLE AS SELECT`.
        """
        tmp = _random_id('tmp_table_', 8)
        if table is None:
            table = '%s.%s' % (self._temp_db, tmp)
        if path is None:
            path = os.path.join(self._temp_dir, tmp)
        # Delegate the CTAS mechanics to the shared _store helper.
        return self._store(path=path,
                           table_name=_to_TableName(table),
                           file_format=file_format,
                           field_terminator=field_terminator,
                           line_terminator=line_terminator,
                           escape_char=escape_char,
                           overwrite=overwrite)
Ejemplo n.º 16
0
 def __init__(self, temp_dir=None, temp_db=None, nn_host=None,
              webhdfs_port=50070, hdfs_user=None, *args, **kwargs):
     """Set up an Impala context with scratch dir/db for temporaries.

     Extra args and kwargs are passed directly into impala.dbapi.connect().
     """
     suffix = _random_id(length=8)
     # Parentheses make the original operator precedence explicit: the
     # random suffix is formatted in only when no override was supplied.
     self._temp_dir = ('/tmp/impyla-%s' % suffix) if temp_dir is None else temp_dir
     self._temp_db = ('tmp_impyla_%s' % suffix) if temp_db is None else temp_db
     self._conn = connect(*args, **kwargs)
     self._cursor = self._conn.cursor()
     # used for pywebhdfs cleanup of temp dir; not required
     self._nn_host = nn_host
     self._webhdfs_port = webhdfs_port
     self._hdfs_user = hdfs_user
     # Only create the scratch database when we invented its name ourselves.
     if temp_db is None:
         self._cursor.execute("CREATE DATABASE %s LOCATION '%s'" %
                              (self._temp_db, self._temp_dir))
Ejemplo n.º 17
0
 def __init__(self, temp_dir=None, temp_db=None, nn_host=None,
              webhdfs_port=50070, hdfs_user=None, *args, **kwargs):
     """Set up an Impala context with scratch dir/db for temporaries.

     Extra args and kwargs are passed directly into impala.dbapi.connect().
     """
     suffix = _random_id(length=8)
     # BUG FIX: previously `'%s' % (x if cond else y)` formatted the WHOLE
     # conditional, so a caller-supplied temp_dir/temp_db was mangled into
     # '/tmp/impyla-<temp_dir>' / 'tmp_impyla_<temp_db>'. Only the random
     # suffix must be formatted; an explicit override is used verbatim.
     self._temp_dir = ('/tmp/impyla-%s' % suffix
                       if temp_dir is None else temp_dir)
     self._temp_db = ('tmp_impyla_%s' % suffix
                      if temp_db is None else temp_db)
     self._conn = connect(*args, **kwargs)
     self._cursor = self._conn.cursor()
     # used for webhdfs cleanup of temp dir; not required
     self._nn_host = nn_host
     self._webhdfs_port = webhdfs_port
     self._hdfs_user = hdfs_user
     self._kerberized = self._conn.kerberized()
     # Only create the scratch database when we invented its name ourselves.
     if temp_db is None:
         self._cursor.execute("CREATE DATABASE %s LOCATION '%s'" %
                              (self._temp_db, self._temp_dir))
Ejemplo n.º 18
0
class ImpalaDBAPI20Test(_dbapi20_tests.DatabaseAPI20Test):
    """DB-API 2.0 compliance suite, parameterized for the impala driver."""

    driver = impala.dbapi
    connect_kw_args = connect_kw_args

    # Unique prefix so concurrent runs don't collide on table names.
    table_prefix = _random_id(prefix='dbapi20test_')
    ddl1 = 'create table %sbooze (name string)' % table_prefix
    ddl2 = 'create table %sbarflys (name string)' % table_prefix
    xddl1 = 'drop table %sbooze' % table_prefix
    xddl2 = 'drop table %sbarflys' % table_prefix

    # Overridden to a no-op to skip the inherited nextset test.
    def test_nextset(self):
        pass

    # Overridden to a no-op to skip the inherited setoutputsize test.
    def test_setoutputsize(self):
        pass

    @pytest.mark.skipif(protocol == 'beeswax', reason='Beeswax messes up NULL')
    def test_None(self):
        return super(ImpalaDBAPI20Test, self).test_None()
Ejemplo n.º 19
0
def from_hdfs(ic, path, schema, table=None, overwrite=False,
              file_format='TEXTFILE', partition_schema=None,
              field_terminator='\t', line_terminator='\n', escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    File must be Impala-compatible.
    """
    if partition_schema is not None:
        raise NotImplementedError("Partitions not yet implemented in .from_hdfs()")
    if table is None:
        # Invent a table name in the context's scratch database.
        scratch = _random_id('tmp_table_', 8)
        table = '%s.%s' % (ic._temp_db, scratch)
    target = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % target.to_sql())
    ddl = _create_table(target, schema, path=path,
                        file_format=file_format,
                        field_terminator=field_terminator,
                        line_terminator=line_terminator,
                        escape_char=escape_char)
    ic._cursor.execute(ddl)
    return from_sql_table(ic, target.to_sql())
Ejemplo n.º 20
0
    def store(self, path=None, table=None, file_format='TEXTFILE',
              field_terminator='\t', line_terminator='\n',
              escape_char='\\', overwrite=False):
        """Materialize the results and store them in HDFS.

        Implemented through a `CREATE TABLE AS SELECT`.
        """
        scratch = _random_id('tmp_table_', 8)
        if table is None:
            table = '%s.%s' % (self._temp_db, scratch)
        if path is None:
            path = os.path.join(self._temp_dir, scratch)
        target = _to_TableName(table)
        if overwrite:
            self._cursor.execute(
                "DROP TABLE IF EXISTS %s" % target.to_sql())
        # Build the CTAS prefix, then append this frame's SELECT.
        ctas_prefix = _create_table_as_select(
            target, path=path, file_format=file_format,
            field_terminator=field_terminator,
            line_terminator=line_terminator, escape_char=escape_char)
        self._cursor.execute(ctas_prefix + self.to_sql())
        return from_sql_table(self._ic, target.to_sql())
Ejemplo n.º 21
0
def tmp_db():
    """Generate a unique name for a temporary database."""
    return _random_id('tmp_impyla_')
Ejemplo n.º 22
0
def tmp_db():
    """Return a fresh, unique temporary-database name."""
    return _random_id("tmp_impyla_")

from __future__ import absolute_import, print_function

import pytest

import impala.dbapi
from impala.tests.util import ImpylaTestEnv
from impala.util import (
    _random_id, force_drop_impala_database, force_drop_hive_database)
# must import the module, rather than the class, per comment in module
from impala.tests import _dbapi20_tests


# Connection settings (host/port/auth) for the test cluster.
ENV = ImpylaTestEnv()
# Unique scratch database name so concurrent test runs don't collide.
tmp_db = _random_id('tmp_impyla_dbapi_')
# Flag derived from the auth mechanism; presumably marks a Hive-style
# (PLAIN-auth) endpoint — TODO confirm against usage elsewhere.
hive = ENV.auth_mech == 'PLAIN'


@pytest.mark.connect
class ImpalaDBAPI20Test(_dbapi20_tests.DatabaseAPI20Test):
    """DB-API 2.0 compliance suite run against a live Impala connection."""

    driver = impala.dbapi
    connect_kw_args = {'host': ENV.host,
                       'port': ENV.port,
                       'auth_mechanism': ENV.auth_mech,
                       'database': tmp_db}

    # DDL uses the base suite's table_prefix so table names are unique.
    ddl1 = 'create table {0}booze (name string)'.format(
        _dbapi20_tests.DatabaseAPI20Test.table_prefix)
    ddl2 = 'create table {0}barflys (name string)'.format(
        _dbapi20_tests.DatabaseAPI20Test.table_prefix)
Ejemplo n.º 24
0
"""


from __future__ import absolute_import, print_function

import pytest

import impala.dbapi
from impala.tests.util import ImpylaTestEnv
from impala.util import _random_id, force_drop_database
# must import the module, rather than the class, per comment in module
from impala.tests import _dbapi20_tests


# Connection settings (host/port/auth) for the test cluster.
ENV = ImpylaTestEnv()
# Unique scratch database name so concurrent test runs don't collide.
tmp_db = _random_id('tmp_impyla_dbapi_')


@pytest.mark.connect
class ImpalaDBAPI20Test(_dbapi20_tests.DatabaseAPI20Test):
    """DB-API 2.0 compliance suite run against a live Impala connection."""

    driver = impala.dbapi
    connect_kw_args = {'host': ENV.host,
                       'port': ENV.port,
                       'auth_mechanism': ENV.auth_mech,
                       'database': tmp_db}

    # DDL uses the base suite's table_prefix so table names are unique.
    ddl1 = 'create table {0}booze (name string)'.format(
        _dbapi20_tests.DatabaseAPI20Test.table_prefix)
    ddl2 = 'create table {0}barflys (name string)'.format(
        _dbapi20_tests.DatabaseAPI20Test.table_prefix)
Ejemplo n.º 25
0
class ImpalaDecimalTests(unittest.TestCase):
    """Verifies DECIMAL precision/scale reporting in cursor.description."""

    driver = impala.dbapi
    table_prefix = _random_id(prefix='dbapi20test_')
    tablename = table_prefix + 'decimaltests'

    def _connect(self):
        """Open a fresh connection via the configured driver."""
        try:
            return self.driver.connect(**connect_kw_args)
        except AttributeError:
            self.fail("No connect method found in self.driver module")

    def setUp(self):
        ddl = """
            CREATE TABLE {0} (
              f1 decimal(10, 2),
              f2 decimal(7, 5),
              f3 decimal(38, 17))
        """.format(self.tablename)
        con = self._connect()
        # The original `except: raise` clause was a no-op; try/finally
        # alone guarantees the connection is closed on any outcome.
        try:
            cur = con.cursor()
            cur.execute(ddl)
            con.commit()
        finally:
            con.close()

    def tearDown(self):
        con = self._connect()
        try:
            cur = con.cursor()
            cur.execute('drop table {0}'.format(self.tablename))
            con.commit()
        finally:
            con.close()

    def test_cursor_description_precision_scale(self):
        # According to the DBAPI 2.0, these are the 7 fields of the cursor
        # description:
        # name, type_code, display_size, internal_size, precision, scale,
        # null_ok — precision is index 4, scale is index 5.
        cases = [(10, 2), (7, 5), (38, 17)]

        con = self._connect()
        try:
            cur = con.cursor()
            cur.execute('select * from {0} limit 0'.format(self.tablename))

            desc = cur.description
            for (ex_p, ex_s), val in zip(cases, desc):
                assert val[4] == ex_p
                assert val[5] == ex_s

            con.commit()
        finally:
            con.close()