def test_from_pandas_in_query(ic):
    """Round-trip a small DataFrame through Impala via the 'in_query' method.

    Builds a 3-row frame, ships it with ``from_pandas(..., method='in_query')``,
    collects it back, and checks columns, shape, and cell values survive.
    """
    df1 = pd.DataFrame({'a': (1, 2, 5), 'b': ('foo', 'bar', 'pasta')})
    bdf = from_pandas(ic, df1, method='in_query')
    df2 = bdf.collect()
    assert tuple(df2.columns) == ('a', 'b')
    assert df2.shape == df1.shape
    # BUG FIX: the original `all(df1 == df2)` iterated the comparison
    # DataFrame, which yields its *column labels* ('a', 'b') — non-empty
    # strings that are always truthy — so the value check passed vacuously.
    # Reduce the boolean frame over both axes to actually compare cells.
    assert (df1 == df2).all().all()
def test_from_pandas_webhdfs(ic):
    """Round-trip a small DataFrame through Impala via the 'webhdfs' method.

    Same round-trip as the 'in_query' test, but uploads the data through
    WebHDFS into a directory under the context's temp dir.
    """
    df1 = pd.DataFrame({'a': (1, 2, 5), 'b': ('foo', 'bar', 'pasta')})
    path = os.path.join(ic._temp_dir, 'test_pandas_webhdfs_dir')
    bdf = from_pandas(ic, df1, method='webhdfs', path=path)
    df2 = bdf.collect()
    assert tuple(df2.columns) == ('a', 'b')
    assert df2.shape == df1.shape
    # BUG FIX: `all(df1 == df2)` iterated the comparison DataFrame's column
    # labels (always-truthy strings), so it could never fail. Reduce over
    # both axes to genuinely compare values.
    assert (df1 == df2).all().all()
def test_context_cleanup(host, port, protocol, use_kerberos, ic, hdfs_client):
    """Check ImpalaContext creates, then tears down, its temp db and HDFS dir.

    Uses the HdfsCLI-style ``hdfs_client`` fixture (``status``/``list`` API).

    NOTE(review): a second test with this exact name (the pywebhdfs variant)
    appears later in this file; under pytest collection the later definition
    shadows this one — consider renaming one of them.
    """
    # create a *new* ImpalaContext so we can observe its full lifecycle
    ctx = ImpalaContext(temp_dir=None, temp_db=None, nn_host=ic._nn_host,
                        webhdfs_port=ic._webhdfs_port,
                        hdfs_user=ic._hdfs_user, host=host, port=port,
                        protocol=protocol, use_kerberos=use_kerberos)
    # check that the database was created
    ctx._cursor.execute('SHOW DATABASES')
    databases = [tup[0].lower() for tup in ctx._cursor]
    assert ctx._temp_db.lower() in databases
    # check that a bdf table created with the ic is represented in the database
    df = pd.DataFrame({'a': (1, 2, 5), 'b': ('foo', 'bar', 'pasta')})
    bdf = from_pandas(ctx, df, method='in_query')
    # hack to get the table name: I know that this table was generated
    # ultimately with a `from_sql_table`, so I just reach into the AST to
    # figure it out.
    table_name = bdf._query_ast._from.name.split('.')[-1]
    ctx._cursor.execute('USE %s' % ctx._temp_db)
    ctx._cursor.execute('SHOW TABLES')
    tables = [tup[0].lower() for tup in ctx._cursor]
    assert table_name.lower() in tables
    # check that the temporary directory was created
    # (raises FileNotFound on failure)
    assert hdfs_client.status(ctx._temp_dir)
    # check that the corresponding data file has the correct size.
    # BUG FIX: HdfsCLI's `Client.list` returns bare names unless called with
    # `status=True`; the original unpacked plain strings as (name, status)
    # pairs, which would blow up before `s['type']` could be evaluated.
    table_path = os.path.join(ctx._temp_dir, table_name)
    sizes = [
        s['length']
        for (_, s) in hdfs_client.list(table_path, status=True)
        if s['type'] == 'FILE'
    ]
    assert len(sizes) == 1
    assert sizes[0] == 20
    ctx.close()
    # check that the temp database was dropped
    ctx._cursor.execute('SHOW DATABASES')
    databases = [tup[0].lower() for tup in ctx._cursor]
    assert ctx._temp_db.lower() not in databases
    # check that the temp dir was deleted
    # I know this is importable because this test depends on hdfs_client,
    # which skips if hdfs is not available
    from hdfs.util import HdfsError
    with pytest.raises(HdfsError):
        assert hdfs_client.status(ctx._temp_dir)
def test_context_cleanup(host, port, protocol, ic, hdfs_client):
    """Check ImpalaContext creates, then tears down, its temp db and HDFS dir.

    Uses the pywebhdfs-style ``hdfs_client`` fixture
    (``get_file_dir_status``/``list_dir`` API).
    """
    # build a fresh ImpalaContext so this test owns its whole lifecycle
    context = ImpalaContext(temp_dir=None, temp_db=None, nn_host=ic._nn_host,
                            webhdfs_port=ic._webhdfs_port,
                            hdfs_user=ic._hdfs_user, host=host, port=port,
                            protocol=protocol)
    cursor = context._cursor
    # constructing the context should have created the temp database
    cursor.execute('SHOW DATABASES')
    assert context._temp_db.lower() in [row[0].lower() for row in cursor]
    # materialize a bdf so a backing table lands in the temp database
    frame = pd.DataFrame({'a': (1, 2, 5), 'b': ('foo', 'bar', 'pasta')})
    bdf = from_pandas(context, frame, method='in_query')
    # hack to recover the table name: this bdf ultimately came from
    # `from_sql_table`, so the name is reachable through the query AST
    table_name = bdf._query_ast._from.name.split('.')[-1]
    cursor.execute('USE %s' % context._temp_db)
    cursor.execute('SHOW TABLES')
    assert table_name.lower() in [row[0].lower() for row in cursor]
    # the temp directory should exist (raises FileNotFound on failure)
    assert hdfs_client.get_file_dir_status(context._temp_dir.lstrip('/'))
    # the table should be backed by exactly one data file of the expected size
    table_path = os.path.join(context._temp_dir, table_name).lstrip('/')
    statuses = hdfs_client.list_dir(table_path)['FileStatuses']['FileStatus']
    sizes = [entry['length'] for entry in statuses
             if entry['type'] == 'FILE']
    assert sizes == [20]
    context.close()
    # closing the context drops the temp database...
    cursor.execute('SHOW DATABASES')
    assert context._temp_db.lower() not in [row[0].lower() for row in cursor]
    # ...and deletes the temp directory.
    # importable because the hdfs_client fixture skips when pywebhdfs is
    # not available
    from pywebhdfs.errors import FileNotFound
    with pytest.raises(FileNotFound):
        assert hdfs_client.get_file_dir_status(context._temp_dir.lstrip('/'))