Example #1
def test_from_pandas_in_query(ic):
    # round-trip a small frame through Impala; the 'in_query' method ships
    # the data inside the query itself (contrast with 'webhdfs' below)
    df1 = pd.DataFrame({'a': (1, 2, 5), 'b': ('foo', 'bar', 'pasta')})
    bdf = from_pandas(ic, df1, method='in_query')
    df2 = bdf.collect()
    assert tuple(df2.columns) == ('a', 'b')
    assert df2.shape == df1.shape
    # reduce explicitly: the bare `all(df1 == df2)` iterates over column
    # labels, which are always truthy, so it would pass vacuously
    assert (df1 == df2).all().all()
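
The `ic` argument above is a pytest fixture supplying a live `ImpalaContext` (its construction is shown in Examples #3 and #4 below). Outside the test suite, the same round trip might look like the following sketch; the module paths and connection parameters are assumptions about impyla's experimental BigDataFrame API, not verified against any particular release.

# Minimal usage sketch; module paths are assumed, host/port are placeholders.
import pandas as pd
from impala.context import ImpalaContext  # assumed location
from impala.bdf import from_pandas        # assumed location

ic = ImpalaContext(host='impala-host', port=21050)  # placeholder connection
df = pd.DataFrame({'a': (1, 2, 5), 'b': ('foo', 'bar', 'pasta')})
bdf = from_pandas(ic, df, method='in_query')  # data travels inside the query
print(bdf.collect())
ic.close()  # drops the temp database and removes the temp HDFS directory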
Example #2
def test_from_pandas_webhdfs(ic):
    # same round trip, but the frame is staged as a data file under `path`
    # on HDFS via WebHDFS instead of being embedded in the query
    df1 = pd.DataFrame({'a': (1, 2, 5), 'b': ('foo', 'bar', 'pasta')})
    path = os.path.join(ic._temp_dir, 'test_pandas_webhdfs_dir')
    bdf = from_pandas(ic, df1, method='webhdfs', path=path)
    df2 = bdf.collect()
    assert tuple(df2.columns) == ('a', 'b')
    assert df2.shape == df1.shape
    assert (df1 == df2).all().all()  # element-wise, reduced explicitly
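
A hedged aside on the assertions: when `collect()` preserves row order, the explicit element-wise reduction above can be swapped for pandas' own comparison helper, which produces a readable diff on failure. `check_dtype=False` is an assumption, on the theory that Impala may hand back different integer widths than the frame that went in.

import pandas.testing as pdt

# strict structural comparison of the round-tripped frame; dtypes relaxed
pdt.assert_frame_equal(df1, df2, check_dtype=False)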
Example #3
def test_context_cleanup(host, port, protocol, use_kerberos, ic, hdfs_client):
    # create a *new* ImpalaContext
    ctx = ImpalaContext(temp_dir=None,
                        temp_db=None,
                        nn_host=ic._nn_host,
                        webhdfs_port=ic._webhdfs_port,
                        hdfs_user=ic._hdfs_user,
                        host=host,
                        port=port,
                        protocol=protocol,
                        use_kerberos=use_kerberos)

    # check that the database was created
    ctx._cursor.execute('SHOW DATABASES')
    databases = [tup[0].lower() for tup in ctx._cursor]
    assert ctx._temp_db.lower() in databases

    # check that a bdf table created with this context is represented in the database
    df = pd.DataFrame({'a': (1, 2, 5), 'b': ('foo', 'bar', 'pasta')})
    bdf = from_pandas(ctx, df, method='in_query')
    # hack to get the table name: I know that this table was generated
    # ultimately with a `from_sql_table`, so I just reach into the AST to
    # figure it out.
    table_name = bdf._query_ast._from.name.split('.')[-1]
    ctx._cursor.execute('USE %s' % ctx._temp_db)
    ctx._cursor.execute('SHOW TABLES')
    tables = [tup[0].lower() for tup in ctx._cursor]
    assert table_name.lower() in tables

    # check that the temporary directory was created
    # (hdfs_client.status raises hdfs.util.HdfsError on failure)
    assert hdfs_client.status(ctx._temp_dir)

    # check that the corresponding data file has the correct size:
    # three rows of single-byte-delimited text is 6 + 6 + 8 = 20 bytes
    table_path = os.path.join(ctx._temp_dir, table_name)
    sizes = [
        s['length'] for (_, s) in hdfs_client.list(table_path, status=True)
        if s['type'] == 'FILE'
    ]
    assert len(sizes) == 1
    assert sizes[0] == 20

    ctx.close()

    # check that the temp database was dropped
    ctx._cursor.execute('SHOW DATABASES')
    databases = [tup[0].lower() for tup in ctx._cursor]
    assert ctx._temp_db.lower() not in databases

    # check that the temp dir was deleted
    # I know this is importable because this test depends on hdfs_client, which
    # skips if hdfs is not available
    from hdfs.util import HdfsError
    with pytest.raises(HdfsError):
        assert hdfs_client.status(ctx._temp_dir)
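
Both cleanup tests lean on an `hdfs_client` fixture. The variant above calls `.status()` and `.list()` and expects `hdfs.util.HdfsError`, so the fixture is presumably built on the HdfsCLI (`hdfs`) package; a minimal sketch, with placeholder NameNode URL and user:

import pytest
from hdfs import InsecureClient

@pytest.fixture
def hdfs_client():
    # placeholder WebHDFS endpoint and user; adjust to the test cluster
    return InsecureClient('http://namenode:50070', user='hdfs')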
Example #4
def test_context_cleanup(host, port, protocol, ic, hdfs_client):
    # create a *new* ImpalaContext
    ctx = ImpalaContext(temp_dir=None, temp_db=None, nn_host=ic._nn_host,
            webhdfs_port=ic._webhdfs_port, hdfs_user=ic._hdfs_user, host=host,
            port=port, protocol=protocol)
    
    # check that the database was created
    ctx._cursor.execute('SHOW DATABASES')
    databases = [tup[0].lower() for tup in ctx._cursor]
    assert ctx._temp_db.lower() in databases
    
    # check that a bdf table created with this context is represented in the database
    df = pd.DataFrame({'a': (1, 2, 5), 'b': ('foo', 'bar', 'pasta')})
    bdf = from_pandas(ctx, df, method='in_query')
    # hack to get the table name: I know that this table was generated
    # ultimately with a `from_sql_table`, so I just reach into the AST to figure
    # it out.
    table_name = bdf._query_ast._from.name.split('.')[-1]
    ctx._cursor.execute('USE %s' % ctx._temp_db)
    ctx._cursor.execute('SHOW TABLES')
    tables = [tup[0].lower() for tup in ctx._cursor]
    assert table_name.lower() in tables
    
    # check that the temporary directory was created
    # (raises FileNotFound on failure)
    assert hdfs_client.get_file_dir_status(ctx._temp_dir.lstrip('/'))
    
    # check that the corresponding data file has the correct size:
    # three rows of single-byte-delimited text is 6 + 6 + 8 = 20 bytes
    listing = hdfs_client.list_dir(
            os.path.join(ctx._temp_dir, table_name).lstrip('/'))
    statuses = listing['FileStatuses']['FileStatus']
    sizes = [s['length'] for s in statuses if s['type'] == 'FILE']
    assert len(sizes) == 1
    assert sizes[0] == 20
    
    ctx.close()
    
    # check that the temp database was dropped
    ctx._cursor.execute('SHOW DATABASES')
    databases = [tup[0].lower() for tup in ctx._cursor]
    assert ctx._temp_db.lower() not in databases
    
    # check that the temp dir was deleted
    # I know this is importable because this test depends on hdfs_client, which
    # skips if pywebhdfs is not available
    from pywebhdfs.errors import FileNotFound
    with pytest.raises(FileNotFound):
        assert hdfs_client.get_file_dir_status(ctx._temp_dir.lstrip('/'))
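
This variant drives WebHDFS through `pywebhdfs` instead (`get_file_dir_status`, `list_dir`, `pywebhdfs.errors.FileNotFound`), which also explains the `.lstrip('/')` calls: pywebhdfs expects paths without the leading slash. A minimal fixture sketch, again with placeholder connection details:

import pytest
from pywebhdfs.webhdfs import PyWebHdfsClient

@pytest.fixture
def hdfs_client():
    # placeholder host/port/user; pywebhdfs takes the port as a string
    return PyWebHdfsClient(host='namenode', port='50070', user_name='hdfs')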