Ejemplo n.º 1
0
def test_non_existent_key():
    """Check that executing a select against a missing key raises.

    A cursor is built for the key 'does-not-exist.csv'; calling ``execute``
    on it is expected to raise an exception. The cursor is closed whether or
    not the expectation holds.

    :return: None
    """

    s3_client = boto3.client('s3')
    cursor = PandasCursor(s3_client).select(
        'does-not-exist.csv', 'select * from S3Object')

    try:
        with pytest.raises(Exception):
            cursor.execute()
    finally:
        cursor.close()
Ejemplo n.º 2
0
def test_non_empty_results():
    """Executes a select where results are returned.

    Selects every row from 'region.csv' and checks that the data frames
    yielded by the cursor hold 5 rows in total. The cursor is always closed.

    :return: None
    """

    cur = PandasCursor(boto3.client('s3'))\
        .select('region.csv', 'select * from S3Object')

    try:
        # Only the row count matters, so take each frame's length instead of
        # walking it row by row with iterrows().
        num_rows = sum(len(df) for df in cur.execute())

        assert num_rows == 5
    finally:
        cur.close()
Ejemplo n.º 3
0
def test_select_projected_filtered_topk_with_cursor():
    """Run a projected, filtered, limited select against a parquet object.

    Builds an S3 client with SSL and payload signing switched off, selects
    column ``b`` where ``a > 5000`` with a limit of 100, prints the running
    row total together with each returned frame, and checks that exactly
    100 rows come back. The cursor is always closed.

    :return: None
    """
    client_config = Config(
        region_name="us-east-1",
        parameter_validation=False,
        max_pool_connections=10,
        s3={'payload_signing_enabled': False})
    s3_client = Session().client(
        's3', use_ssl=False, verify=False, config=client_config)

    cursor = PandasCursor(s3_client).parquet().select(
        'parquet/small.multicolumn.9999.parquet',
        'select b from s3Object where a > 5000 limit 100')

    total_rows = 0
    try:
        for frame in cursor.execute():
            total_rows += len(frame)
            print("{}:{}".format(total_rows, frame))

        assert total_rows == 100
    finally:
        cursor.close()
Ejemplo n.º 4
0
def test_aggregate():
    """Run a ``count(*)`` aggregate over 'region.csv'.

    Every returned row must have a ``_0`` field equal to the string '5', and
    exactly one row must be returned overall. The cursor is always closed.

    :return: None
    """

    s3_client = boto3.client('s3')
    cursor = PandasCursor(s3_client).select(
        'region.csv', 'select count(*) from S3Object')

    rows_seen = 0
    try:
        for frame in cursor.execute():
            for _, row in frame.iterrows():
                rows_seen += 1
                assert row._0 == '5'

        assert rows_seen == 1
    finally:
        cursor.close()
Ejemplo n.º 5
0
def test_where_predicate():
    """Run a select on 'region.csv' filtered by ``r_name = 'AMERICA'``.

    Every returned row must have a ``_1`` field equal to 'AMERICA', and
    exactly one row must be returned overall. The cursor is always closed.

    :return: None
    """

    s3_client = boto3.client('s3')
    cursor = PandasCursor(s3_client).select(
        'region.csv', 'select * from S3Object where r_name = \'AMERICA\';')

    rows_seen = 0
    try:
        for frame in cursor.execute():
            for _, row in frame.iterrows():
                rows_seen += 1
                assert row._1 == 'AMERICA'

        assert rows_seen == 1
    finally:
        cursor.close()
Ejemplo n.º 6
0
def test_large_results():
    """Executes a select where a large number of records are expected.

    Selects 150000 rows from 'lineitem.csv', prints the wall-clock time spent
    consuming the result frames, and checks the total row count. The cursor
    is always closed.

    :return: None
    """

    cur = PandasCursor(boto3.client('s3')) \
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:
        start = timeit.default_timer()

        # Count rows via each frame's length: stepping through 150000 rows
        # one at a time with iterrows() only to count them would add per-row
        # overhead to the interval being timed.
        num_rows = sum(len(df) for df in cur.execute())

        end = timeit.default_timer()

        elapsed = end - start
        print('Elapsed {}'.format(elapsed))

        assert num_rows == 150000
    finally:
        cur.close()
Ejemplo n.º 7
0
def measure_R_scan():
    """Time repeated executions of a filtered select and print a scan rate.

    Executes the same select three times on one cursor, consuming every
    returned frame, then prints the duration of each trial, the cursor's
    ``bytes_scanned`` value, the total elapsed time, and their ratio
    ("R_scan"). The cursor is closed even if an execution fails.

    :return: None
    """
    num_trials = 3

    cur = PandasCursor(boto3.client('s3')) \
        .select('access_method_benchmark/10-shards-10GB/data_0.csv',
                'select * from S3Object where cast(F0 AS float) < 0.01; ')

    try:
        trial_durations = []
        start = timeit.default_timer()
        previous = start
        for _ in range(num_trials):
            # Consume the whole result stream; the frames themselves are
            # not needed, only the time taken and the bytes scanned.
            for _df in cur.execute():
                pass
            now = timeit.default_timer()
            trial_durations.append(now - previous)
            previous = now

        end = timeit.default_timer()

        for n, duration in enumerate(trial_durations):
            print("trial {} takes {}".format(n, duration))
        print("{} bytes scanned".format(cur.bytes_scanned))
        print("time = {}".format(end - start))
        # NOTE(review): (end - start) spans all trials; whether bytes_scanned
        # accumulates across executions or is reset per execute() cannot be
        # told from here -- confirm the ratio is the intended one.
        print("R_scan = {}".format(1.0 * cur.bytes_scanned / (end - start)))
    finally:
        # Release the cursor even when a trial raises, matching the tests.
        cur.close()
Ejemplo n.º 8
0
def _assert_profiled_lineitem_scan(s3):
    """Profile a 150000-row select on 'lineitem.csv' with the given client.

    Runs the select under cProfile, prints the profile sorted by cumulative
    time, and asserts that 150000 rows were returned. The cursor is always
    closed.

    :param s3: S3 client handed to PandasCursor
    :return: None
    """
    num_rows = 0

    cur = PandasCursor(s3) \
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:
        pr = cProfile.Profile()
        pr.enable()
        try:
            for df in cur.execute():
                num_rows += len(df)
        finally:
            # Stop profiling even if the scan fails part-way.
            pr.disable()

        # Stats writes to sys.stdout by default, so the report is printed
        # directly rather than through an intermediate string buffer (the
        # `StringIO` module is only available on Python 2).
        pstats.Stats(pr).sort_stats('cumulative').print_stats()

        assert num_rows == 150000
    finally:
        cur.close()


def test_no_ssl():
    """Executes the same large select without and then with SSL.

    The first scan uses a client created with ``use_ssl=False``,
    ``verify=False`` and payload signing disabled; the second uses a client
    with only the shared connection settings. Each scan is profiled and must
    return 150000 rows.

    :return: None
    """

    # Boto is not thread safe so need one of these per scan op
    no_ssl_cfg = Config(region_name="us-east-1",
                        parameter_validation=False,
                        max_pool_connections=10,
                        s3={'payload_signing_enabled': False})
    no_ssl_client = Session().client(
        's3', use_ssl=False, verify=False, config=no_ssl_cfg)
    _assert_profiled_lineitem_scan(no_ssl_client)

    # Boto is not thread safe so need one of these per scan op
    default_cfg = Config(region_name="us-east-1",
                         parameter_validation=False,
                         max_pool_connections=10)
    default_client = Session().client('s3', config=default_cfg)
    _assert_profiled_lineitem_scan(default_client)