Example 1
def test_s3_select_b_filtered_from_small_multicolumn_parquet():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response = s3.select_object_content(Bucket='s3filter', Key='parquet/small.multicolumn.9999.parquet',
                                        Expression='select a from s3Object where a < 5000', ExpressionType='SQL',
                                        InputSerialization={
                                            'CompressionType': 'NONE',
                                            'Parquet': {}

                                        },
                                        OutputSerialization={
                                            'CSV': {}
                                        })

    df = None

    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()

    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            df = pd.concat([df, partial_df])

    assert len(df) == 5000

    print()
    print("{} | {}".format(test_s3_select_from_small_parquet.__name__, cursor.bytes_scanned))
    print("{} | {}".format(test_s3_select_from_small_parquet.__name__, cursor.bytes_processed))
    print("{} | {}".format(test_s3_select_from_small_parquet.__name__, cursor.bytes_returned))
Example 2
def test_filtered_s3_select_from_parquet():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.parquet',
                                        Expression='select * from s3Object where cast(s_acctbal as float) > 500.0 ',
                                        ExpressionType='SQL',
                                        InputSerialization={
                                            'CompressionType': 'NONE',
                                            'Parquet': {}
                                        },
                                        OutputSerialization={
                                            'CSV': {}
                                        })

    df = None

    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()

    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            df = pd.concat([df, partial_df])

    assert_supplier_table(df, 8642, use_ordinal_columns=True)
Example 3
    def run():
        cfg = Config(region_name="us-east-1",
                     parameter_validation=False,
                     max_pool_connections=10)
        session = Session()
        s3 = session.client('s3', config=cfg)
        response = s3.select_object_content(
            Bucket='s3filter',
            Key='parquet/supplier.large.parquet',
            Expression='select sum(cast(s_suppkey as int)) from s3Object',
            ExpressionType='SQL',
            InputSerialization={
                'CompressionType': 'NONE',
                'Parquet': {}
            },
            OutputSerialization={'CSV': {}})
        df = None
        cursor = PandasCursor(None)
        cursor.event_stream = response['Payload']
        dfs = cursor.parse_event_stream()
        for partial_df in dfs:
            if df is None:
                df = partial_df
            else:
                df = pd.concat([df, partial_df])
        assert len(df) == 1
        # assert pd.to_numeric(df.iloc[0]['_0']) == pytest.approx(22551774325.00404)

        print("{} | {}".format(test_s3_select_from_large_parquet.__name__,
                               cursor.bytes_scanned))
        print("{} | {}".format(test_s3_select_from_large_parquet.__name__,
                               cursor.bytes_processed))
        print("{} | {}".format(test_s3_select_from_large_parquet.__name__,
                               cursor.bytes_returned))
Example 4
def test_s3_select_from_csv():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.csv',
                                        Expression='select sum(cast(s_acctbal as float)) from s3Object',
                                        ExpressionType='SQL',
                                        InputSerialization={
                                            'CompressionType': 'NONE',
                                            'CSV': {'FileHeaderInfo': 'Use', 'RecordDelimiter': '\n',
                                                    'FieldDelimiter': '|'}
                                        },
                                        OutputSerialization={
                                            'CSV': {}
                                        })

    df = None

    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()

    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            df = pd.concat([df, partial_df])

    assert len(df) == 1

    assert pd.to_numeric(df.iloc[0]['_0']) == pytest.approx(45103548.64999)
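For reference, the same result can be read without PandasCursor by walking the boto3 event stream directly. A minimal sketch using only the documented select_object_content event shapes; it assumes a fresh response obtained with the same call as above, since a Payload stream can only be consumed once, and it needs `import io`:

records = io.BytesIO()
for event in response['Payload']:
    if 'Records' in event:
        # Each Records event carries a chunk of the CSV-serialized query output.
        records.write(event['Records']['Payload'])
    elif 'Stats' in event:
        # The Stats event reports the same counters PandasCursor exposes.
        details = event['Stats']['Details']
        print(details['BytesScanned'], details['BytesProcessed'], details['BytesReturned'])

records.seek(0)
df = pd.read_csv(records, header=None)  # single row, single column: the sum
assert pd.to_numeric(df.iloc[0][0]) == pytest.approx(45103548.64999)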
Example 5
def test_non_empty_results():
    """Executes a select where results are returned.

    :return: None
    """

    num_rows = 0

    cur = PandasCursor(boto3.client('s3'))\
        .select('region.csv', 'select * from S3Object')

    try:
        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                # print("{}:{}".format(num_rows, r))

        assert num_rows == 5
    finally:
        cur.close()
Example 6
def test_select_projected_filtered_topk_with_cursor():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10,
                 s3={'payload_signing_enabled': False})
    session = Session()
    s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    num_rows = 0

    cur = PandasCursor(s3) \
        .parquet() \
        .select('parquet/small.multicolumn.9999.parquet', 'select b from s3Object where a > 5000 limit 100')

    try:
        dfs = cur.execute()
        for df in dfs:
            num_rows += len(df)
            print("{}:{}".format(num_rows, df))

        assert num_rows == 100
    finally:
        cur.close()
Example 7
def test_where_predicate():
    """Executes a select with a where clause on one of the attributes.

    :return: None
    """

    num_rows = 0

    cur = PandasCursor(boto3.client('s3'))\
        .select('region.csv', 'select * from S3Object where r_name = \'AMERICA\';')

    try:
        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                assert r._1 == 'AMERICA'
                # print("{}:{}".format(num_rows, r))

        assert num_rows == 1
    finally:
        cur.close()
Example 8
def test_aggregate():
    """Executes a select with an aggregate.

    :return: None
    """

    num_rows = 0

    cur = PandasCursor(boto3.client('s3')) \
        .select('region.csv', 'select count(*) from S3Object')

    try:
        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                assert r._0 == '5'
                # print("{}:{}".format(num_rows, r))

        assert num_rows == 1
    finally:
        cur.close()
Example 9
def test_projected_s3_select_from_parquet():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.parquet',
                                        Expression='select s_suppkey from s3Object',
                                        ExpressionType='SQL',
                                        InputSerialization={
                                            'CompressionType': 'NONE',
                                            'Parquet': {}

                                        },
                                        OutputSerialization={
                                            'CSV': {}
                                        })

    df = None

    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()

    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            df = pd.concat([df, partial_df])

    assert len(df) == 10000

    rows = df[df['_0'] == '1']

    assert len(rows) == 1

    row = rows.iloc[0]

    assert row['_0'] == "1"
Example 10
def test_large_results():
    """Executes a select where a large number of records are expected

    :return: None
    """

    num_rows = 0

    cur = PandasCursor(boto3.client('s3')) \
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:

        # pr = cProfile.Profile()
        # pr.enable()

        start = timeit.default_timer()

        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                # print("{}:{}".format(num_rows, r))
        end = timeit.default_timer()

        elapsed = end - start
        print('Elapsed {}'.format(elapsed))

        # pr.disable()
        # s = StringIO.StringIO()
        # sortby = 'cumulative'
        # ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        # ps.print_stats()
        # print (s.getvalue())

        assert num_rows == 150000
    finally:
        cur.close()
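When only a row count is needed, iterating with iterrows is the slow path; counting whole chunks, as the test_no_ssl example further down does, avoids the per-row overhead. A minimal sketch under the same cursor setup as above:

num_rows = 0
for df in cur.execute():
    # One addition per chunk instead of one iteration per row.
    num_rows += len(df)
assert num_rows == 150000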
Example 11
    def execute_pandas_query(op):
        cur = PandasCursor(op.s3).select(op.s3key, op.s3sql)
        dfs = cur.execute()
        op.op_metrics.query_bytes = cur.query_bytes
        op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()
        first_tuple = True
        for df in dfs:

            assert (len(df) > 0)

            if first_tuple:
                assert (len(df.columns.values) > 0)
                op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
                first_tuple = False

                if op.log_enabled:
                    print("{}('{}') | Sending field names: {}".format(
                        op.__class__.__name__, op.name, df.columns.values))

            op.op_metrics.rows_returned += len(df)

            op.send(df, op.consumers)
        return cur
Example 12
def measure_R_scan():
    cur = PandasCursor(boto3.client('s3')) \
        .select('access_method_benchmark/10-shards-10GB/data_0.csv',
                'select * from S3Object where cast(F0 AS float) < 0.01; ')

    end_times = []
    start = timeit.default_timer()
    for i in range(3):
        dfs = cur.execute()
        for df in dfs:
            pass
        end_times.append(timeit.default_timer())

    end = timeit.default_timer()

    for n in range(3):
        print("trial {} takes {}".format(
            n, end_times[n] - start if n == 0 else end_times[n] -
            end_times[n - 1]))
    print("{} bytes scanned".format(cur.bytes_scanned))
    print("time = {}".format(end - start))
    print("R_scan = {}".format(1.0 * cur.bytes_scanned / (end - start)))

    cur.close()
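R_scan above is reported in bytes per second across the three back-to-back trials. A small sketch of converting that figure to MB/s (the helper name is mine, not part of the benchmark, and it assumes the variables from the function above are in scope):

def to_mb_per_sec(bytes_per_sec):
    # Decimal megabytes, matching the 1,000,000-byte convention used elsewhere in these examples.
    return bytes_per_sec / 1000000.0

print("R_scan = {} MB/s".format(to_mb_per_sec(1.0 * cur.bytes_scanned / (end - start))))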
Example 13
def test_non_existent_key():
    """Executes a select against a non existent key.

    :return: None
    """

    cur = PandasCursor(boto3.client('s3'))\
        .select('does-not-exist.csv', 'select * from S3Object')

    try:
        with pytest.raises(Exception):
            cur.execute()
    finally:
        cur.close()
Example 14
def test_no_ssl():
    """Executes a select where results are returned.

    :return: None
    """

    num_rows = 0

    # Boto is not thread safe so need one of these per scan op
    cfg = Config(region_name="us-east-1",
                 parameter_validation=False,
                 max_pool_connections=10,
                 s3={'payload_signing_enabled': False})
    session = Session()
    s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    cur = PandasCursor(s3)\
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:
        pr = cProfile.Profile()
        pr.enable()

        dfs = cur.execute()
        for df in dfs:
            num_rows += len(df)

        pr.disable()
        s = StringIO.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

        assert num_rows == 150000
    finally:
        cur.close()

    num_rows = 0

    # Boto is not thread safe so need one of these per scan op
    cfg = Config(region_name="us-east-1",
                 parameter_validation=False,
                 max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    cur = PandasCursor(s3) \
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:
        pr = cProfile.Profile()
        pr.enable()

        dfs = cur.execute()
        for df in dfs:
            num_rows += len(df)

        pr.disable()
        s = StringIO.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

        assert num_rows == 150000
    finally:
        cur.close()
Example 15
def run(use_pandas, secure, use_native):
    print("Cursor | Settings {}".format({
        'use_pandas': use_pandas,
        'secure': secure,
        'use_native': use_native
    }))

    key = 'sf1000-lineitem/lineitem_27.csv'
    sql = 'select * from S3Object limit 10000'

    boto3.set_stream_logger('', logging.DEBUG)

    if secure:
        cfg = Config(region_name="us-east-1",
                     parameter_validation=False,
                     max_pool_connections=10)
        session = Session()
        s3 = session.client('s3', config=cfg)
    else:
        cfg = Config(region_name="us-east-1",
                     parameter_validation=False,
                     max_pool_connections=10,
                     s3={'payload_signing_enabled': False})
        session = Session()
        s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    if use_pandas and not use_native:

        cur = PandasCursor(s3).select(key, sql)
        dfs = cur.execute()

        rows_returned = 0
        for df in dfs:
            rows_returned += len(df)

        print("{}".format({
            'elapsed_time':
            round(cur.timer.elapsed(), 5),
            'rows_returned':
            rows_returned,
            'query_bytes':
            cur.query_bytes,
            'bytes_scanned':
            cur.bytes_scanned,
            'bytes_processed':
            cur.bytes_processed,
            'bytes_returned':
            "{} ({} MB / {} GB)".format(
                cur.bytes_returned,
                round(float(cur.bytes_returned) / 1000000.0, 5),
                round(float(cur.bytes_returned) / 1000000000.0, 5)),
            'bytes_returned_per_sec':
            "{} ({} MB / {} GB)".format(
                round(float(cur.bytes_returned) / cur.timer.elapsed(), 5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() / 1000000,
                    5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() /
                    1000000000, 5)),
            'time_to_first_record_response':
            None if cur.time_to_first_record_response is None else round(
                cur.time_to_first_record_response, 5),
            'time_to_last_record_response':
            None if cur.time_to_last_record_response is None else round(
                cur.time_to_last_record_response, 5)
        }))

    elif use_pandas and use_native:

        closure = {'df': None, 'rows_returned': 0}

        def on_data(data):

            # print("|||")
            # print(type(data))
            # print(data)
            # print("|||")

            # if closure['df'] is None:
            #     closure['df'] = pd.DataFrame(data)
            # else:
            #     closure['df'] = pd.concat([closure['df'], pd.DataFrame(data)], ignore_index=True, copy=False)

            closure['rows_returned'] += len(pd.DataFrame(data))

        cur = NativeCursor(scan).select(key, sql)
        cur.execute(on_data)

        rows_returned = closure['rows_returned']

        print("{}".format({
            'elapsed_time':
            round(cur.timer.elapsed(), 5),
            'rows_returned':
            rows_returned,
            'query_bytes':
            cur.query_bytes,
            'bytes_scanned':
            cur.bytes_scanned,
            'bytes_processed':
            cur.bytes_processed,
            'bytes_returned':
            "{} ({} MB / {} GB)".format(
                cur.bytes_returned,
                round(float(cur.bytes_returned) / 1000000.0, 5),
                round(float(cur.bytes_returned) / 1000000000.0, 5)),
            'bytes_returned_per_sec':
            "{} ({} MB / {} GB)".format(
                round(float(cur.bytes_returned) / cur.timer.elapsed(), 5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() / 1000000,
                    5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() /
                    1000000000, 5)),
            'time_to_first_record_response':
            None if cur.time_to_first_record_response is None else round(
                cur.time_to_first_record_response, 5),
            'time_to_last_record_response':
            None if cur.time_to_last_record_response is None else round(
                cur.time_to_last_record_response, 5)
        }))

    else:

        timer = Timer()
        timer.start()

        cur = Cursor(s3).select(key, sql)
        rows = cur.execute()
        for _ in rows:
            pass

        print(timer.elapsed())

    print("Cursor | Done")
Example 16
    def execute_pandas_query(op):

        if op.use_native:

            closure = {'first_tuple': True}

            def on_numpy_array(np_array):

                df = pd.DataFrame(np_array)

                if closure['first_tuple']:
                    assert (len(df.columns.values) > 0)
                    op.send(TupleMessage(Tuple(df.columns.values)),
                            op.consumers)
                    closure['first_tuple'] = False

                    if op.log_enabled:
                        print("{}('{}') | Sending field names: {}".format(
                            op.__class__.__name__, op.name, df.columns.values))

                op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()
                op.op_metrics.rows_returned += len(df)

                if op.log_enabled:
                    print("{}('{}') | Sending field values:".format(
                        op.__class__.__name__, op.name))
                    print(df)

                op.send(df, op.consumers)

            # cur = NativeCursor(op.fast_s3).select(op.s3key, op.s3sql)
            # cur.execute(on_numpy_array)
            #
            # op.op_metrics.query_bytes = cur.query_bytes
            #
            # return cur
        else:
            if op.format_ is Format.CSV:
                cur = PandasCursor(op.s3).csv().select(op.s3key, op.s3sql)
            elif op.format_ is Format.PARQUET:
                cur = PandasCursor(op.s3).parquet().select(op.s3key, op.s3sql)
            else:
                raise Exception("Unrecognized format {}".format(op.format_))

            dfs = cur.execute()
            op.op_metrics.query_bytes = cur.query_bytes
            op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()
            first_tuple = True

            counter = 0

            buffer_ = pd.DataFrame()
            #print("DataFrames: ")
            #print(dfs)
            for df in dfs:
                if op.fn:
                    df = op.fn(df)

                if first_tuple:
                    # assert (len(df.columns.values) > 0)
                    # op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
                    first_tuple = False

                    if op.log_enabled:
                        print("{}('{}') | Sending field names: {}".format(
                            op.__class__.__name__, op.name, df.columns.values))

                op.op_metrics.rows_returned += len(df)

                # Apply filter if there is one
                # if op.filter_fn is not None:
                #    df = df[op.filter_fn(df)]

                # if op.log_enabled:
                #     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                #         print("{}('{}') | Sending field values: \n{}".format(op.__class__.__name__, op.name, df))

                counter += 1
                if op.log_enabled:
                    sys.stdout.write('.')
                    if counter % 100 == 0:
                        print("Rows {}".format(op.op_metrics.rows_returned))

                op.send(DataFrameMessage(df), op.consumers)

                # buffer_ = pd.concat([buffer_, df], axis=0, sort=False, ignore_index=True, copy=False)
                # if len(buffer_) >= 8192:
                #    op.send(buffer_, op.consumers)
                #    buffer_ = pd.DataFrame()

            if len(buffer_) > 0:
                op.send(buffer_, op.consumers)
                del buffer_

            return cur
Example 17
def test_projected_vs_all_s3_select_from_parquet():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response1 = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.parquet',
                                         Expression='select s_suppkey from s3Object',
                                         ExpressionType='SQL',
                                         InputSerialization={
                                             'CompressionType': 'NONE',
                                             'Parquet': {}

                                         },
                                         OutputSerialization={
                                             'CSV': {}
                                         })

    df1 = None

    cursor1 = PandasCursor(None)
    cursor1.event_stream = response1['Payload']
    dfs1 = cursor1.parse_event_stream()

    for partial_df1 in dfs1:
        if df1 is None:
            df1 = partial_df1
        else:
            df1 = pd.concat([df1, partial_df1])

    response2 = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.parquet',
                                         Expression='select * from s3Object',
                                         ExpressionType='SQL',
                                         InputSerialization={
                                             'CompressionType': 'NONE',
                                             'Parquet': {}

                                         },
                                         OutputSerialization={
                                             'CSV': {}
                                         })

    df2 = None

    cursor2 = PandasCursor(None)
    cursor2.event_stream = response2['Payload']
    dfs2 = cursor2.parse_event_stream()

    for partial_df2 in dfs2:
        if df2 is None:
            df2 = partial_df2
        else:
            df2 = pd.concat([df2, partial_df2])

    assert len(df1) == 10000
    assert len(df2) == 10000

    rows1 = df1[df1['_0'] == '1']
    rows2 = df2[df2['_0'] == '1']

    assert len(rows1) == 1
    assert len(rows2) == 1

    row1 = rows1.iloc[0]
    row2 = rows2.iloc[0]

    assert row1['_0'] == "1"
    assert row2['_0'] == "1"

    assert cursor2.bytes_scanned > cursor1.bytes_scanned
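The final assertion holds because Parquet is a columnar format: the projected query touches only the s_suppkey column, while select * scans every column of the object. A short sketch for printing the two counters side by side, using the variables from the example above:

print("projected (s_suppkey only) bytes scanned: {}".format(cursor1.bytes_scanned))
print("select * (all columns) bytes scanned:     {}".format(cursor2.bytes_scanned))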