Example #1
0
def test_complex_predicate_results():
    """Check the number of rows returned for AND/OR-combined predicates.

    The file holds 1000 rows written with ``row_index_stride=100``; rows
    301..450 carry "A" in ``c1`` and every other row carries "B".
    """
    buf = io.BytesIO()
    with Writer(buf, "struct<c0:int,c1:string>",
                row_index_stride=100) as out:
        out.writerows((num, "A" if 300 < num <= 450 else "B")
                      for num in range(1000))
    buf.seek(0)

    # No row satisfies both conditions, so nothing comes back.
    low_and_a = (PredicateColumn(TypeKind.INT, "c0") < 100) & (
        PredicateColumn(TypeKind.STRING, "c1") == "A")
    assert list(Reader(buf, predicate=low_and_a)) == []

    # 200 rows are returned although only 150 of them hold "A".
    # NOTE(review): presumably filtering works on whole row-index strides
    # rather than on single rows -- confirm.
    high_and_a = (PredicateColumn(TypeKind.INT, "c0") > 300) & (
        PredicateColumn(TypeKind.STRING, "c1") == "A")
    rows = list(Reader(buf, predicate=high_and_a))
    assert len(rows) == 200
    assert sum(1 for row in rows if row[1] == "A") == 150

    high_and_not_a = (PredicateColumn(TypeKind.INT, "c0") >= 400) & (
        PredicateColumn(TypeKind.STRING, "c1") != "A")
    rows = list(Reader(buf, predicate=high_and_not_a))
    assert len(rows) == 600

    # The string column is addressed by index instead of by name here.
    low_or_not_b = (PredicateColumn(TypeKind.INT, "c0") < 100) | (
        PredicateColumn(TypeKind.STRING, index=2) != "B")
    rows = list(Reader(buf, predicate=low_or_not_b))
    assert len(rows) == 300
Example #2
0
def test_context_manager():
    """Round-trip dict records through a Writer used as a context manager."""
    buf = io.BytesIO()
    rows = [
        {"col0": 1, "col1": "Test A", "col2": 2.13},
        {"col0": 2, "col1": "Test B", "col2": 0.123213},
        {"col0": 3, "col1": "Test C", "col2": 123.011234},
    ]
    schema = "struct<col0:int,col1:string,col2:double>"
    with Writer(buf, schema, struct_repr=StructRepr.DICT) as out:
        for row in rows:
            out.write(row)
    buf.seek(0)
    loaded = Reader(buf, struct_repr=StructRepr.DICT).read()
    assert loaded == rows
Example #3
0
def test_init(orc_data):
    """Reader rejects bad constructor arguments and accepts valid ones."""
    # A plain integer in place of the file object is a TypeError.
    with pytest.raises(TypeError):
        Reader(0)
    # So is a string as the second positional argument.
    with pytest.raises(TypeError):
        Reader(orc_data(1), "fail")
    assert Reader(orc_data(2), 1) is not None
Example #4
0
def test_write_complex_type(orc_type, values):
    """Write ``values`` with schema ``orc_type`` and read them back unchanged."""
    buf = io.BytesIO()
    out = Writer(buf, orc_type, struct_repr=StructRepr.DICT)
    for item in values:
        out.write(item)
    out.close()

    buf.seek(0)
    loaded = Reader(buf, struct_repr=StructRepr.DICT).read()
    assert loaded == values
Example #5
0
def test_compression(kind):
    """``Reader.compression`` is read-only and matches the Writer setting."""
    buf = io.BytesIO()
    with Writer(buf, "int", compression=kind) as out:
        out.writerows(range(10))
    rdr = Reader(buf)
    # Neither assignment nor deletion of the attribute is allowed.
    with pytest.raises(AttributeError):
        rdr.compression = "fail"
    with pytest.raises(AttributeError):
        del rdr.compression
    assert rdr.compression == kind
Example #6
0
def test_writer_id():
    """``Reader.writer_id`` is read-only and reports "ORC_CPP_WRITER"."""
    buf = io.BytesIO()
    with Writer(buf, "int") as out:
        out.writerows(range(10))
    rdr = Reader(buf)
    # Neither assignment nor deletion of the attribute is allowed.
    with pytest.raises(AttributeError):
        rdr.writer_id = "fail"
    with pytest.raises(AttributeError):
        del rdr.writer_id
    assert rdr.writer_id == "ORC_CPP_WRITER"
Example #7
0
def test_write():
    """Write tuples one at a time and read the same tuples back."""
    buf = io.BytesIO()
    out = Writer(buf, "struct<col0:int,col1:string,col2:double>")
    rows = [
        (1, "Test A", 2.13),
        (2, "Test B", 0.123213),
        (3, "Test C", 123.011234),
    ]
    for row in rows:
        out.write(row)
    out.close()
    buf.seek(0)
    loaded = Reader(buf).read()
    assert loaded == rows
Example #8
0
def test_include():
    """Select columns by index or by name, and reject invalid selections."""
    buf = io.BytesIO()
    row = {"col0": 1, "col1": "Test A", "col2": 3.14}
    with Writer(buf,
                "struct<col0:int,col1:string,col2:double>",
                struct_repr=StructRepr.DICT) as out:
        out.write(row)
    buf.seek(0)

    # Selection by column index.
    rdr = Reader(buf, column_indices=[0], struct_repr=StructRepr.DICT)
    assert next(rdr) == {"col0": 1}
    rdr = Reader(buf, column_indices=[0, 2], struct_repr=StructRepr.DICT)
    assert next(rdr) == {"col0": 1, "col2": 3.14}
    with pytest.raises(TypeError):
        Reader(buf, column_indices=[0, "2"], struct_repr=StructRepr.DICT)

    # Selection by column name.
    rdr = Reader(buf, column_names=["col0"], struct_repr=StructRepr.DICT)
    assert next(rdr) == {"col0": 1}
    rdr = Reader(buf,
                 column_names=["col1", "col2"],
                 struct_repr=StructRepr.DICT)
    assert next(rdr) == {"col1": "Test A", "col2": 3.14}
    with pytest.raises(TypeError):
        Reader(buf, column_names=["col1", 2], struct_repr=StructRepr.DICT)

    # An unknown index, an unknown name, and giving both selectors at once
    # are all ValueErrors.
    with pytest.raises(ValueError):
        Reader(buf, column_indices=[10], struct_repr=StructRepr.DICT)
    with pytest.raises(ValueError):
        Reader(buf, column_names=["col5"], struct_repr=StructRepr.DICT)
    with pytest.raises(ValueError):
        Reader(buf,
               column_names=["col1"],
               column_indices=[2],
               struct_repr=StructRepr.DICT)
Example #9
0
def test_writerows():
    """``writerows`` returns the number of rows and stores all of them."""
    buf = io.BytesIO()
    out = Writer(buf, "int")
    values = tuple(range(10))
    written = out.writerows(values)
    out.close()
    assert written == len(values)

    buf.seek(0)
    rdr = Reader(buf)
    assert list(values) == rdr.read()
Example #10
0
def test_next():
    """``next`` yields each row once and then raises StopIteration."""
    # A file without rows is exhausted immediately.
    empty = io.BytesIO()
    Writer(empty, "struct<col0:int,col1:string>").close()
    with pytest.raises(StopIteration):
        rdr = Reader(empty)
        next(rdr)

    # A file with one row yields that row, then stops.
    row = (0, "Test A")
    buf = io.BytesIO()
    with Writer(buf, "struct<col0:int,col1:string>") as out:
        out.write(row)
    rdr = Reader(buf)
    assert next(rdr) == row
    with pytest.raises(StopIteration):
        next(rdr)
Example #11
0
def test_current_row(orc_data):
    """``current_row`` follows iteration progress and is read-only."""
    rdr = Reader(orc_data(20))
    assert rdr.current_row == 0
    for _ in range(10):
        next(rdr)
    assert rdr.current_row == 10
    row = next(rdr)
    assert rdr.current_row == 11
    assert row[0] == 10
    # Consume the remaining rows; the counter must end at the row count.
    for _ in rdr:
        pass
    assert rdr.current_row == len(rdr)

    with pytest.raises(AttributeError):
        rdr.current_row = "fail"
    with pytest.raises(AttributeError):
        del rdr.current_row
Example #12
0
def test_schema():
    """``Reader.schema`` matches the written schema and is read-only."""
    expected = "struct<col0:int,col1:string>"
    buf = io.BytesIO()
    Writer(buf, expected).close()
    rdr = Reader(buf)

    assert str(rdr.schema) == expected
    with pytest.raises(AttributeError):
        rdr.schema = "fail"
    with pytest.raises(AttributeError):
        del rdr.schema

    # The schema object must remain usable after the reader is deleted.
    detached = rdr.schema
    del rdr
    assert isinstance(detached, TypeDescription)
    assert detached.kind == TypeKind.STRUCT
Example #13
0
def test_statistics_double(striped_orc_data):
    """Check stripe-level and file-level statistics of a double column."""
    source = striped_orc_data("double", (n * 0.1 for n in range(100000)))
    rdr = Reader(source)
    first_stripe = Stripe(rdr, 0)

    # Statistics of the first stripe only.
    stats = first_stripe[0].statistics
    assert stats["has_null"] is False
    assert stats["number_of_values"] == 65535
    assert stats["kind"] == TypeKind.DOUBLE
    assert stats["minimum"] == 0
    assert math.isclose(stats["maximum"], 6553.4)
    assert stats["sum"] == sum(n * 0.1 for n in range(len(first_stripe)))

    # Statistics of the whole file.
    stats = rdr[0].statistics
    assert stats["minimum"] == 0
    assert math.isclose(stats["maximum"], 9999.9)
    assert stats["sum"] == sum(n * 0.1 for n in range(100000))

    # The second stripe starts where the first one ended.
    second_stripe = rdr.read_stripe(1)
    assert second_stripe[0].statistics["minimum"] == 6553.5
Example #14
0
def test_iter(orc_data):
    """Iterating a Reader yields every row, in order."""
    rows = [item for item in Reader(orc_data(20))]
    assert len(rows) == 20
    first, last = rows[0], rows[-1]
    assert (0, "Test A") == first
    assert (19, "Test T") == last
    assert (12, "Test M") in rows
Example #15
0
def test_statistics_string(striped_orc_data):
    """Check stripe-level and file-level statistics of a string column."""
    source = striped_orc_data(
        "string", (f"Test String {n + 1}" for n in range(100000)))
    rdr = Reader(source)
    first_stripe = Stripe(rdr, 0)

    # Statistics of the first stripe only.
    stats = first_stripe[0].statistics
    assert stats["has_null"] is False
    assert stats["kind"] == TypeKind.STRING
    assert stats["number_of_values"] == 65535
    assert stats["total_length"] == sum(len(text) for text in first_stripe)
    assert stats["minimum"] == "Test String 1"
    assert stats["maximum"] == max(Stripe(rdr, 0))

    # Statistics of the whole file.
    stats = rdr[0].statistics
    assert stats["maximum"] == max(rdr)

    second_stripe = rdr.read_stripe(1)
    assert second_stripe[0].statistics["minimum"] == "Test String 100000"
Example #16
0
def test_len(striped_orc_data):
    """A stripe has its own length, distinct from the reader's."""
    source = striped_orc_data("int", (n for n in range(100000)))
    rdr = Reader(source)
    first_stripe = Stripe(rdr, 0)

    assert len(rdr) != len(first_stripe)
    assert len(first_stripe) == 65535
Example #17
0
def test_statistics_int(striped_orc_data):
    """Check stripe-level and file-level statistics of an int column."""
    source = striped_orc_data("int", (n for n in range(100000)))
    rdr = Reader(source)
    first_stripe = Stripe(rdr, 0)

    # Statistics of the first stripe only.
    stats = first_stripe[0].statistics
    assert stats["has_null"] is False
    assert stats["number_of_values"] == 65535
    assert stats["kind"] == TypeKind.INT
    assert stats["minimum"] == 0
    assert stats["maximum"] == 65534
    assert stats["sum"] == sum(range(len(first_stripe)))

    # Statistics of the whole file.
    stats = rdr[0].statistics
    assert stats["minimum"] == 0
    assert stats["maximum"] == 99999
    assert stats["sum"] == sum(range(100000))

    # The second stripe starts where the first one ended.
    second_stripe = rdr.read_stripe(1)
    assert second_stripe[0].statistics["minimum"] == 65535
Example #18
0
def test_bytes_length(striped_orc_data):
    """``Stripe.bytes_length`` has the expected value and is read-only."""
    source = striped_orc_data("int", (n for n in range(100000)))
    second_stripe = Stripe(Reader(source), 1)

    # Bold, hardcoded length value.
    assert second_stripe.bytes_length == 392
    with pytest.raises(AttributeError):
        second_stripe.bytes_length = "false"
Example #19
0
def test_timestamp_with_timezones(schema, writer_tz, reader_tz, input,
                                  expected):
    """Write ``input`` in ``writer_tz`` and read ``expected`` in ``reader_tz``."""
    buf = io.BytesIO()
    with Writer(buf, schema, timezone=writer_tz) as out:
        out.write((input, ))
    first_row = next(Reader(buf, timezone=reader_tz))
    assert first_row[0] == expected
Example #20
0
def test_bloom_filter_columns(striped_orc_data):
    """``Stripe.bloom_filter_columns`` reports the columns and is read-only."""
    # The fixture receives the expected columns through ``bfc``; both
    # stripes must report them.
    columns = (0, 1)
    source = striped_orc_data(
        "struct<col0:int,col1:string>",
        ((n, f"Test {n + 1}") for n in range(100000)),
        bfc=columns,
    )
    rdr = Reader(source)
    assert Stripe(rdr, 0).bloom_filter_columns == columns
    assert Stripe(rdr, 1).bloom_filter_columns == columns

    # Without ``bfc`` the tuple is empty.
    source = striped_orc_data("int", (n for n in range(100000)))
    rdr = Reader(source)
    first_stripe = Stripe(rdr, 0)
    assert first_stripe.bloom_filter_columns == ()
    with pytest.raises(AttributeError):
        first_stripe.bloom_filter_columns = (0,)
Example #21
0
def test_bytes_offset(striped_orc_data):
    """``Stripe.bytes_offset`` has the expected value and is read-only."""
    source = striped_orc_data("int", (n for n in range(100000)))
    second_stripe = Stripe(Reader(source), 1)

    # Bold, hardcoded offset value.
    assert second_stripe.bytes_offset == 658
    with pytest.raises(AttributeError):
        second_stripe.bytes_offset = 5
Example #22
0
def test_getitem(striped_orc_data):
    """Indexing a Reader or a Stripe with 0 returns a non-None object."""
    source = striped_orc_data("int", (n for n in range(100000)))
    rdr = Reader(source)
    first_stripe = Stripe(rdr, 0)
    reader_column = rdr[0]
    assert reader_column is not None
    stripe_column = first_stripe[0]
    assert stripe_column is not None
Example #23
0
def test_attributes(schema, attrs):
    """Schema attributes set before writing are present when reading."""
    buf = io.BytesIO()
    schema.set_attributes(attrs)
    # Write a file that contains the schema but no rows.
    Writer(buf, schema).close()
    rdr = Reader(buf)
    assert len(rdr) == 0
    assert rdr.schema.attributes == attrs
Example #24
0
def test_converting_predicate(orc_type, value):
    """Predicates on ``value`` and on ``None`` each select the matching row."""
    # Decimal columns additionally pass precision and scale.
    extra = {}
    if orc_type.kind == TypeKind.DECIMAL:
        extra = {"precision": orc_type.precision, "scale": orc_type.scale}
    column = PredicateColumn(orc_type.kind, "c0", **extra)

    buf = io.BytesIO()
    with Writer(buf, f"struct<c0:{orc_type}>", row_index_stride=1) as out:
        out.write((value, ))
        out.write((None, ))

    matched = list(Reader(buf, predicate=(column == value)))
    assert matched == [(value, )]
    # ``== None`` is deliberate: the comparison builds a predicate, so it
    # must not be rewritten as ``is None``.
    matched = list(Reader(buf, predicate=(column == None)))  # noqa: E711
    assert matched == [(None, )]
Example #25
0
def test_selected_schema():
    """``selected_schema`` reflects the column selection and is read-only.

    ``schema`` keeps describing the whole file, while ``selected_schema``
    only contains the columns requested through ``column_names``.
    """
    schema_str = "struct<col0:int,col1:string>"
    data = io.BytesIO()
    Writer(data, schema_str).close()
    reader = Reader(data, column_names=("col1",))

    assert str(reader.schema) == schema_str
    assert str(reader.selected_schema) != str(reader.schema)
    with pytest.raises(AttributeError):
        reader.selected_schema = "fail"
    with pytest.raises(AttributeError):
        del reader.selected_schema

    # The schema object must remain usable after the reader is deleted.
    schema = reader.selected_schema
    del reader
    # Fixed: the class is ``TypeDescription``; the lowercase
    # ``typedescription`` is not a class and broke this isinstance check.
    assert isinstance(schema, TypeDescription)
    assert schema.kind == TypeKind.STRUCT
    assert str(schema) == "struct<col1:string>"
Example #26
0
def test_row_offset(striped_orc_data):
    """``Stripe.row_offset`` is the index of its first row and is read-only."""
    source = striped_orc_data("int", (n for n in range(100000)))
    rdr = Reader(source)
    first_stripe = Stripe(rdr, 0)

    assert first_stripe.row_offset == 0
    # The second stripe begins right after the rows of the first one.
    second_stripe = Stripe(rdr, 1)
    assert second_stripe.row_offset == len(first_stripe)
    with pytest.raises(AttributeError):
        first_stripe.row_offset = 5
Example #27
0
def test_wrong_predicate():
    """Passing a plain string as ``predicate`` raises TypeError."""
    buf = io.BytesIO()
    with Writer(buf, "struct<c0:int,c1:string>",
                row_index_stride=100) as out:
        out.writerows(
            (num, "Odd" if num % 2 else "Even") for num in range(1000))
    buf.seek(0)
    with pytest.raises(TypeError):
        Reader(buf, predicate="wrong")
Example #28
0
def test_bytes_lengths():
    """``Reader.bytes_lengths`` reports the expected section sizes."""
    # File without rows, written with compression=0.
    buf = io.BytesIO()
    Writer(buf, "string", compression=0).close()
    rdr = Reader(buf)
    expected_sizes = {
        "content_length": 0,
        "file_footer_length": 38,
        "file_postscript_length": 23,
        "file_length": 65,
        "stripe_statistics_length": 0,
    }
    for section, size in expected_sizes.items():
        assert rdr.bytes_lengths[section] == size

    # File with 100 int rows, written with the default compression.
    buf = io.BytesIO()
    with Writer(buf, "int") as out:
        out.writerows(range(100))
    rdr = Reader(buf)
    expected_sizes = {
        "content_length": 76,
        "file_footer_length": 59,
        "file_postscript_length": 23,
        "file_length": len(buf.getvalue()),
        "stripe_statistics_length": 21,
    }
    for section, size in expected_sizes.items():
        assert rdr.bytes_lengths[section] == size
Example #29
0
def test_len():
    """``len(reader)`` equals the number of rows written (0, 1 and 10)."""
    schema = "struct<col0:int,col1:string>"

    # No rows.
    buf = io.BytesIO()
    Writer(buf, schema).close()
    assert len(Reader(buf)) == 0

    # A single row.
    buf = io.BytesIO()
    with Writer(buf, schema) as out:
        out.write((0, "Test A"))
    assert len(Reader(buf)) == 1

    # Ten rows.
    buf = io.BytesIO()
    with Writer(buf, schema) as out:
        for num in range(10):
            out.write((num, "Test"))
    assert len(Reader(buf)) == 10
Example #30
0
def test_converting_predicate_error():
    """A predicate column without name or index raises TypeError.

    The ``<``, ``==`` and ``<=`` comparisons are checked in that order.
    """
    buf = io.BytesIO()
    with Writer(buf, "struct<c0:string>", row_index_stride=1) as out:
        out.write(("test", ))

    comparisons = (
        lambda column: column < "test",
        lambda column: column == "test",
        lambda column: column <= "test",
    )
    for compare in comparisons:
        # The column is built inside the block so that the TypeError is
        # caught wherever it is raised.
        with pytest.raises(TypeError):
            Reader(
                buf,
                predicate=compare(PredicateColumn(TypeKind.STRING)),
            )