Example #1
0
def test_write_wrong_complex_type(orc_type, value):
    """Writing ``value`` with a writer of type ``orc_type`` must be rejected.

    Both arguments are supplied by the caller (presumably a pytest
    parametrization that is not visible here).
    """
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, orc_type)
    # Dict construction might raise ValueError as well.
    accepted_errors = (TypeError, ValueError)
    with pytest.raises(accepted_errors):
        orc_writer.write(value)
Example #2
0
def test_attributes(schema, attrs):
    """Attributes set on a schema must survive a write/read round trip.

    ``attrs`` is stored on ``schema`` via ``set_attributes``; an empty file
    is written with that schema and the attributes are read back.
    """
    schema.set_attributes(attrs)
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, schema)
    orc_writer.close()

    orc_reader = Reader(buffer)
    row_count = len(orc_reader)
    assert row_count == 0
    assert orc_reader.schema.attributes == attrs
Example #3
0
def test_struct_repr():
    """A struct writer must reject rows that do not match its struct_repr.

    With the default representation a dict row raises ``TypeError``; with
    ``StructRepr.DICT`` a tuple row and a wrongly typed dict value do.
    """
    buffer = io.BytesIO()
    struct_schema = "struct<a:int>"

    orc_writer = Writer(buffer, struct_schema)
    with pytest.raises(TypeError):
        orc_writer.write({"a": 1})

    orc_writer = Writer(buffer, struct_schema, struct_repr=StructRepr.DICT)
    for bad_row in ((1,), {"a": "b"}):
        with pytest.raises(TypeError):
            orc_writer.write(bad_row)
Example #4
0
def test_writerows():
    """``writerows`` returns the number of rows written and they read back."""
    values = tuple(range(10))
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, "int")
    written = orc_writer.writerows(values)
    orc_writer.close()
    assert written == len(values)

    # Rewind before handing the buffer to the reader.
    buffer.seek(0)
    orc_reader = Reader(buffer)
    assert list(values) == orc_reader.read()
Example #5
0
def test_open_file():
    """Writer accepts only writable binary file objects.

    A text-mode file raises ``ParseError``, a read-only binary file raises
    ``io.UnsupportedOperation``, a writable binary file succeeds, and a
    non-file object raises ``TypeError``.
    """
    with tempfile.NamedTemporaryFile(mode="wt") as text_file:
        with pytest.raises(ParseError):
            _ = Writer(text_file, "int")
        with open(text_file.name, "rb") as read_only_file, pytest.raises(
            io.UnsupportedOperation
        ):
            _ = Writer(read_only_file, "int")

    with tempfile.NamedTemporaryFile(mode="wb") as binary_file:
        orc_writer = Writer(binary_file, "int")
        assert isinstance(orc_writer, Writer)

    with pytest.raises(TypeError):
        _ = Writer(0, "int")
Example #6
0
def test_open_file(output_file):
    """Writer accepts only writable binary file objects.

    ``output_file`` is closed first and then reopened by name in text-write,
    binary-read and binary-write modes; only the last is accepted. A
    non-file object raises ``TypeError``.
    """
    output_file.close()
    path = output_file.name

    with open(path, mode="wt") as text_file, pytest.raises(ParseError):
        _ = Writer(text_file, "int")

    with open(path, "rb") as read_only_file, pytest.raises(
        io.UnsupportedOperation
    ):
        _ = Writer(read_only_file, "int")

    with open(path, mode="wb") as binary_file:
        orc_writer = Writer(binary_file, "int")
        assert isinstance(orc_writer, Writer)

    with pytest.raises(TypeError):
        _ = Writer(0, "int")
Example #7
0
def test_next():
    """Check that ``next()`` on a Reader yields rows and then stops.

    An empty file must raise ``StopIteration`` on the first ``next()``; a
    one-row file must yield that row once and then raise ``StopIteration``.
    """
    data = io.BytesIO()
    Writer(data, "struct<col0:int,col1:string>").close()
    # Build the reader outside the raises block so that only next() itself
    # is allowed to raise StopIteration.
    reader = Reader(data)
    with pytest.raises(StopIteration):
        next(reader)

    expected = (0, "Test A")
    data = io.BytesIO()
    with Writer(data, "struct<col0:int,col1:string>") as writer:
        writer.write(expected)
    reader = Reader(data)
    assert next(reader) == expected
    with pytest.raises(StopIteration):
        next(reader)
Example #8
0
def test_schema():
    """``Writer.schema`` is read-only and remains usable after the writer.

    The attribute can be neither assigned nor deleted, and the returned
    schema object is still inspected after the writer has been deleted.
    """
    expected_schema = "struct<col0:int,col1:string>"
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, expected_schema)

    assert str(orc_writer.schema) == expected_schema
    with pytest.raises(AttributeError):
        orc_writer.schema = "fail"
    with pytest.raises(AttributeError):
        del orc_writer.schema

    # Keep a reference to the schema, then drop the writer that produced it.
    type_desc = orc_writer.schema
    del orc_writer
    assert isinstance(type_desc, TypeDescription)
    assert type_desc.kind == TypeKind.STRUCT
Example #9
0
def test_context_manager():
    """Rows written inside a ``with Writer(...)`` block are read back intact."""
    expected_rows = [
        {"col0": 1, "col1": "Test A", "col2": 2.13},
        {"col0": 2, "col1": "Test B", "col2": 0.123213},
        {"col0": 3, "col1": "Test C", "col2": 123.011234},
    ]
    buffer = io.BytesIO()
    with Writer(
        buffer,
        "struct<col0:int,col1:string,col2:double>",
        struct_repr=StructRepr.DICT,
    ) as orc_writer:
        for row in expected_rows:
            orc_writer.write(row)

    buffer.seek(0)
    orc_reader = Reader(buffer, struct_repr=StructRepr.DICT)
    assert orc_reader.read() == expected_rows
Example #10
0
def test_open_file(output_file):
    """Reader rejects unusable file objects and opens a valid empty file.

    The file behind ``output_file`` is reopened by name several times:
    write-only and invalid contents raise ``ParseError``, an append-mode
    handle raises ``io.UnsupportedOperation``, and a file produced by
    ``Writer`` opens with zero rows.
    """
    output_file.close()
    path = output_file.name

    with open(path, "wb") as handle:
        with pytest.raises(ParseError):
            _ = Reader(handle)
        # Write invalid bytes:
        handle.write(b"TESTTORC\x08\x03\x10\x03")
    with open(path, "rb") as handle, pytest.raises(ParseError):
        _ = Reader(handle)

    with open(path, "wb") as handle:
        handle.write(b'ORC\x08\x03\x10\x03"k\x08\x0c\x12\x0c\x01\x02\x03')
    with open(path, "rt") as handle, pytest.raises(ParseError):
        _ = Reader(handle)
    with open(path, "rb") as handle, pytest.raises(ParseError):
        _ = Reader(handle)

    with open(path, "wb") as handle:
        Writer(handle, "struct<col0:int,col1:string>").close()
    with open(path, "ab") as handle, pytest.raises(io.UnsupportedOperation):
        _ = Reader(handle)
    with open(path, "rb") as handle:
        orc_reader = Reader(handle)
        assert orc_reader is not None
        assert len(orc_reader) == 0
Example #11
0
def test_include():
    """Column selection by index or by name limits the fields returned.

    Also checks the rejected inputs: mixed-type selectors raise
    ``TypeError``; unknown columns, and passing both selectors together,
    raise ``ValueError``.
    """
    buffer = io.BytesIO()
    row = {"col0": 1, "col1": "Test A", "col2": 3.14}
    with Writer(
        buffer,
        "struct<col0:int,col1:string,col2:double>",
        struct_repr=StructRepr.DICT,
    ) as orc_writer:
        orc_writer.write(row)
    buffer.seek(0)

    # Selection by column index.
    orc_reader = Reader(buffer, column_indices=[0], struct_repr=StructRepr.DICT)
    assert next(orc_reader) == {"col0": 1}
    orc_reader = Reader(
        buffer, column_indices=[0, 2], struct_repr=StructRepr.DICT
    )
    assert next(orc_reader) == {"col0": 1, "col2": 3.14}
    with pytest.raises(TypeError):
        _ = Reader(buffer, column_indices=[0, "2"], struct_repr=StructRepr.DICT)

    # Selection by column name.
    orc_reader = Reader(
        buffer, column_names=["col0"], struct_repr=StructRepr.DICT
    )
    assert next(orc_reader) == {"col0": 1}
    orc_reader = Reader(
        buffer, column_names=["col1", "col2"], struct_repr=StructRepr.DICT
    )
    assert next(orc_reader) == {"col1": "Test A", "col2": 3.14}
    with pytest.raises(TypeError):
        _ = Reader(buffer, column_names=["col1", 2], struct_repr=StructRepr.DICT)

    # Invalid selections.
    with pytest.raises(ValueError):
        _ = Reader(buffer, column_indices=[10], struct_repr=StructRepr.DICT)
    with pytest.raises(ValueError):
        _ = Reader(buffer, column_names=["col5"], struct_repr=StructRepr.DICT)
    with pytest.raises(ValueError):
        _ = Reader(
            buffer,
            column_names=["col1"],
            column_indices=[2],
            struct_repr=StructRepr.DICT,
        )
Example #12
0
def test_complex_predicate_results():
    """Combined (``&`` / ``|``) predicates filter the rows read back.

    1000 rows are written with a row index stride of 100; rows with
    ``300 < c0 <= 450`` carry "A" in ``c1`` and every other row carries "B".
    The expected lengths below are the row counts the reader returns for
    each predicate.
    """
    buffer = io.BytesIO()
    with Writer(
        buffer, "struct<c0:int,c1:string>", row_index_stride=100
    ) as orc_writer:
        orc_writer.writerows(
            (i, "A" if 300 < i <= 450 else "B") for i in range(1000)
        )
    buffer.seek(0)

    low_and_a = (PredicateColumn(TypeKind.INT, "c0") < 100) & (
        PredicateColumn(TypeKind.STRING, "c1") == "A"
    )
    orc_reader = Reader(buffer, predicate=low_and_a)
    assert list(orc_reader) == []

    high_and_a = (PredicateColumn(TypeKind.INT, "c0") > 300) & (
        PredicateColumn(TypeKind.STRING, "c1") == "A"
    )
    orc_reader = Reader(buffer, predicate=high_and_a)
    rows = list(orc_reader)
    assert len(rows) == 200
    assert sum(1 for row in rows if row[1] == "A") == 150

    high_and_not_a = (PredicateColumn(TypeKind.INT, "c0") >= 400) & (
        PredicateColumn(TypeKind.STRING, "c1") != "A"
    )
    orc_reader = Reader(buffer, predicate=high_and_not_a)
    rows = list(orc_reader)
    assert len(rows) == 600

    # The string column is addressed by index here instead of by name.
    low_or_not_b = (PredicateColumn(TypeKind.INT, "c0") < 100) | (
        PredicateColumn(TypeKind.STRING, index=2) != "B"
    )
    orc_reader = Reader(buffer, predicate=low_or_not_b)
    rows = list(orc_reader)
    assert len(rows) == 300
Example #13
0
def test_open_file():
    """Reader rejects unusable file objects and opens a valid empty file.

    A single named temporary file is rewritten in place and reopened by
    name: write-only and invalid contents raise ``ParseError``, an
    append-mode handle raises ``io.UnsupportedOperation``, and a file
    produced by ``Writer`` opens with zero rows.
    """
    with tempfile.NamedTemporaryFile(mode="wb") as tmp:
        with pytest.raises(ParseError):
            _ = Reader(tmp)

        tmp.write(b"TESTTORC\x08\x03\x10\x03")
        tmp.flush()
        tmp.seek(0)
        with open(tmp.name, "rb") as handle, pytest.raises(ParseError):
            _ = Reader(handle)

        tmp.write(b'ORC\x08\x03\x10\x03"k\x08\x0c\x12\x0c\x01\x02\x03')
        tmp.flush()
        tmp.seek(0)
        with open(tmp.name, "rt") as handle, pytest.raises(ParseError):
            _ = Reader(handle)
        with open(tmp.name, "rb") as handle, pytest.raises(ParseError):
            _ = Reader(handle)

        tmp.seek(0)
        Writer(tmp, "struct<col0:int,col1:string>").close()
        with open(tmp.name, "ab") as handle, pytest.raises(
            io.UnsupportedOperation
        ):
            _ = Reader(handle)
        with open(tmp.name, "rb") as handle:
            orc_reader = Reader(handle)
            assert orc_reader is not None
            assert len(orc_reader) == 0
Example #14
0
def test_timestamp_with_timezones(schema, writer_tz, reader_tz, input,
                                  expected):
    """A value written under ``writer_tz`` reads back as ``expected``.

    ``input`` is written as a one-field row using ``writer_tz`` and the
    first field of the first row is read back using ``reader_tz``.
    """
    buffer = io.BytesIO()
    with Writer(buffer, schema, timezone=writer_tz) as orc_writer:
        orc_writer.write((input, ))
    orc_reader = Reader(buffer, timezone=reader_tz)
    first_row = next(orc_reader)
    assert first_row[0] == expected
Example #15
0
def test_wrong_predicate():
    """Passing a non-predicate object as ``predicate`` raises ``TypeError``."""
    data = io.BytesIO()
    with Writer(data, "struct<c0:int,c1:string>",
                row_index_stride=100) as writer:
        writer.writerows(
            (i, "Even") if i % 2 == 0 else (i, "Odd") for i in range(1000))
    data.seek(0)
    with pytest.raises(TypeError):
        # The constructor is expected to raise, so the result is never used.
        _ = Reader(data, predicate="wrong")
Example #16
0
def test_bytes_lengths():
    """``Reader.bytes_lengths`` reports the expected section sizes.

    Checked for an empty uncompressed "string" file and for an "int" file
    holding 100 rows.
    """
    buffer = io.BytesIO()
    Writer(buffer, "string", compression=0).close()
    orc_reader = Reader(buffer)
    empty_file_lengths = (
        ("content_length", 0),
        ("file_footer_length", 38),
        ("file_postscript_length", 23),
        ("file_length", 65),
        ("stripe_statistics_length", 0),
    )
    for key, length in empty_file_lengths:
        assert orc_reader.bytes_lengths[key] == length

    buffer = io.BytesIO()
    with Writer(buffer, "int") as orc_writer:
        orc_writer.writerows(range(100))
    orc_reader = Reader(buffer)
    filled_file_lengths = (
        ("content_length", 76),
        ("file_footer_length", 59),
        ("file_postscript_length", 23),
        ("file_length", len(buffer.getvalue())),
        ("stripe_statistics_length", 21),
    )
    for key, length in filled_file_lengths:
        assert orc_reader.bytes_lengths[key] == length
Example #17
0
def test_len():
    """``len(Reader)`` equals the number of rows written (0, 1 and 10)."""
    struct_schema = "struct<col0:int,col1:string>"

    # Empty file.
    buffer = io.BytesIO()
    Writer(buffer, struct_schema).close()
    orc_reader = Reader(buffer)
    assert len(orc_reader) == 0

    # Single row.
    buffer = io.BytesIO()
    with Writer(buffer, struct_schema) as orc_writer:
        orc_writer.write((0, "Test A"))
    orc_reader = Reader(buffer)
    assert len(orc_reader) == 1

    # Ten rows, written one at a time.
    buffer = io.BytesIO()
    with Writer(buffer, struct_schema) as orc_writer:
        for number in range(10):
            orc_writer.write((number, "Test"))
    orc_reader = Reader(buffer)
    assert len(orc_reader) == 10
Example #18
0
def test_empty_predicate_result():
    """A predicate matching nothing yields no rows, yet ``len`` is non-zero."""
    buffer = io.BytesIO()
    with Writer(
        buffer, "struct<c0:int,c1:string>", row_index_stride=100
    ) as orc_writer:
        orc_writer.writerows(
            (i, "Odd" if i % 2 else "Even") for i in range(1000)
        )
    buffer.seek(0)

    never_true = PredicateColumn(TypeKind.INT, "c0") < 0
    orc_reader = Reader(buffer, predicate=never_true)
    assert len(orc_reader) != 0
    assert list(orc_reader) == []
Example #19
0
def test_writer_id():
    """``Reader.writer_id`` is read-only and equals "ORC_CPP_WRITER"."""
    buffer = io.BytesIO()
    with Writer(buffer, "int") as orc_writer:
        orc_writer.writerows(range(10))

    orc_reader = Reader(buffer)
    # The attribute can be neither assigned nor deleted.
    with pytest.raises(AttributeError):
        orc_reader.writer_id = "fail"
    with pytest.raises(AttributeError):
        del orc_reader.writer_id
    assert orc_reader.writer_id == "ORC_CPP_WRITER"
Example #20
0
def test_compression(kind):
    """``Reader.compression`` is read-only and reflects the writer's ``kind``."""
    buffer = io.BytesIO()
    with Writer(buffer, "int", compression=kind) as orc_writer:
        orc_writer.writerows(range(10))

    orc_reader = Reader(buffer)
    # The attribute can be neither assigned nor deleted.
    with pytest.raises(AttributeError):
        orc_reader.compression = "fail"
    with pytest.raises(AttributeError):
        del orc_reader.compression
    assert orc_reader.compression == kind
Example #21
0
def test_compression(kind):
    """50000 rows written with compression ``kind`` read back unchanged."""
    buffer = io.BytesIO()
    with Writer(
        buffer, "struct<a:int,b:string,c:double>", compression=kind
    ) as orc_writer:
        orc_writer.writerows((n, "ABCDEFG", 0.12) for n in range(50000))
    buffer.seek(0)

    orc_reader = Reader(buffer)
    assert orc_reader.compression == kind
    for position, record in enumerate(orc_reader):
        assert record == (position, "ABCDEFG", 0.12)
Example #22
0
def test_read_custom_null_value(orc_type, value):
    """A ``None`` row is read back as the configured ``null_value`` object.

    ``value`` and then ``None`` are written; the first row must match
    ``value`` (within a relative tolerance for "float"/"double") and the
    second must be identical to ``NullValue()``.
    """
    buffer = io.BytesIO()
    with Writer(buffer, orc_type) as orc_writer:
        orc_writer.write(value)
        orc_writer.write(None)

    orc_reader = Reader(buffer, null_value=NullValue())
    first = next(orc_reader)
    if orc_type in ("float", "double"):
        assert math.isclose(first, value, rel_tol=1e-07, abs_tol=0.0)
    else:
        assert first == value
    assert next(orc_reader) is NullValue()
Example #23
0
def test_current_row():
    """``Writer.current_row`` counts written rows and matches the file length."""
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, "struct<col0:int,col1:string,col2:double>")
    assert orc_writer.current_row == 0

    orc_writer.write((0, "Test A", 0.0001))
    assert orc_writer.current_row == 1

    for number in range(10):
        orc_writer.write((number, "Test A", 0.0001))
    assert orc_writer.current_row == 11

    orc_writer.close()
    buffer.seek(0)
    orc_reader = Reader(buffer)
    assert orc_writer.current_row == len(orc_reader)
Example #24
0
 def _init(row):
     """Return a rewound in-memory ORC file holding ``row`` one-int records."""
     buffer = io.BytesIO()
     orc_writer = Writer(
         buffer,
         "struct<col0:int>",
         batch_size=65535,
         stripe_size=128,
         compression_block_size=128,
     )
     with orc_writer as writer:
         for number in range(row):
             writer.write((number, ))
     buffer.seek(0)
     return buffer
Example #25
0
 def _init(schema, rows, bfc=tuple()):
     """Return a rewound in-memory ORC file built from ``rows``.

     ``schema`` is the ORC schema passed to the writer and ``bfc`` is
     forwarded as its ``bloom_filter_columns`` argument.
     """
     buffer = io.BytesIO()
     orc_writer = Writer(
         buffer,
         schema,
         batch_size=65535,
         stripe_size=128,
         compression_block_size=128,
         bloom_filter_columns=bfc,
     )
     with orc_writer as writer:
         writer.writerows(rows)
     buffer.seek(0)
     return buffer
Example #26
0
def test_metadata():
    """User metadata set on the writer is stored and read back as bytes.

    A later ``set_metadata`` call overrides an earlier key, and a ``str``
    value is rejected with ``TypeError``.
    """
    buffer = io.BytesIO()
    with Writer(buffer, "int") as orc_writer:
        orc_writer.set_metadata(
            test="test1".encode("UTF-8"),
            meta=b"\x30\x40\x50\x60",
        )
        # Overrides the "test" key set above.
        orc_writer.set_metadata(test="test2".encode("UTF-8"))
        with pytest.raises(TypeError):
            orc_writer.set_metadata(meta="string")

    orc_reader = Reader(buffer)
    assert len(orc_reader) == 0
    expected_metadata = {
        "test": "test2".encode("UTF-8"),
        "meta": b"\x30\x40\x50\x60",
    }
    assert orc_reader.metadata == expected_metadata
Example #27
0
 def _init(row):
     """Return a rewound in-memory ORC file holding ``row`` dict records.

     Record ``i`` is ``{"col0": i, "col1": "Test <letter>"}``, where the
     letter cycles through the uppercase ASCII alphabet.
     """
     buffer = io.BytesIO()
     with Writer(
         buffer,
         "struct<col0:int,col1:string>",
         struct_repr=StructRepr.DICT,
     ) as orc_writer:
         for number in range(row):
             letter = string.ascii_uppercase[number % 26]
             record = {"col0": number, "col1": "Test {0}".format(letter)}
             orc_writer.write(record)
     buffer.seek(0)
     return buffer
Example #28
0
def test_write():
    """Tuple rows written one at a time are read back in the same order."""
    expected_rows = [
        (1, "Test A", 2.13),
        (2, "Test B", 0.123213),
        (3, "Test C", 123.011234),
    ]
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, "struct<col0:int,col1:string,col2:double>")
    for row in expected_rows:
        orc_writer.write(row)
    orc_writer.close()

    buffer.seek(0)
    orc_reader = Reader(buffer)
    assert orc_reader.read() == expected_rows
Example #29
0
def test_write_complex_type(orc_type, values):
    """Every item of ``values`` written as ``orc_type`` reads back unchanged.

    Both writer and reader use the ``StructRepr.DICT`` representation.
    """
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, orc_type, struct_repr=StructRepr.DICT)
    for item in values:
        orc_writer.write(item)
    orc_writer.close()

    buffer.seek(0)
    orc_reader = Reader(buffer, struct_repr=StructRepr.DICT)
    assert orc_reader.read() == values
Example #30
0
def test_schema():
    """``Reader.schema`` is read-only and remains usable after the reader.

    The attribute can be neither assigned nor deleted, and the returned
    schema object is still inspected after the reader has been deleted.
    """
    schema_str = "struct<col0:int,col1:string>"
    data = io.BytesIO()
    Writer(data, schema_str).close()
    reader = Reader(data)

    assert str(reader.schema) == schema_str
    with pytest.raises(AttributeError):
        reader.schema = "fail"
    with pytest.raises(AttributeError):
        del reader.schema

    # Keep a reference to the schema, then drop the reader that produced it.
    schema = reader.schema
    del reader
    # The class is TypeDescription; the lowercase spelling used previously
    # is not a defined name and raised NameError.
    assert isinstance(schema, TypeDescription)
    assert schema.kind == TypeKind.STRUCT