def test_complex_predicate_results():
    """Check the rows returned for AND/OR combinations of two column predicates.

    The file holds 1000 rows ``(i, label)`` where ``label`` is ``"A"`` for
    ``300 < i <= 450`` and ``"B"`` otherwise, written with a row index stride
    of 100.
    """

    def int_column():
        return PredicateColumn(TypeKind.INT, "c0")

    def str_column():
        return PredicateColumn(TypeKind.STRING, "c1")

    buf = io.BytesIO()
    with Writer(buf, "struct<c0:int,c1:string>", row_index_stride=100) as writer:
        writer.writerows(
            (i, "A" if 300 < i <= 450 else "B") for i in range(1000)
        )
    buf.seek(0)

    # No row satisfies both conditions, so nothing is read.
    low_and_a = (int_column() < 100) & (str_column() == "A")
    assert list(Reader(buf, predicate=low_and_a)) == []

    # More rows come back than strictly match; presumably the predicate is
    # applied per row group of 100 rows rather than per row -- TODO confirm.
    high_and_a = (int_column() > 300) & (str_column() == "A")
    rows = list(Reader(buf, predicate=high_and_a))
    assert len(rows) == 200
    assert sum(1 if row[1] == "A" else 0 for row in rows) == 150

    upper_and_not_a = (int_column() >= 400) & (str_column() != "A")
    rows = list(Reader(buf, predicate=upper_and_not_a))
    assert len(rows) == 600

    # The string column is addressed by index here instead of by name.
    low_or_not_b = (int_column() < 100) | (
        PredicateColumn(TypeKind.STRING, index=2) != "B"
    )
    rows = list(Reader(buf, predicate=low_or_not_b))
    assert len(rows) == 300
def test_context_manager():
    """Write dict records inside a ``with Writer(...)`` block and read them back."""
    field_names = ("col0", "col1", "col2")
    raw_rows = (
        (1, "Test A", 2.13),
        (2, "Test B", 0.123213),
        (3, "Test C", 123.011234),
    )
    records = [dict(zip(field_names, row)) for row in raw_rows]

    buf = io.BytesIO()
    schema = "struct<col0:int,col1:string,col2:double>"
    with Writer(buf, schema, struct_repr=StructRepr.DICT) as writer:
        for record in records:
            writer.write(record)
    buf.seek(0)

    reader = Reader(buf, struct_repr=StructRepr.DICT)
    assert reader.read() == records
def test_init(orc_data):
    """Reader rejects invalid constructor arguments and accepts valid ones."""
    # A non file-like first argument is refused.
    with pytest.raises(TypeError):
        Reader(0)

    # A string as second positional argument is refused as well.
    with pytest.raises(TypeError):
        Reader(orc_data(1), "fail")

    valid_reader = Reader(orc_data(2), 1)
    assert valid_reader is not None
def test_write_complex_type(orc_type, values):
    """Round-trip ``values`` of schema ``orc_type`` through Writer and Reader.

    Structs are represented as dicts on both the writing and reading side.
    """
    buf = io.BytesIO()
    writer = Writer(buf, orc_type, struct_repr=StructRepr.DICT)
    for value in values:
        writer.write(value)
    writer.close()

    buf.seek(0)
    read_back = Reader(buf, struct_repr=StructRepr.DICT).read()
    assert read_back == values
def test_compression(kind):
    """``Reader.compression`` reports the writer's setting and is read-only."""
    buf = io.BytesIO()
    with Writer(buf, "int", compression=kind) as writer:
        writer.writerows(range(10))

    reader = Reader(buf)

    # The attribute can be neither assigned nor deleted.
    with pytest.raises(AttributeError):
        reader.compression = "fail"
    with pytest.raises(AttributeError):
        del reader.compression

    assert reader.compression == kind
def test_writer_id():
    """``Reader.writer_id`` is read-only and names the writer implementation."""
    buf = io.BytesIO()
    with Writer(buf, "int") as writer:
        writer.writerows(range(10))

    reader = Reader(buf)

    # The attribute can be neither assigned nor deleted.
    with pytest.raises(AttributeError):
        reader.writer_id = "fail"
    with pytest.raises(AttributeError):
        del reader.writer_id

    assert reader.writer_id == "ORC_CPP_WRITER"
def test_write():
    """Rows written one at a time with ``Writer.write`` are read back unchanged."""
    buf = io.BytesIO()
    writer = Writer(buf, "struct<col0:int,col1:string,col2:double>")
    records = [
        (1, "Test A", 2.13),
        (2, "Test B", 0.123213),
        (3, "Test C", 123.011234),
    ]
    for record in records:
        writer.write(record)
    writer.close()

    buf.seek(0)
    assert Reader(buf).read() == records
def test_include():
    """Column selection by index or by name, including the rejected inputs."""
    data = io.BytesIO()
    record = {"col0": 1, "col1": "Test A", "col2": 3.14}
    schema = "struct<col0:int,col1:string,col2:double>"
    with Writer(data, schema, struct_repr=StructRepr.DICT) as writer:
        writer.write(record)
    data.seek(0)

    def open_reader(**selection):
        # Every reader in this test returns structs as dicts.
        return Reader(data, struct_repr=StructRepr.DICT, **selection)

    # Selection by column index.
    assert next(open_reader(column_indices=[0])) == {"col0": 1}
    assert next(open_reader(column_indices=[0, 2])) == {"col0": 1, "col2": 3.14}
    with pytest.raises(TypeError):
        open_reader(column_indices=[0, "2"])

    # Selection by column name.
    assert next(open_reader(column_names=["col0"])) == {"col0": 1}
    assert next(open_reader(column_names=["col1", "col2"])) == {
        "col1": "Test A",
        "col2": 3.14,
    }
    with pytest.raises(TypeError):
        open_reader(column_names=["col1", 2])

    # Unknown columns, and names combined with indices, are rejected.
    with pytest.raises(ValueError):
        open_reader(column_indices=[10])
    with pytest.raises(ValueError):
        open_reader(column_names=["col5"])
    with pytest.raises(ValueError):
        open_reader(column_names=["col1"], column_indices=[2])
def test_writerows():
    """``Writer.writerows`` returns the number of rows and stores all of them."""
    buf = io.BytesIO()
    writer = Writer(buf, "int")
    rows = tuple(range(10))

    written = writer.writerows(rows)
    writer.close()
    assert written == len(rows)

    buf.seek(0)
    reader = Reader(buf)
    assert list(rows) == reader.read()
def test_next():
    """``next(reader)`` yields each row and raises StopIteration when exhausted."""
    schema = "struct<col0:int,col1:string>"

    # A file without rows is exhausted immediately.
    empty = io.BytesIO()
    Writer(empty, schema).close()
    with pytest.raises(StopIteration):
        reader = Reader(empty)
        next(reader)

    # A single-row file yields that row once, then stops.
    expected = (0, "Test A")
    single = io.BytesIO()
    with Writer(single, schema) as writer:
        writer.write(expected)
    reader = Reader(single)
    assert next(reader) == expected
    with pytest.raises(StopIteration):
        next(reader)
def test_current_row(orc_data):
    """``Reader.current_row`` follows the read position and is read-only."""
    reader = Reader(orc_data(20))
    assert reader.current_row == 0

    for _ in range(10):
        next(reader)
    assert reader.current_row == 10

    # The eleventh row read is the one whose first field is 10.
    row = next(reader)
    assert reader.current_row == 11
    assert row[0] == 10

    # Exhaust the remaining rows; the position then equals the row count.
    _ = list(reader)
    assert reader.current_row == len(reader)

    with pytest.raises(AttributeError):
        reader.current_row = "fail"
    with pytest.raises(AttributeError):
        del reader.current_row
def test_schema():
    """``Reader.schema`` mirrors the written schema and is read-only."""
    schema_str = "struct<col0:int,col1:string>"
    buf = io.BytesIO()
    Writer(buf, schema_str).close()

    reader = Reader(buf)
    assert str(reader.schema) == schema_str

    with pytest.raises(AttributeError):
        reader.schema = "fail"
    with pytest.raises(AttributeError):
        del reader.schema

    # The remaining assertions run on the schema after the reader is deleted.
    schema = reader.schema
    del reader
    assert isinstance(schema, TypeDescription)
    assert schema.kind == TypeKind.STRUCT
def test_statistics_double(striped_orc_data):
    """Column statistics of a double column, per stripe and for the whole file."""
    total_rows = 100000
    data = striped_orc_data("double", (i * 0.1 for i in range(total_rows)))
    reader = Reader(data)
    stripe = Stripe(reader, 0)

    # Statistics limited to the first stripe.
    stripe_stat = stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["kind"] == TypeKind.DOUBLE
    assert stripe_stat["minimum"] == 0
    assert math.isclose(stripe_stat["maximum"], 6553.4)
    assert stripe_stat["sum"] == sum(i * 0.1 for i in range(len(stripe)))

    # Statistics covering the whole file.
    file_stat = reader[0].statistics
    assert file_stat["minimum"] == 0
    assert math.isclose(file_stat["maximum"], 9999.9)
    assert file_stat["sum"] == sum(i * 0.1 for i in range(100000))

    # The second stripe starts right after the last value of the first one.
    second_stripe = reader.read_stripe(1)
    assert second_stripe[0].statistics["minimum"] == 6553.5
def test_iter(orc_data):
    """Iterating a Reader yields every row in order."""
    reader = Reader(orc_data(20))
    rows = list(reader)

    assert len(rows) == 20
    first, last = rows[0], rows[-1]
    assert (0, "Test A") == first
    assert (19, "Test T") == last
    assert (12, "Test M") in rows
def test_statistics_string(striped_orc_data):
    """Column statistics of a string column, per stripe and for the whole file."""
    data = striped_orc_data(
        "string",
        ("Test String {0}".format(i + 1) for i in range(100000)),
    )
    reader = Reader(data)
    stripe = Stripe(reader, 0)

    # Statistics limited to the first stripe.
    stripe_stat = stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["kind"] == TypeKind.STRING
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["total_length"] == sum(map(len, stripe))
    assert stripe_stat["minimum"] == "Test String 1"
    assert stripe_stat["maximum"] == max(Stripe(reader, 0))

    # Statistics covering the whole file.
    file_stat = reader[0].statistics
    assert file_stat["maximum"] == max(reader)

    second_stripe = reader.read_stripe(1)
    assert second_stripe[0].statistics["minimum"] == "Test String 100000"
def test_len(striped_orc_data):
    """A stripe's length is its own row count, not the row count of the file."""
    # NOTE(review): a second ``test_len`` (taking no fixture) is defined further
    # down in this file. If both live in the same module, the later definition
    # replaces this one and pytest never collects it -- consider a distinct name.
    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)
    first_stripe = Stripe(reader, 0)

    assert len(reader) != len(first_stripe)
    assert len(first_stripe) == 65535
def test_statistics_int(striped_orc_data):
    """Column statistics of an int column, per stripe and for the whole file."""
    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)
    stripe = Stripe(reader, 0)

    # Statistics limited to the first stripe.
    stripe_stat = stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["kind"] == TypeKind.INT
    assert stripe_stat["minimum"] == 0
    assert stripe_stat["maximum"] == 65534
    assert stripe_stat["sum"] == sum(range(len(stripe)))

    # Statistics covering the whole file.
    file_stat = reader[0].statistics
    assert file_stat["minimum"] == 0
    assert file_stat["maximum"] == 99999
    assert file_stat["sum"] == sum(range(100000))

    # The second stripe starts right after the last value of the first one.
    second_stripe = reader.read_stripe(1)
    assert second_stripe[0].statistics["minimum"] == 65535
def test_bytes_length(striped_orc_data):
    """``Stripe.bytes_length`` has the expected value and is read-only."""
    data = striped_orc_data("int", (i for i in range(100000)))
    second_stripe = Stripe(Reader(data), 1)

    # Hardcoded on purpose: the exact byte length of the second stripe.
    assert second_stripe.bytes_length == 392

    with pytest.raises(AttributeError):
        second_stripe.bytes_length = "false"
def test_timestamp_with_timezones(schema, writer_tz, reader_tz, input, expected):
    """Write ``input`` with one timezone, read it with another, compare to ``expected``."""
    buf = io.BytesIO()
    with Writer(buf, schema, timezone=writer_tz) as writer:
        writer.write((input,))

    reader = Reader(buf, timezone=reader_tz)
    first_row = next(reader)
    assert first_row[0] == expected
def test_bloom_filter_columns(striped_orc_data):
    """``Stripe.bloom_filter_columns`` reports the configured columns, read-only."""
    expected = (0, 1)
    with_filters = striped_orc_data(
        "struct<col0:int,col1:string>",
        ((i, "Test {}".format(i + 1)) for i in range(100000)),
        bfc=expected,
    )
    reader = Reader(with_filters)
    # Both stripes report the same bloom filter columns.
    for stripe_index in (0, 1):
        assert Stripe(reader, stripe_index).bloom_filter_columns == expected

    # Without the ``bfc`` argument the stripe reports no bloom filter columns.
    without_filters = striped_orc_data("int", (i for i in range(100000)))
    stripe = Stripe(Reader(without_filters), 0)
    assert stripe.bloom_filter_columns == ()

    with pytest.raises(AttributeError):
        stripe.bloom_filter_columns = (0,)
def test_bytes_offset(striped_orc_data):
    """``Stripe.bytes_offset`` has the expected value and is read-only."""
    data = striped_orc_data("int", (i for i in range(100000)))
    second_stripe = Stripe(Reader(data), 1)

    # Hardcoded on purpose: the exact byte offset of the second stripe.
    assert second_stripe.bytes_offset == 658

    with pytest.raises(AttributeError):
        second_stripe.bytes_offset = 5
def test_getitem(striped_orc_data):
    """Indexing a Reader or a Stripe with a column index returns an object."""
    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)
    stripe = Stripe(reader, 0)

    file_column = reader[0]
    assert file_column is not None

    stripe_column = stripe[0]
    assert stripe_column is not None
def test_attributes(schema, attrs):
    """Attributes set on a schema are available again from the reader's schema."""
    buf = io.BytesIO()
    schema.set_attributes(attrs)

    # Write a file that carries the schema but no rows.
    Writer(buf, schema).close()

    reader = Reader(buf)
    assert len(reader) == 0
    assert reader.schema.attributes == attrs
def test_converting_predicate(orc_type, value):
    """An equality predicate on ``value`` and on ``None`` each select one row."""
    # Decimal columns additionally get precision and scale.
    decimal_args = {}
    if orc_type.kind == TypeKind.DECIMAL:
        decimal_args = {"precision": orc_type.precision, "scale": orc_type.scale}
    pred_col = PredicateColumn(orc_type.kind, "c0", **decimal_args)

    buf = io.BytesIO()
    with Writer(buf, f"struct<c0:{orc_type}>", row_index_stride=1) as writer:
        writer.write((value,))
        writer.write((None,))

    matching = list(Reader(buf, predicate=(pred_col == value)))
    assert matching == [(value,)]

    # ``== None`` is deliberate: the comparison builds a predicate object, so
    # it must not be rewritten as ``is None``.
    nulls = list(Reader(buf, predicate=(pred_col == None)))
    assert nulls == [(None,)]
def test_selected_schema():
    """``Reader.selected_schema`` holds only the selected columns and is read-only.

    The full ``schema`` keeps every column of the file, while
    ``selected_schema`` is narrowed to the columns named in ``column_names``.
    """
    schema_str = "struct<col0:int,col1:string>"
    data = io.BytesIO()
    Writer(data, schema_str).close()

    reader = Reader(data, column_names=("col1",))
    assert str(reader.schema) == schema_str
    assert str(reader.selected_schema) != str(reader.schema)

    with pytest.raises(AttributeError):
        reader.selected_schema = "fail"
    with pytest.raises(AttributeError):
        del reader.selected_schema

    # The remaining assertions run on the schema after the reader is deleted.
    schema = reader.selected_schema
    del reader
    # Fixed: the class is ``TypeDescription``; the lowercase spelling was an
    # undefined name and raised NameError.
    assert isinstance(schema, TypeDescription)
    assert schema.kind == TypeKind.STRUCT
    assert str(schema) == "struct<col1:string>"
def test_row_offset(striped_orc_data):
    """``Stripe.row_offset`` is the index of the stripe's first row, read-only."""
    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)

    first = Stripe(reader, 0)
    assert first.row_offset == 0

    # The second stripe begins where the first one ends.
    second = Stripe(reader, 1)
    assert second.row_offset == len(first)

    with pytest.raises(AttributeError):
        first.row_offset = 5
def test_wrong_predicate():
    """Passing a plain string as ``predicate`` raises TypeError."""
    buf = io.BytesIO()
    with Writer(buf, "struct<c0:int,c1:string>", row_index_stride=100) as writer:
        writer.writerows((i, "Odd" if i % 2 else "Even") for i in range(1000))
    buf.seek(0)

    with pytest.raises(TypeError):
        _ = Reader(buf, predicate="wrong")
def test_bytes_lengths():
    """``Reader.bytes_lengths`` reports the sizes of the file's sections."""
    # A file without rows, written with ``compression=0``.
    empty = io.BytesIO()
    Writer(empty, "string", compression=0).close()
    reader = Reader(empty)
    expected_empty = (
        ("content_length", 0),
        ("file_footer_length", 38),
        ("file_postscript_length", 23),
        ("file_length", 65),
        ("stripe_statistics_length", 0),
    )
    for key, expected in expected_empty:
        assert reader.bytes_lengths[key] == expected

    # A file holding 100 int rows, written with the default compression.
    filled = io.BytesIO()
    with Writer(filled, "int") as writer:
        writer.writerows(range(100))
    reader = Reader(filled)
    expected_filled = (
        ("content_length", 76),
        ("file_footer_length", 59),
        ("file_postscript_length", 23),
        ("file_length", len(filled.getvalue())),
        ("stripe_statistics_length", 21),
    )
    for key, expected in expected_filled:
        assert reader.bytes_lengths[key] == expected
def test_len():
    """``len(reader)`` equals the number of rows written: 0, 1 and 10."""
    # NOTE(review): an earlier ``test_len`` (taking ``striped_orc_data``) is
    # defined above in this file. If both live in the same module, this
    # definition replaces it -- consider distinct names.
    schema = "struct<col0:int,col1:string>"

    no_rows = io.BytesIO()
    Writer(no_rows, schema).close()
    assert len(Reader(no_rows)) == 0

    one_row = io.BytesIO()
    with Writer(one_row, schema) as writer:
        writer.write((0, "Test A"))
    assert len(Reader(one_row)) == 1

    ten_rows = io.BytesIO()
    with Writer(ten_rows, schema) as writer:
        for i in range(10):
            writer.write((i, "Test"))
    assert len(Reader(ten_rows)) == 10
def test_converting_predicate_error():
    """Each of these Reader constructions with a string predicate raises TypeError.

    The three cases differ only in the comparison operator (``<``, ``==``,
    ``<=``) applied to a ``PredicateColumn`` created without a column name or
    index.
    """
    data = io.BytesIO()
    # Plain literal: the original used an f-string with no placeholders.
    with Writer(data, "struct<c0:string>", row_index_stride=1) as writer:
        writer.write(("test",))

    with pytest.raises(TypeError):
        _ = Reader(
            data,
            predicate=(PredicateColumn(TypeKind.STRING) < "test"),
        )
    with pytest.raises(TypeError):
        _ = Reader(
            data,
            predicate=(PredicateColumn(TypeKind.STRING) == "test"),
        )
    with pytest.raises(TypeError):
        _ = Reader(
            data,
            predicate=(PredicateColumn(TypeKind.STRING) <= "test"),
        )