def test_read_stripe(striped_orc_data):
    """Exercise ``Reader.read_stripe`` with accepted and rejected indexes.

    Indexes 0 and 9 are expected to produce ``Stripe`` instances, 10 and 11
    are expected to raise ``IndexError`` and -1 is expected to raise
    ``TypeError``.
    """
    reader = Reader(striped_orc_data(655350))

    stripe = reader.read_stripe(0)
    assert isinstance(stripe, Stripe)

    # Probed in this exact order: 11, then -1, then 10.
    rejected = (
        (11, IndexError),
        (-1, TypeError),
        (10, IndexError),
    )
    for index, expected_error in rejected:
        with pytest.raises(expected_error):
            _ = reader.read_stripe(index)

    # 9 is the highest index this test expects to succeed.
    stripe = reader.read_stripe(9)
    assert isinstance(stripe, Stripe)
def test_statistics_array_int(striped_orc_data):
    """Compare ``array<int>`` column statistics with values read back.

    Column 1 is asserted to be the LIST column and column 2 its INT
    element column. The element column's sum/minimum/maximum for stripe 0
    and its file-level maximum are checked against the flattened rows.
    """

    def flatten(rows):
        # Each row is indexed at 0 to reach its list of integers.
        return (item for row in rows for item in row[0])

    rows = (([j + i for j in range(30)],) for i in range(100000))
    data = striped_orc_data("struct<list:array<int>>", rows)
    reader = Reader(data)

    stripe = reader.read_stripe(0)
    element_stat = stripe[2].statistics
    assert stripe[1].statistics["kind"] == TypeKind.LIST
    assert element_stat["kind"] == TypeKind.INT

    # Stripe 0 is read afresh for every aggregate, in sum/min/max order.
    checks = ((sum, "sum"), (min, "minimum"), (max, "maximum"))
    for aggregate, key in checks:
        assert aggregate(flatten(reader.read_stripe(0))) == element_stat[key]

    # File-level statistics are compared against every row of the reader.
    file_stat = reader[2].statistics
    assert max(flatten(reader)) == file_stat["maximum"]
def test_seek_and_read(striped_orc_data):
    """Walk through ``Stripe.seek``/``Stripe.read`` on a two-column file.

    Rows are ``(i, "Test <i + 1>")`` for ``i`` in ``range(100000)``. The
    assertions pin which row follows each seek and which slice of
    ``Reader.read()`` each ``Stripe.read()`` is expected to equal.
    """
    rows = ((i, "Test {}".format(i + 1)) for i in range(100000))
    data = striped_orc_data("struct<col0:int,col1:string>", rows)
    reader = Reader(data)

    # Stripe 1: its first row is expected to be row 65535 of the file.
    stripe = reader.read_stripe(1)
    assert next(stripe) == (65535, "Test 65536")
    stripe.seek(10000)
    assert next(stripe) == (75535, "Test 75536")
    # seek(-1, 2) is expected to land on the stripe's final row.
    stripe.seek(-1, 2)
    assert next(stripe) == (99999, "Test 100000")

    # Stripe 0: the same end-relative seek, then a seek with whence 1.
    stripe = reader.read_stripe(0)
    stripe.seek(-1, 2)
    assert next(stripe) == (65534, "Test 65535")
    stripe.seek(0)
    next(stripe)
    # After consuming one row, seek(10000, 1) is expected to reach 10001.
    stripe.seek(10000, 1)
    assert next(stripe) == (10001, "Test 10002")

    # The rest of stripe 0 must match the corresponding slice of the file.
    expected = reader.read()
    remainder = stripe.read()
    assert remainder == expected[10002:65535]

    # A freshly opened stripe 1 must match everything from row 65535 on.
    stripe = reader.read_stripe(1)
    assert stripe.read() == expected[65535:]
def test_statistics_int(striped_orc_data):
    """Check ``int`` column statistics for stripe 0 and for the whole file.

    The file holds the integers 0..99999. Stripe 0 is expected to report
    65535 values, and stripe 1 is expected to start at 65535.
    """
    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)

    # Stripe-level statistics of column 0.
    stripe = Stripe(reader, 0)
    stripe_stat = stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["kind"] == TypeKind.INT
    assert stripe_stat["minimum"] == 0
    assert stripe_stat["maximum"] == 65534
    assert stripe_stat["sum"] == sum(range(len(stripe)))

    # File-level statistics of column 0.
    file_stat = reader[0].statistics
    assert file_stat["minimum"] == 0
    assert file_stat["maximum"] == 99999
    assert file_stat["sum"] == sum(range(100000))

    second_stripe_stat = reader.read_stripe(1)[0].statistics
    assert second_stripe_stat["minimum"] == 65535
def test_statistics_string(striped_orc_data):
    """Check ``string`` column statistics for stripe 0 and the whole file.

    The file holds ``"Test String <n>"`` for ``n`` from 1 to 100000.
    Length and maximum statistics are compared against values obtained by
    iterating the stripe and the reader.
    """
    values = ("Test String {0}".format(i + 1) for i in range(100000))
    data = striped_orc_data("string", values)
    reader = Reader(data)

    stripe = Stripe(reader, 0)
    stripe_stat = stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["kind"] == TypeKind.STRING
    assert stripe_stat["number_of_values"] == 65535
    # Iterating the stripe yields the stored strings themselves.
    assert stripe_stat["total_length"] == sum(map(len, stripe))
    assert stripe_stat["minimum"] == "Test String 1"
    # A second Stripe object is opened for the maximum check.
    assert stripe_stat["maximum"] == max(Stripe(reader, 0))

    file_stat = reader[0].statistics
    assert file_stat["maximum"] == max(reader)

    second_stripe_stat = reader.read_stripe(1)[0].statistics
    assert second_stripe_stat["minimum"] == "Test String 100000"
def test_statistics_double(striped_orc_data):
    """Check ``double`` column statistics for stripe 0 and the whole file.

    The file holds ``i * 0.1`` for ``i`` in ``range(100000)``. Stripe 0 is
    expected to report 65535 values, and stripe 1 is expected to start at
    6553.5.

    The ``sum`` statistics are compared with ``math.isclose`` rather than
    ``==``: Python 3.12 changed the built-in ``sum`` to a higher-accuracy
    algorithm for floats, so the expected total computed here is not
    guaranteed to be bit-identical to a total accumulated elsewhere.
    """
    data = striped_orc_data("double", (i * 0.1 for i in range(100000)))
    reader = Reader(data)

    # Stripe-level statistics of column 0.
    stripe = Stripe(reader, 0)
    stat = stripe[0].statistics
    assert stat["has_null"] is False
    assert stat["number_of_values"] == 65535
    assert stat["kind"] == TypeKind.DOUBLE
    assert stat["minimum"] == 0
    assert math.isclose(stat["maximum"], 6553.4)
    expected_stripe_sum = sum(i * 0.1 for i in range(len(stripe)))
    assert math.isclose(stat["sum"], expected_stripe_sum)

    # File-level statistics of column 0.
    stat = reader[0].statistics
    assert stat["minimum"] == 0
    assert math.isclose(stat["maximum"], 9999.9)
    expected_file_sum = sum(i * 0.1 for i in range(100000))
    assert math.isclose(stat["sum"], expected_file_sum)

    assert reader.read_stripe(1)[0].statistics["minimum"] == 6553.5
def test_statistics_decimal(striped_orc_data):
    """Check ``decimal(10,3)`` column statistics for stripe 0 and the file.

    Row ``i`` holds ``Decimal("1000.1") + Decimal((i + 100) * 0.1)``. The
    expected sums are rebuilt from the same expression and quantized to
    three decimal places before comparison.
    """

    def value_at(i):
        # Same expression that generates the written rows.
        return Decimal("1000.1") + Decimal((i + 100) * 0.1)

    def quantized_total(count):
        # Total of the first ``count`` values at three decimal places.
        total = sum(value_at(i) for i in range(count))
        return total.quantize(Decimal("1.000"))

    data = striped_orc_data(
        "decimal(10,3)",
        (value_at(i) for i in range(100000)),
    )
    reader = Reader(data)

    # Stripe-level statistics of column 0.
    stripe = Stripe(reader, 0)
    stripe_stat = stripe[0].statistics
    assert stripe_stat["kind"] == TypeKind.DECIMAL
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == len(stripe)
    assert stripe_stat["minimum"] == Decimal("1010.100")
    assert stripe_stat["maximum"] == Decimal("7563.500")
    assert stripe_stat["sum"] == quantized_total(len(stripe))

    # File-level statistics of column 0.
    file_stat = reader[0].statistics
    assert file_stat["sum"] == quantized_total(100000)

    second_stripe_stat = reader.read_stripe(1)[0].statistics
    assert second_stripe_stat["minimum"] == Decimal("7563.600")