def test_bytes_offset(striped_orc_data):
    """Stripe 1 reports a byte offset of 658 and the attribute is read-only."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    second_stripe = Stripe(orc_reader, 1)

    # Hardcoded offset; presumably tied to the exact bytes the fixture
    # writes -- confirm if the fixture's writer settings ever change.
    assert second_stripe.bytes_offset == 658

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        second_stripe.bytes_offset = 5
def test_bytes_length(striped_orc_data):
    """Stripe 1 reports a byte length of 392 and the attribute is read-only."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    second_stripe = Stripe(orc_reader, 1)

    # Hardcoded length; presumably tied to the exact bytes the fixture
    # writes -- confirm if the fixture's writer settings ever change.
    assert second_stripe.bytes_length == 392

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        second_stripe.bytes_length = "false"
def test_row_offset(striped_orc_data):
    """Row offsets start at 0 and stripe 1 begins where stripe 0 ends."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)

    first_stripe = Stripe(orc_reader, 0)
    assert first_stripe.row_offset == 0

    # The second stripe's first row index equals the first stripe's length.
    second_stripe = Stripe(orc_reader, 1)
    assert second_stripe.row_offset == len(first_stripe)

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        first_stripe.row_offset = 5
def test_init(striped_orc_data):
    """Stripe() rejects bad readers and indices and accepts a valid pair."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)

    # Anything that is not a reader object is a type error.
    for bad_reader in (None, "reader"):
        with pytest.raises(TypeError):
            Stripe(bad_reader, 0)

    # Stripe index 3 is expected to be out of range for this file.
    with pytest.raises(IndexError):
        Stripe(orc_reader, 3)

    # The stripe index must not be a string.
    with pytest.raises(TypeError):
        Stripe(orc_reader, "col")

    valid_stripe = Stripe(orc_reader, 0)
    assert valid_stripe is not None
def test_writer_timezone(striped_orc_data):
    """Stripe 1 of a timestamp file reports "UTC" and the attribute is read-only."""

    def get_dt():
        # Yield UTC datetimes from the first to the last instant (inclusive)
        # in 10-second steps.
        current = datetime(2010, 9, 1, 7, 0, 0, 0, timezone.utc)
        last = datetime(2010, 9, 10, 12, 0, 0, 0, timezone.utc)
        step = timedelta(seconds=10)
        while current <= last:
            yield current
            current += step

    orc_file = striped_orc_data("timestamp", get_dt())
    orc_reader = Reader(orc_file)
    second_stripe = Stripe(orc_reader, 1)

    assert second_stripe.writer_timezone == "UTC"

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        second_stripe.writer_timezone = "UTC-9:00"
def test_statistics_string(striped_orc_data):
    """Check string column statistics per stripe and for the whole file."""
    rows = ("Test String {0}".format(n + 1) for n in range(100000))
    orc_file = striped_orc_data("string", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # Statistics of column 0 restricted to the first stripe.
    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["kind"] == TypeKind.STRING
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["total_length"] == sum(len(row) for row in first_stripe)
    assert stripe_stat["minimum"] == "Test String 1"
    # A fresh Stripe object is iterated here, as first_stripe was already
    # iterated for the total_length check above.
    assert stripe_stat["maximum"] == max(Stripe(orc_reader, 0))

    # Statistics of column 0 over the whole file.
    file_stat = orc_reader[0].statistics
    assert file_stat["maximum"] == max(orc_reader)

    # Minimum is compared as a string, so the 100000th value is expected here.
    second_stripe_stat = orc_reader.read_stripe(1)[0].statistics
    assert second_stripe_stat["minimum"] == "Test String 100000"
def test_len(striped_orc_data):
    """len() of a stripe differs from len() of the reader and equals 65535."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # The file spans more than one stripe, so the two lengths must differ.
    assert len(orc_reader) != len(first_stripe)
    assert len(first_stripe) == 65535
def test_getitem(striped_orc_data):
    """Indexing a reader or a stripe with 0 yields a non-None object."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    reader_column = orc_reader[0]
    assert reader_column is not None

    stripe_column = first_stripe[0]
    assert stripe_column is not None
def test_bloom_filter_columns(striped_orc_data):
    """bloom_filter_columns echoes the configured columns and is read-only."""
    expected = (0, 1)
    struct_rows = ((n, "Test {}".format(n + 1)) for n in range(100000))
    orc_file = striped_orc_data(
        "struct<col0:int,col1:string>",
        struct_rows,
        bfc=expected,
    )
    orc_reader = Reader(orc_file)

    # Both stripes report the columns passed through the `bfc` argument.
    for stripe_index in (0, 1):
        assert Stripe(orc_reader, stripe_index).bloom_filter_columns == expected

    # Without the `bfc` argument the attribute is an empty tuple.
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    plain_stripe = Stripe(orc_reader, 0)
    assert plain_stripe.bloom_filter_columns == ()

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        plain_stripe.bloom_filter_columns = (0,)
def test_statistics_binary(striped_orc_data):
    """Check binary column statistics per stripe and for the whole file."""
    rows = (b"\x4D\x45\x34\x01" for _ in range(100000))
    orc_file = striped_orc_data("binary", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # Statistics of column 0 restricted to the first stripe.
    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["kind"] == TypeKind.BINARY
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["total_length"] == sum(len(row) for row in first_stripe)

    # Statistics of column 0 over the whole file.
    file_stat = orc_reader[0].statistics
    assert file_stat["total_length"] == sum(len(row) for row in orc_reader)
def test_statistics_date(striped_orc_data):
    """Check date column statistics per stripe and for the whole file."""
    first_day = date(1900, 1, 1)
    rows = (first_day + timedelta(days=n) for n in range(100000))
    orc_file = striped_orc_data("date", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # Statistics of column 0 restricted to the first stripe.
    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["kind"] == TypeKind.DATE
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["minimum"] == date(1900, 1, 1)
    assert stripe_stat["maximum"] == date(2079, 6, 5)

    # Statistics of column 0 over the whole file.
    file_stat = orc_reader[0].statistics
    assert file_stat["maximum"] == max(orc_reader)
def test_statistics_int(striped_orc_data):
    """Check int column statistics per stripe and for the whole file."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # Statistics of column 0 restricted to the first stripe.
    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["kind"] == TypeKind.INT
    assert stripe_stat["minimum"] == 0
    assert stripe_stat["maximum"] == 65534
    # The written values are 0, 1, 2, ... so the stripe sum is that of the
    # first len(first_stripe) non-negative integers.
    assert stripe_stat["sum"] == sum(range(len(first_stripe)))

    # Statistics of column 0 over the whole file.
    file_stat = orc_reader[0].statistics
    assert file_stat["minimum"] == 0
    assert file_stat["maximum"] == 99999
    assert file_stat["sum"] == sum(range(100000))

    # The second stripe starts with the value following the first stripe's maximum.
    second_stripe_stat = orc_reader.read_stripe(1)[0].statistics
    assert second_stripe_stat["minimum"] == 65535
def test_statistics_double(striped_orc_data):
    """Check double column statistics per stripe and for the whole file.

    Non-zero floating-point expectations are compared with ``math.isclose``
    rather than ``==``: the expected sums are accumulated in Python while the
    stored statistics are produced by the writer, so bit-for-bit equality
    would depend on both sides rounding identically.
    """
    orc_file = striped_orc_data("double", (n * 0.1 for n in range(100000)))
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # Statistics of column 0 restricted to the first stripe.
    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["kind"] == TypeKind.DOUBLE
    # 0 * 0.1 is exactly 0.0, so an exact comparison is safe here.
    assert stripe_stat["minimum"] == 0
    assert math.isclose(stripe_stat["maximum"], 6553.4)
    expected_stripe_sum = sum(n * 0.1 for n in range(len(first_stripe)))
    assert math.isclose(stripe_stat["sum"], expected_stripe_sum)

    # Statistics of column 0 over the whole file.
    file_stat = orc_reader[0].statistics
    assert file_stat["minimum"] == 0
    assert math.isclose(file_stat["maximum"], 9999.9)
    expected_file_sum = sum(n * 0.1 for n in range(100000))
    assert math.isclose(file_stat["sum"], expected_file_sum)

    # The second stripe starts with the value following the first stripe's maximum.
    second_stripe_stat = orc_reader.read_stripe(1)[0].statistics
    assert math.isclose(second_stripe_stat["minimum"], 6553.5)
# NOTE(review): another `test_init` is defined earlier in this view. If both
# live in the same module, this definition replaces the earlier one and pytest
# collects only this test -- confirm whether they belong to separate files.
def test_init(striped_orc_data):
    """Column() rejects bad indices and accepts index 0 for stripe and reader."""
    rows = ((n, n * 5) for n in range(100000))
    orc_file = striped_orc_data("struct<a:int,b:int>", rows)
    # Only column index 1 is selected when opening the file.
    orc_reader = Reader(orc_file, column_indices=(1, ))
    first_stripe = Stripe(orc_reader, 0)

    # The column index must not be a string.
    with pytest.raises(TypeError):
        Column(first_stripe, "0")

    # Each of these (owner, index) pairs is expected to be out of range.
    for owner, bad_index in ((first_stripe, 100), (orc_reader, 100), (orc_reader, 1)):
        with pytest.raises(IndexError):
            Column(owner, bad_index)

    # Index 0 is accepted for both a stripe and a reader.
    stripe_column = Column(first_stripe, 0)
    assert stripe_column is not None
    reader_column = Column(orc_reader, 0)
    assert reader_column is not None
def test_statistics_bool(striped_orc_data):
    """Check boolean column statistics (with nulls) inside a struct."""
    # Rows cycle through True, False, None, so one value in three is null.
    cycle = (True, False, None)
    rows = ((cycle[n % 3], ) for n in range(100000))
    orc_file = striped_orc_data("struct<a:boolean>", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # Column 0 is the enclosing struct.
    struct_stat = first_stripe[0].statistics
    assert struct_stat["has_null"] is False
    assert struct_stat["number_of_values"] == 65535
    assert struct_stat["kind"] == TypeKind.STRUCT

    # Column 1 is the boolean field, restricted to the first stripe.
    bool_stat = first_stripe[1].statistics
    assert bool_stat["has_null"] is True
    assert bool_stat["kind"] == TypeKind.BOOLEAN
    assert bool_stat["number_of_values"] == 43690
    assert bool_stat["false_count"] == 21845
    stripe_true_total = sum(1 for (flag,) in first_stripe if flag is True)
    assert bool_stat["true_count"] == stripe_true_total

    # Column 1 over the whole file.
    file_bool_stat = orc_reader[1].statistics
    assert file_bool_stat["has_null"] is True
    assert file_bool_stat["number_of_values"] == 66667
    file_false_total = sum(1 for (flag,) in orc_reader if flag is False)
    assert file_bool_stat["false_count"] == file_false_total
    assert file_bool_stat["true_count"] == 33334

    # The struct column counts every row, including those whose field is null.
    assert orc_reader[0].statistics["number_of_values"] == 100000
def test_statistics_decimal(striped_orc_data):
    """Check decimal(10,3) column statistics per stripe and for the whole file."""

    def value_at(n):
        # Value written for row n; the float product is converted to
        # Decimal exactly as written, before any quantization.
        return Decimal("1000.1") + Decimal((n + 100) * 0.1)

    three_places = Decimal("1.000")

    orc_file = striped_orc_data(
        "decimal(10,3)",
        (value_at(n) for n in range(100000)),
    )
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # Statistics of column 0 restricted to the first stripe.
    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["kind"] == TypeKind.DECIMAL
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == len(first_stripe)
    assert stripe_stat["minimum"] == Decimal("1010.100")
    assert stripe_stat["maximum"] == Decimal("7563.500")
    # The expected sum is quantized to three decimal places before comparing.
    expected_stripe_sum = sum(value_at(n) for n in range(len(first_stripe)))
    assert stripe_stat["sum"] == expected_stripe_sum.quantize(three_places)

    # Statistics of column 0 over the whole file.
    file_stat = orc_reader[0].statistics
    expected_file_sum = sum(value_at(n) for n in range(100000))
    assert file_stat["sum"] == expected_file_sum.quantize(three_places)

    # The second stripe starts with the value following the first stripe's maximum.
    second_stripe_stat = orc_reader.read_stripe(1)[0].statistics
    assert second_stripe_stat["minimum"] == Decimal("7563.600")
def test_statistics_timestamp(striped_orc_data):
    """Check timestamp column statistics per stripe and for the whole file."""
    rows = (
        datetime(2000, 1, 1, 12, 0, tzinfo=timezone.utc) + timedelta(minutes=n)
        for n in range(100000)
    )
    orc_file = striped_orc_data("timestamp", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)
    first_instant = datetime(2000, 1, 1, 12, 0, tzinfo=timezone.utc)

    # Statistics of column 0 restricted to the first stripe.
    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["kind"] == TypeKind.TIMESTAMP
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == len(first_stripe)
    assert stripe_stat["minimum"] == first_instant
    assert stripe_stat["maximum"] == max(first_stripe)
    assert stripe_stat["lower_bound"] == first_instant
    # The expected upper bound carries 1000 microseconds beyond the minute.
    assert stripe_stat["upper_bound"] == datetime(
        2000, 2, 16, 0, 14, 0, 1000, tzinfo=timezone.utc
    )

    # Statistics of column 0 over the whole file.
    file_stat = orc_reader[0].statistics
    assert file_stat["maximum"] == max(orc_reader)
    assert file_stat["upper_bound"] == datetime(
        2000, 3, 10, 22, 39, 0, 1000, tzinfo=timezone.utc
    )