Esempio n. 1
0
def orc_type(field):
    """Translate an Arrow-style data type into the matching ``pyorc`` type.

    ``field`` is probed with the ``pa.types.is_*`` predicates in a fixed
    order, and the first predicate that accepts it decides which ``pyorc``
    type object is built.  Decimal types carry over ``precision`` and
    ``scale``; list types are converted recursively via ``value_type``.

    Raises:
        ValueError: if none of the supported predicates accepts ``field``.
    """
    # Ordered (predicate name on ``pa.types``, builder) pairs.  Predicates are
    # looked up by name and builders are lambdas so that nothing on ``pa`` or
    # ``pyorc`` is touched until the corresponding rule is actually tried.
    rules = (
        ('is_boolean', lambda t: pyorc.Boolean()),
        ('is_int8', lambda t: pyorc.TinyInt()),
        ('is_int16', lambda t: pyorc.SmallInt()),
        ('is_int32', lambda t: pyorc.Int()),
        ('is_int64', lambda t: pyorc.BigInt()),
        ('is_float32', lambda t: pyorc.Float()),
        ('is_float64', lambda t: pyorc.Double()),
        ('is_decimal', lambda t: pyorc.Decimal(t.precision, t.scale)),
        # Lists recurse on the element type.
        ('is_list', lambda t: pyorc.Array(orc_type(t.value_type))),
        ('is_timestamp', lambda t: pyorc.Timestamp()),
        ('is_date', lambda t: pyorc.Date()),
        ('is_binary', lambda t: pyorc.Binary()),
        ('is_string', lambda t: pyorc.String()),
    )

    for predicate_name, build in rules:
        if getattr(pa.types, predicate_name)(field):
            return build(field)

    raise ValueError('Cannot Convert %s' % field)
Esempio n. 2
0
def generate_list_struct_buff(size=28000):
    """Write randomly generated nested list/struct columns to an in-memory buffer.

    Six columns of ``size`` rows each are generated and written through
    ``po.Writer`` into a fresh ``BytesIO``.  Two random sources are used side
    by side: a private ``random.Random(1)`` instance and NumPy's *global*
    generator, which is re-seeded to 1 here (a side effect visible to other
    users of ``np.random``).  Because both are seeded, the generated data is
    reproducible for a given ``size`` as long as the order of the draws below
    is not changed.

    Args:
        size: Number of rows generated for every column.

    Returns:
        The ``BytesIO`` buffer the writer wrote into.  No ``seek`` is
        performed on it in this function.
    """
    rd = random.Random(1)
    np.random.seed(seed=1)

    buff = BytesIO()

    # Column name -> ORC type.  Every leaf is a BigInt; only the nesting of
    # arrays and structs differs between columns.
    schema = {
        "lvl3_list": po.Array(po.Array(po.Array(po.BigInt()))),
        "lvl1_list": po.Array(po.BigInt()),
        "lvl1_struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
        "lvl2_struct": po.Struct(
            **{
                "a": po.BigInt(),
                "lvl1_struct": po.Struct(
                    **{"c": po.BigInt(), "d": po.BigInt()}
                ),
            }
        ),
        "list_nests_struct": po.Array(
            po.Array(po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}))
        ),
        "struct_nests_list": po.Struct(
            **{
                "struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
                "list": po.Array(po.BigInt()),
            }
        ),
    }

    # The top-level row type is a struct made of the columns above.
    schema = po.Struct(**schema)

    # Each row is either None or a three-level nested list: 0-2 outer
    # entries, each holding 0-2 lists of 1-2 values, where a value is None or
    # an int in {1, 2}.
    # NOTE: the candidate list handed to rd.choice is built before the choice
    # is made, so the NumPy draws happen for every row, even when None wins.
    lvl3_list = [
        rd.choice(
            [
                None,
                [
                    [
                        [
                            rd.choice([None, np.random.randint(1, 3)])
                            for z in range(np.random.randint(1, 3))
                        ]
                        for z in range(np.random.randint(0, 3))
                    ]
                    for y in range(np.random.randint(0, 3))
                ],
            ]
        )
        for x in range(size)
    ]
    # Each row is a list of 1-3 values, each None or an int in {0, 1, 2};
    # the row itself is never None.
    lvl1_list = [
        [
            rd.choice([None, np.random.randint(0, 3)])
            for y in range(np.random.randint(1, 4))
        ]
        for x in range(size)
    ]
    # Each row is None or a 2-tuple of ints in {0, 1, 2}.
    # Presumably the tuple maps positionally onto struct fields (a, b) -
    # confirm against the pyorc struct representation in use.
    lvl1_struct = [
        rd.choice([None, (np.random.randint(0, 3), np.random.randint(0, 3))])
        for x in range(size)
    ]
    # Each row is None or (a, (c, d)): ``a`` and ``c`` may be None, while
    # ``d`` is always an int in {0, 1, 2}.
    lvl2_struct = [
        rd.choice(
            [
                None,
                (
                    rd.choice([None, np.random.randint(0, 3)]),
                    (
                        rd.choice([None, np.random.randint(0, 3)]),
                        np.random.randint(0, 3),
                    ),
                ),
            ]
        )
        for x in range(size)
    ]
    # Each row holds 1-3 inner lists of exactly two entries, each entry
    # sampled from the already generated lvl1_struct rows (so an entry can be
    # None).
    list_nests_struct = [
        [
            [rd.choice(lvl1_struct), rd.choice(lvl1_struct)]
            for y in range(np.random.randint(1, 4))
        ]
        for x in range(size)
    ]
    # Row x pairs the struct and the list generated for the same row index.
    struct_nests_list = [(lvl1_struct[x], lvl1_list[x]) for x in range(size)]

    # The DataFrame is only used to turn the per-column lists into per-row
    # tuples (see itertuples below).
    df = pd.DataFrame(
        {
            "lvl3_list": lvl3_list,
            "lvl1_list": lvl1_list,
            "lvl1_struct": lvl1_struct,
            "lvl2_struct": lvl2_struct,
            "list_nests_struct": list_nests_struct,
            "struct_nests_list": struct_nests_list,
        }
    )

    # NOTE(review): stripe_size=1024 is presumably chosen small so the output
    # spans several stripes - confirm against the pyorc Writer documentation.
    writer = po.Writer(buff, schema, stripe_size=1024)
    # One plain tuple per row (no index, no namedtuple).  A row whose first
    # field is pd.NA is replaced by the one-element tuple (None,).
    # NOTE(review): it is not evident from this function when the first field
    # would be pd.NA - verify whether this branch is ever taken.
    tuples = list(
        map(
            lambda x: (None,) if x[0] is pd.NA else x,
            list(df.itertuples(index=False, name=None)),
        )
    )
    writer.writerows(tuples)
    writer.close()

    return buff
Esempio n. 3
0
def gen_map_buff(size=10000):
    """Write randomly generated map columns to an in-memory buffer.

    Three map columns of ``size`` rows each are generated and written through
    ``po.Writer`` (with ``CompressionKind.NONE``) into a fresh ``BytesIO``.
    Two random sources are used side by side: a private ``random.Random(1)``
    instance and NumPy's *global* generator, which is re-seeded to 1 here (a
    side effect visible to other users of ``np.random``).  Because both are
    seeded, the generated data is reproducible for a given ``size`` as long
    as the order of the draws below is not changed.

    Args:
        size: Number of rows generated for every column.

    Returns:
        The ``BytesIO`` buffer the writer wrote into.  No ``seek`` is
        performed on it in this function.
    """
    from string import ascii_letters as al

    rd = random.Random(1)
    np.random.seed(seed=1)

    buff = BytesIO()

    # Column name -> ORC type.  All maps are keyed by string; the value is a
    # BigInt, an array of BigInt, or a struct of two BigInts respectively.
    schema = {
        "lvl1_map":
        po.Map(key=po.String(), value=po.BigInt()),
        "lvl2_map":
        po.Map(key=po.String(), value=po.Array(po.BigInt())),
        "lvl2_struct_map":
        po.Map(
            key=po.String(),
            value=po.Struct(**{
                "a": po.BigInt(),
                "b": po.BigInt()
            }),
        ),
    }

    # The top-level row type is a struct made of the columns above.
    schema = po.Struct(**schema)

    # Each row is None or a list of two (key, value) pairs: the key is one
    # random ASCII letter, the value is None or an int in [1, 1500).
    # The two keys of a row are drawn independently, so they can collide.
    # NOTE: the candidate list handed to rd.choice is built before the choice
    # is made, so the draws happen for every row, even when None wins.
    lvl1_map = [
        rd.choice([
            None,
            [(
                rd.choice(al),
                rd.choice([None, np.random.randint(1, 1500)]),
            ) for y in range(2)],
        ]) for x in range(size)
    ]
    # Same layout, but each value is None or a list of five entries, each of
    # which is None or an int in [1, 1500).
    lvl2_map = [
        rd.choice([
            None,
            [(
                rd.choice(al),
                rd.choice([
                    None,
                    [
                        rd.choice([None, np.random.randint(1, 1500)])
                        for z in range(5)
                    ],
                ]),
            ) for y in range(2)],
        ]) for x in range(size)
    ]
    # Same layout, but each value is None or a 2-tuple whose members are each
    # None or an int in [1, 1500).
    # Presumably the tuple maps positionally onto struct fields (a, b) -
    # confirm against the pyorc struct representation in use.
    lvl2_struct_map = [
        rd.choice([
            None,
            [(
                rd.choice(al),
                rd.choice([
                    None,
                    (
                        rd.choice([None, np.random.randint(1, 1500)]),
                        rd.choice([None, np.random.randint(1, 1500)]),
                    ),
                ]),
            ) for y in range(2)],
        ]) for x in range(size)
    ]

    # The DataFrame is only used to turn the per-column lists into per-row
    # tuples (see itertuples below).
    pdf = pd.DataFrame({
        "lvl1_map": lvl1_map,
        "lvl2_map": lvl2_map,
        "lvl2_struct_map": lvl2_struct_map,
    })
    # NOTE(review): stripe_size=1024 is presumably chosen small so the output
    # spans several stripes - confirm against the pyorc Writer documentation.
    writer = po.Writer(buff,
                       schema,
                       stripe_size=1024,
                       compression=po.CompressionKind.NONE)
    # One plain tuple per row (no index, no namedtuple).  A row whose first
    # field is pd.NA is replaced by the one-element tuple (None,).
    # NOTE(review): it is not evident from this function when the first field
    # would be pd.NA - verify whether this branch is ever taken.
    tuples = list(
        map(
            lambda x: (None, ) if x[0] is pd.NA else x,
            list(pdf.itertuples(index=False, name=None)),
        ))

    writer.writerows(tuples)
    writer.close()

    return buff