Example #1
def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864):
    schema = get_orc_schema(df)

    tuple_list = _preprocess_to_orc_tuple(df)

    if file_name is not None:
        with open(file_name, "wb") as data:
            with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer:
                writer.writerows(tuple_list)
    elif file_io_obj is not None:
        with pyorc.Writer(file_io_obj, schema,
                          stripe_size=stripe_size) as writer:
            writer.writerows(tuple_list)
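Example #1 leans on two helpers, get_orc_schema and _preprocess_to_orc_tuple, that the snippet does not show. Below is a minimal sketch of what they could look like, assuming only flat columns with simple dtypes; the names come from the call sites above, but the bodies are illustrative rather than the project's real implementation.

import pandas as pd

# Illustrative helpers (assumed): a naive dtype-to-ORC mapping for flat frames.
_DTYPE_TO_ORC = {
    "int64": "bigint",
    "float64": "double",
    "bool": "boolean",
    "object": "string",
}

def get_orc_schema(df: pd.DataFrame) -> str:
    # Build a type string such as "struct<a:bigint,b:string>".
    cols = ",".join(
        f"{name}:{_DTYPE_TO_ORC.get(str(dtype), 'string')}"
        for name, dtype in df.dtypes.items()
    )
    return f"struct<{cols}>"

def _preprocess_to_orc_tuple(df: pd.DataFrame):
    # Replace missing values with None and emit one tuple per row.
    return [
        tuple(None if pd.isna(value) else value for value in row)
        for row in df.itertuples(index=False, name=None)
    ]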
Example #2
def write_orc(
    vineyard_socket,
    path,
    stream_id,
    storage_options,
    write_options,
    proc_num,
    proc_index,
):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: DataframeStream = streams[proc_index]
    reader = instream.open_reader(client)

    writer = None
    path += f"_{proc_index}"
    with fsspec.open(path, "wb", **storage_options) as f:
        while True:
            try:
                batch = reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                break
            if writer is None:
                # Build the ORC schema from the first record batch.
                schema = {}
                for field in batch.schema:
                    schema[field.name] = orc_type(field.type)
                writer = pyorc.Writer(f, pyorc.Struct(**schema))
            # Convert the Arrow batch to pandas and append its rows.
            writer.writerows(batch.to_pandas().itertuples(index=False, name=None))
        if writer is not None:
            writer.close()
Example #3
def write_hdfs_orc(vineyard_socket, stream_id, path, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f'Fetch stream error with proc_num={proc_num},proc_index={proc_index}'
        )
    instream = streams[proc_index]
    reader = instream.open_reader(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port))
    path = urlparse(path).path

    writer = None
    with hdfs.open(path, 'wb') as f:
        while True:
            try:
                buf = reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                break
            buf_reader = pa.ipc.open_stream(buf)
            if writer is None:
                # Build the ORC schema from the first Arrow stream.
                schema = {}
                for field in buf_reader.schema:
                    schema[field.name] = orc_type(field.type)
                writer = pyorc.Writer(f, pyorc.Struct(**schema))
            for batch in buf_reader:
                df = batch.to_pandas()
                writer.writerows(df.itertuples(index=False, name=None))
        if writer is not None:
            writer.close()
Example #4
def write_local_orc(vineyard_socket, stream_id, path, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f'Fetch stream error with proc_num={proc_num},proc_index={proc_index}'
        )
    instream = streams[proc_index]
    reader = instream.open_reader(client)

    writer = None
    with open(path, 'wb') as f:
        while True:
            try:
                buf = reader.next()
            except vineyard.StreamDrainedException:
                if writer is not None:
                    writer.close()
                break
            buf_reader = pa.ipc.open_stream(buf)
            if writer is None:
                # get schema
                schema = {}
                for field in buf_reader.schema:
                    schema[field.name] = orc_type(field.type)
                writer = pyorc.Writer(f, pyorc.Struct(**schema))
            while True:
                try:
                    batch = buf_reader.read_next_batch()
                except StopIteration:
                    break
                df = batch.to_pandas()
                writer.writerows(df.itertuples(False, None))
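Examples #2 through #4 all call an orc_type() helper that maps Arrow field types to pyorc type descriptions, but the helper itself is not shown. A minimal sketch covering a few common types (an assumption, not the original implementation):

import pyarrow as pa
import pyorc

def orc_type(arrow_type):
    # Map a pyarrow DataType to the matching pyorc type description.
    if pa.types.is_boolean(arrow_type):
        return pyorc.Boolean()
    if pa.types.is_integer(arrow_type):
        return pyorc.BigInt()
    if pa.types.is_floating(arrow_type):
        return pyorc.Double()
    if pa.types.is_timestamp(arrow_type):
        return pyorc.Timestamp()
    if pa.types.is_string(arrow_type):
        return pyorc.String()
    raise TypeError(f"unsupported Arrow type: {arrow_type}")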
Example #5
def encode_orc(filename: str,
               compression: str = None,
               columns: Iterable[str] = None,
               column_types: Iterable[str] = None,
               skip_header=True):
    buffer = io.BytesIO()
    with open(filename, 'rt') as fd:
        reader = csv.reader(fd)

        # Read column names from the first row when skipping the header
        # or when no column names were provided.
        if skip_header or columns is None:
            columns = next(reader)
            column_types = ['string'] * len(columns)

        struct = 'struct<{columns}>'.format(columns=','.join(
            name + ':' + (col_type if col_type else 'string')
            for name, col_type in zip_longest(columns, column_types)))

        if compression in (None, 'zlib', 'zstd'):
            compression_type = getattr(pyorc.CompressionKind,
                                       str(compression).upper())
        else:
            compression_type = pyorc.CompressionKind.NONE
        with pyorc.Writer(buffer, struct,
                          compression=compression_type) as writer:
            for row in reader:
                writer.write(tuple(row))

        if compression in (None, 'zlib', 'zstd'):
            return buffer.getvalue()
        else:
            buffer.seek(0)
            return compress(buffer, compression)
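A possible invocation of encode_orc, assuming a local CSV file named events.csv exists (the file name is only an example); the returned bytes are a complete ORC stream that can be written straight to disk:

orc_bytes = encode_orc("events.csv", compression="zlib")
with open("events.orc", "wb") as out:
    out.write(orc_bytes)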
Example #6
 def test_read(self):
     schema = 'struct<a:int,b:struct<x:string,y:boolean>>'
     files = []
     with tempfile.NamedTemporaryFile() as f1, \
          tempfile.NamedTemporaryFile() as f2:
         files.append(f1.name)
         with pyorc.Writer(f1, schema) as writer:
             writer.write((1, ('x', True)))
         files.append(f2.name)
         with pyorc.Writer(f2, schema) as writer:
             writer.write((2, ('y', False)))
             writer.write((3, ('z', False)))
         with TestPipeline() as p:
             pc = (p | Read(
                 FileSource(
                     file_patterns=files,
                     reader=OrcReader(pyorc_options={
                         'struct_repr': pyorc.StructRepr.DICT,
                     }))))
             # assert_that must be applied while the pipeline is still active.
             assert_that(
                 pc,
                 equal_to([
                     {
                         'a': 1,
                         'b': {
                             'x': 'x',
                             'y': True,
                         },
                     },
                     {
                         'a': 2,
                         'b': {
                             'x': 'y',
                             'y': False,
                         },
                     },
                     {
                         'a': 3,
                         'b': {
                             'x': 'z',
                             'y': False,
                         },
                     },
                 ]))
Example #7
def write_with_compression(df, schema, compression):
    with open(OUTPUT_FILE_PATH, "wb") as f:
        with pyorc.Writer(f, schema, compression=compression,
                          compression_strategy=pyorc.CompressionStrategy.COMPRESSION) as writer:
            start = timer()
            for i in range(len(df)):
                writer.write(tuple(df.iloc[i, :]))
    end = timer()
    print('Time to write orc with {} compression: {} seconds'.format(compression, end - start))
    print('Resulting size: {}'.format(util.get_readable_file_size(OUTPUT_FILE_PATH)))
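Example #7 reads like a benchmark helper. A possible driver, assuming df and schema are already prepared as in the snippet, would simply loop over the compression kinds pyorc exposes:

for kind in (pyorc.CompressionKind.NONE,
             pyorc.CompressionKind.ZLIB,
             pyorc.CompressionKind.ZSTD):
    write_with_compression(df, schema, kind)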
Example #8
 def write(self, srt_type: dict = None):
     headers = self.get_header()
     cols = []
     for key in headers:
         ctype = srt_type[key]
         cols.append(f'{key}:{ctype}')
     str_cols = ",".join(cols)
     struct_col = f"struct<{str_cols}>"
     with self.filepath().open("wb") as f:
         with pyorc.Writer(f, struct_col) as writer:
             for r in self.content:
                 writer.write(tuple(r.values()))
Example #9
 def _hdfs_flush(self, date, data):
     with self.conn.write(f"/krwordcloud/add-article/{date}.orc",
                          overwrite=True) as hf:
         tfname = ''
         with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as tf:
             tfname = tf.name
             with pyorc.Writer(
                     tf,
                     schema="struct<field0:timestamp,field1:string," +
                     "field2:string,field3:string>",
             ) as of:
                 of.writerows(data)
         with open(tfname, 'rb') as tf:
             for line in tf:
                 hf.write(line)
         os.unlink(tfname)
Example #10
def test_empty_statistics():
    buff = BytesIO()
    orc_schema = po.Struct(
        a=po.BigInt(),
        b=po.Double(),
        c=po.String(),
        d=po.Decimal(11, 2),
        e=po.Date(),
        f=po.Timestamp(),
        g=po.Boolean(),
        h=po.Binary(),
        i=po.BigInt(),
        # One column with non null value, else cudf/pyorc readers crash
    )
    data = tuple([None] * (len(orc_schema.fields) - 1) + [1])
    with po.Writer(buff, orc_schema) as writer:
        writer.write(data)

    got = cudf.io.orc.read_orc_statistics([buff])

    # Check for both file and stripe stats
    for stats in got:
        # Similar expected stats for the first 6 columns in this case
        for col_name in ascii_lowercase[:6]:
            assert stats[0][col_name].get("number_of_values") == 0
            assert stats[0][col_name].get("has_null") is True
            assert stats[0][col_name].get("minimum") is None
            assert stats[0][col_name].get("maximum") is None
        for col_name in ascii_lowercase[:3]:
            assert stats[0][col_name].get("sum") == 0
        # Sum for decimal column is a string
        assert stats[0]["d"].get("sum") == "0"

        assert stats[0]["g"].get("number_of_values") == 0
        assert stats[0]["g"].get("has_null") is True
        assert stats[0]["g"].get("true_count") == 0
        assert stats[0]["g"].get("false_count") == 0

        assert stats[0]["h"].get("number_of_values") == 0
        assert stats[0]["h"].get("has_null") is True
        assert stats[0]["h"].get("sum") == 0

        assert stats[0]["i"].get("number_of_values") == 1
        assert stats[0]["i"].get("has_null") is False
        assert stats[0]["i"].get("minimum") == 1
        assert stats[0]["i"].get("maximum") == 1
        assert stats[0]["i"].get("sum") == 1
Example #11
def test_statistics_sum_overflow():
    maxint64 = np.iinfo(np.int64).max
    minint64 = np.iinfo(np.int64).min

    buff = BytesIO()
    with po.Writer(buff, po.Struct(a=po.BigInt(), b=po.BigInt(),
                                   c=po.BigInt())) as writer:
        writer.write((maxint64, minint64, minint64))
        writer.write((1, -1, 1))

    file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
    assert file_stats[0]["a"].get("sum") is None
    assert file_stats[0]["b"].get("sum") is None
    assert file_stats[0]["c"].get("sum") == minint64 + 1

    assert stripe_stats[0]["a"].get("sum") is None
    assert stripe_stats[0]["b"].get("sum") is None
    assert stripe_stats[0]["c"].get("sum") == minint64 + 1
Example #12
 def start_exporting(self):
     """
         Triggered when Scrapy starts exporting. Useful to configure headers etc.
     """
     if not SUPPORTED_EXPORTERS['orc']:
         raise RuntimeError(
             "Error: Cannot export to orc. Cannot import pyorc. Have you installed it?"
         )
     self.orcwriter = pyorc.Writer(
         self.file,
         schema=self.orc_schemastring,
         batch_size=self.orc_batchsize,
         stripe_size=self.orc_stripesize,
         compression=self.orc_compression,
         compression_strategy=self.orc_compressionstrategy,
         compression_block_size=self.orc_blocksize,
         bloom_filter_columns=self.orc_bloomfiltercolumns,
         bloom_filter_fpp=self.orc_bloomfilterfpp,
         struct_repr=pyorc.StructRepr.DICT,
         converters=self.orc_converters)
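Example #12 only shows the writer being configured in start_exporting. The exporter's remaining methods are not shown; a hedged sketch of what they would likely look like given struct_repr=pyorc.StructRepr.DICT (assumed, not taken from the original source):

 def export_item(self, item):
     # With StructRepr.DICT the writer accepts one dict per row.
     self.orcwriter.write(dict(item))

 def finish_exporting(self):
     # Flush pending stripes and write the ORC footer.
     self.orcwriter.close()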
Example #13
def test_orc_read_skiprows(tmpdir):
    buff = BytesIO()
    df = pd.DataFrame(
        {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]},
        dtype=pd.BooleanDtype(),
    )
    writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean()))
    tuples = list(
        map(
            lambda x: (None, ) if x[0] is pd.NA else x,
            list(df.itertuples(index=False, name=None)),
        ))
    writer.writerows(tuples)
    writer.close()

    skiprows = 10

    expected = cudf.read_orc(buff)[skiprows::].reset_index(drop=True)
    got = cudf.read_orc(buff, skiprows=skiprows)

    assert_eq(expected, got)
Example #14
ORC_FILE = 'Orc/output/nodes.orc'
ORC_SNAPPY_FILE = 'Orc/output/snappy_nodes.orc'
ORC_ZLIB_FILE = 'Orc/output/zlib_nodes.orc'
JSON_FILE = 'Orc/output/nodes.json'

# Define data schema
schema = "struct<id:int,longitude:float,latitude:float,username:string>"

nodes = []
tree = ET.parse(open(SOURCE_FILE))
for node in tree.iterfind('node'):
    nodes.append((int(node.get('id')), float(node.get('lon')),
                  float(node.get('lat')), node.get('user')))

with open(ORC_FILE, "wb") as data:
    with pyorc.Writer(data, schema,
                      compression=pyorc.CompressionKind.NONE) as writer:
        for node in nodes:
            writer.write(node)

## Looks like SNAPPY and LZO compression aren't supported by ORC yet?
#
# with open(ORC_SNAPPY_FILE, "wb") as data:
#     with pyorc.Writer(data, schema, compression=pyorc.CompressionKind.SNAPPY) as writer:
#         for node in nodes:
#             writer.write(node)
##

with open(ORC_ZLIB_FILE, "wb") as data:
    with pyorc.Writer(data, schema,
                      compression=pyorc.CompressionKind.ZLIB) as writer:
        for node in nodes:
            writer.write(node)
Example #15
#!/usr/local/bin/python3
import pyorc
from uuid import uuid4

with open('./data.orc', 'wb') as data:
    with pyorc.Writer(data, 'struct<col0:int,col1:string,col2:string,col3:string,col4:string>') as writer:
        for idx in range(10000000):
            uuid = str(uuid4())
            writer.write((idx, uuid + '1', uuid + '2', uuid + '3', uuid + '4'))
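A minimal sketch of reading the generated file back with pyorc.Reader (not part of the original snippet); rows come back as tuples in schema order:

with open('./data.orc', 'rb') as data:
    reader = pyorc.Reader(data)
    print(str(reader.schema))    # struct<col0:int,col1:string,...>
    print(next(iter(reader)))    # first row as a tuple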
        
Example #16
def gen_map_buff(size=10000):
    from string import ascii_letters as al

    rd = random.Random(1)
    np.random.seed(seed=1)

    buff = BytesIO()

    schema = {
        "lvl1_map":
        po.Map(key=po.String(), value=po.BigInt()),
        "lvl2_map":
        po.Map(key=po.String(), value=po.Array(po.BigInt())),
        "lvl2_struct_map":
        po.Map(
            key=po.String(),
            value=po.Struct(**{
                "a": po.BigInt(),
                "b": po.BigInt()
            }),
        ),
    }

    schema = po.Struct(**schema)

    lvl1_map = [
        rd.choice([
            None,
            [(
                rd.choice(al),
                rd.choice([None, np.random.randint(1, 1500)]),
            ) for y in range(2)],
        ]) for x in range(size)
    ]
    lvl2_map = [
        rd.choice([
            None,
            [(
                rd.choice(al),
                rd.choice([
                    None,
                    [
                        rd.choice([None, np.random.randint(1, 1500)])
                        for z in range(5)
                    ],
                ]),
            ) for y in range(2)],
        ]) for x in range(size)
    ]
    lvl2_struct_map = [
        rd.choice([
            None,
            [(
                rd.choice(al),
                rd.choice([
                    None,
                    (
                        rd.choice([None, np.random.randint(1, 1500)]),
                        rd.choice([None, np.random.randint(1, 1500)]),
                    ),
                ]),
            ) for y in range(2)],
        ]) for x in range(size)
    ]

    pdf = pd.DataFrame({
        "lvl1_map": lvl1_map,
        "lvl2_map": lvl2_map,
        "lvl2_struct_map": lvl2_struct_map,
    })
    writer = po.Writer(buff,
                       schema,
                       stripe_size=1024,
                       compression=po.CompressionKind.NONE)
    tuples = list(
        map(
            lambda x: (None, ) if x[0] is pd.NA else x,
            list(pdf.itertuples(index=False, name=None)),
        ))

    writer.writerows(tuples)
    writer.close()

    return buff
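One way the generated buffer might be consumed, assuming a cudf environment as in the other tests shown here (illustrative usage, not from the original test suite):

buff = gen_map_buff(size=100)
gdf = cudf.read_orc(buff)
print(gdf.dtypes)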
Example #17
def generate_list_struct_buff(size=28000):
    rd = random.Random(1)
    np.random.seed(seed=1)

    buff = BytesIO()

    schema = {
        "lvl3_list": po.Array(po.Array(po.Array(po.BigInt()))),
        "lvl1_list": po.Array(po.BigInt()),
        "lvl1_struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
        "lvl2_struct": po.Struct(
            **{
                "a": po.BigInt(),
                "lvl1_struct": po.Struct(
                    **{"c": po.BigInt(), "d": po.BigInt()}
                ),
            }
        ),
        "list_nests_struct": po.Array(
            po.Array(po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}))
        ),
        "struct_nests_list": po.Struct(
            **{
                "struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
                "list": po.Array(po.BigInt()),
            }
        ),
    }

    schema = po.Struct(**schema)

    lvl3_list = [
        rd.choice(
            [
                None,
                [
                    [
                        [
                            rd.choice([None, np.random.randint(1, 3)])
                            for z in range(np.random.randint(1, 3))
                        ]
                        for z in range(np.random.randint(0, 3))
                    ]
                    for y in range(np.random.randint(0, 3))
                ],
            ]
        )
        for x in range(size)
    ]
    lvl1_list = [
        [
            rd.choice([None, np.random.randint(0, 3)])
            for y in range(np.random.randint(1, 4))
        ]
        for x in range(size)
    ]
    lvl1_struct = [
        rd.choice([None, (np.random.randint(0, 3), np.random.randint(0, 3))])
        for x in range(size)
    ]
    lvl2_struct = [
        rd.choice(
            [
                None,
                (
                    rd.choice([None, np.random.randint(0, 3)]),
                    (
                        rd.choice([None, np.random.randint(0, 3)]),
                        np.random.randint(0, 3),
                    ),
                ),
            ]
        )
        for x in range(size)
    ]
    list_nests_struct = [
        [
            [rd.choice(lvl1_struct), rd.choice(lvl1_struct)]
            for y in range(np.random.randint(1, 4))
        ]
        for x in range(size)
    ]
    struct_nests_list = [(lvl1_struct[x], lvl1_list[x]) for x in range(size)]

    df = pd.DataFrame(
        {
            "lvl3_list": lvl3_list,
            "lvl1_list": lvl1_list,
            "lvl1_struct": lvl1_struct,
            "lvl2_struct": lvl2_struct,
            "list_nests_struct": list_nests_struct,
            "struct_nests_list": struct_nests_list,
        }
    )

    writer = po.Writer(buff, schema, stripe_size=1024)
    tuples = list(
        map(
            lambda x: (None,) if x[0] is pd.NA else x,
            list(df.itertuples(index=False, name=None)),
        )
    )
    writer.writerows(tuples)
    writer.close()

    return buff
Example #18
response = detectlanguage.detect(df["text"].values.tolist())
first_languages = list(
    map(
        lambda x: x[0] if x else {
            'isReliable': False,
            'confidence': 0,
            'language': ''
        }, response))

new_df = pd.concat([df, pd.DataFrame(first_languages)], axis=1)

orc_file = ORC_FILE.format(datetime.now().strftime("%y%m%d"))
with open(orc_file, "wb") as data:
    with pyorc.Writer(
            data,
            "struct<text:string,isSpam:boolean,language:string,isReliable:boolean,confidence:float>",
            compression=pyorc.CompressionKind.ZLIB) as writer:
        for index, row in new_df.iterrows():
            writer.write((row['text'], row['isSpam'], row['language'],
                          row['isReliable'], row['confidence']))

new_df.to_csv(index=True)
print(f"Saved {len(new_df)} messages in {orc_file}.")

## For the future, to read the dataset
# with open(ORC_FILE, 'rb') as orc_file:
#     reader = pyorc.Reader(orc_file)

#     # Read embedded schema
#     print(str(reader.schema))