Example #1
# Requires the pure-Python `parquet` package. `get_filename_and_fobj`,
# `create_table` and `PARQUET_TO_ROWS` are helpers defined elsewhere in the
# rows library this function is taken from.
from collections import OrderedDict

import parquet


def import_from_parquet(filename_or_fobj, *args, **kwargs):
    """Import data from a Parquet file."""

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    # TODO: should look into `schema.converted_type` also
    types = OrderedDict([(schema.name, PARQUET_TO_ROWS[schema.type])
                         for schema in parquet._read_footer(fobj).schema
                         if schema.type is not None])
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {'imported_from': 'parquet', 'filename': filename,}
    return create_table([header] + table_rows, meta=meta, force_types=types,
                        *args, **kwargs)
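
For context, a minimal usage sketch: the rows library (which this function is taken from) exposes it as rows.import_from_parquet once the parquet dependency is installed; data.parquet is a placeholder filename.

import rows

# Hypothetical usage: `data.parquet` is a placeholder file.
table = rows.import_from_parquet("data.parquet")
for record in table:
    print(record)
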
Example #2
    def read(self, path: Path):
        try:
            import parquet
        except ImportError as e:
            raise ImportError(f"{e}. HINT: You can install Parquet by running "
                              "'pip install parquet'")

        with path.open("rt") as tf:
            columns = [self.lhs_col, self.rhs_col]
            if self.rel_col is not None:
                columns.append(self.rel_col)
            for row in parquet.reader(tf, columns=columns):
                if self.rel_col is not None:
                    yield tuple(row)
                else:
                    yield (row[0], row[1], None)
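
The read() method above belongs to a larger loader class; it only touches the lhs_col, rhs_col and rel_col attributes. A minimal hypothetical host class, just to make the snippet runnable:

from pathlib import Path

class EdgeReader:
    # Hypothetical wrapper: only the attributes read() actually uses.
    def __init__(self, lhs_col, rhs_col, rel_col=None):
        self.lhs_col = lhs_col
        self.rhs_col = rhs_col
        self.rel_col = rel_col

    # paste the read() method from the example above into this class body

reader = EdgeReader(lhs_col="source", rhs_col="target")
for lhs, rhs, rel in reader.read(Path("edges.parquet")):
    print(lhs, rhs, rel)
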
Example #3
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    """Import data from a Parquet file and return with rows.Table."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    # TODO: should look into `schema.converted_type` also
    types = OrderedDict([(schema.name, PARQUET_TO_ROWS[schema.type])
                         for schema in parquet._read_footer(fobj).schema
                         if schema.type is not None])
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {"imported_from": "parquet", "filename": filename}
    return create_table([header] + table_rows,
                        meta=meta,
                        force_types=types,
                        *args,
                        **kwargs)
Example #4
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    """Import data from a Parquet file and return with rows.Table."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    # TODO: should look into `schema.converted_type` also
    types = OrderedDict(
        [
            (schema.name, PARQUET_TO_ROWS[schema.type])
            for schema in parquet._read_footer(fobj).schema
            if schema.type is not None
        ]
    )
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {"imported_from": "parquet", "filename": filename}
    return create_table(
        [header] + table_rows, meta=meta, force_types=types, *args, **kwargs
    )
Example #5
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    'Import data from a Parquet file'

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    # TODO: should look into `schema.converted_type` also
    types = OrderedDict([(schema.name, PARQUET_TO_ROWS[schema.type])
                         for schema in parquet._read_footer(fobj).schema
                         if schema.type is not None])
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {
        'imported_from': 'parquet',
        'filename': filename,
    }
    return create_table([header] + table_rows,
                        meta=meta,
                        force_types=types,
                        *args,
                        **kwargs)
Example #6
    def read(self, path: Path):
        try:
            import parquet
        except ImportError as e:
            raise ImportError(f"{e}. HINT: You can install Parquet by running "
                              "'pip install parquet'")

        with path.open("rb") as tf:
            columns = [
                self.lhs_col, self.rhs_col, self.rel_col, self.weight_col
            ]
            fetch_columns = [c for c in columns if c is not None]
            for row in parquet.reader(tf, columns=fetch_columns):
                offset = 0
                ret = []
                for c in columns:
                    if c is not None:
                        ret.append(row[offset])
                        offset += 1
                    else:
                        ret.append(None)

                yield tuple(ret)
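
The interesting detail in this variant is the None-padding: only the configured columns are fetched, and the row is then re-expanded so every yielded tuple has the same arity. A standalone illustration of that logic, independent of Parquet:

columns = ["lhs", "rhs", None, "weight"]  # rel_col was not configured
fetched = ("a", "b", 1.5)                 # only the non-None columns were read
it = iter(fetched)
padded = tuple(next(it) if c is not None else None for c in columns)
assert padded == ("a", "b", None, 1.5)
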
Example #7
#!/usr/bin/python

import parquet as pq
import sys
from io import BytesIO

# Purpose: read Parquet files from stdin and pipe them as CSV into a MemSQL pipeline

# Read binary input from stdin
# Python 2: raw = sys.stdin.read()
# Python 3:
raw = sys.stdin.buffer.read()

# Wrap the raw bytes in a seekable file-like object for the parquet reader
# (renamed from `file`, which shadows a builtin)
data = BytesIO(raw)

# Read the rows in the parquet file
# Print out CSV to MemSQL
for row in pq.reader(data):
    # Join the first four columns into one CSV line
    memsql_row = (str(row[0]) + ',' + str(row[1]) + ',' + str(row[2]) + ',' +
                  str(row[3]))
    # print() expects text in Python 3; printing the .encode() result would
    # emit a bytes repr like b'...'
    print(memsql_row)
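
To try the transform without an actual MemSQL pipeline, feed it a Parquet file on stdin; transform.py is a placeholder name for the script above:

import subprocess

# Hypothetical local test: stream a Parquet file into the script above,
# saved as transform.py (placeholder name).
with open("data.parquet", "rb") as f:
    subprocess.run(["python", "transform.py"], stdin=f, check=True)
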
Example #8
import os
import json

import parquet
import pandas as pd

home = os.path.expanduser("~")
data_dir = "/media/sumeyer/SSD_2/ML_DATA/"  # renamed from `dir`, which shadows a builtin
filename = "part-r-00000-67ebd6f0-bfb4-42e0-b516-d7aaa77cbcb8.snappy.parquet"
datafile = data_dir + filename

print("open file : ", datafile)

## assuming parquet file with two rows and three columns:
## foo bar baz
## 1   2   3
## 4   5   6

with open(datafile, "rb") as fo:  # Parquet is binary; text mode fails under Python 3
    # prints:
    # {"foo": 1, "bar": 2}
    # {"foo": 4, "bar": 5}
    for row in parquet.DictReader(fo):
        print(json.dumps(row))

with open(datafile, "rb") as fo:
    # prints:
    # 1,2
    # 4,5
    for row in parquet.reader(fo):
        print(",".join([str(r) for r in row]))

# `df` was never defined in the original snippet; build it from the raw rows
with open(datafile, "rb") as fo:
    df = pd.DataFrame(list(parquet.reader(fo)))

print(df.info())
print(df)
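
One caveat worth flagging: the filename ends in .snappy.parquet, and the pure-Python parquet reader needs the optional python-snappy package to decompress Snappy data (an assumption based on its optional dependencies). A quick guard:

# Assumption: Snappy-compressed Parquet needs the python-snappy codec here.
try:
    import snappy  # noqa: F401
except ImportError:
    raise SystemExit("pip install python-snappy to read .snappy.parquet files")
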
Example #9
# https://github.com/jcrobak/parquet-python

# sudo pip3 install parquet

import parquet
import json

## assuming parquet file with two rows and three columns:
## foo bar baz
## 1   2   3
## 4   5   6

with open("test.parquet") as fo:
   # prints:
   # {"foo": 1, "bar": 2}
   # {"foo": 4, "bar": 5}
   for row in parquet.DictReader(fo, columns=['foo', 'bar']):
       print(json.dumps(row))


with open("test.parquet") as fo:
   # prints:
   # 1,2
   # 4,5
   for row in parquet.reader(fo, columns=['foo', 'bar]):
       print(",".join([str(r) for r in row]))

Example #10

import parquet
import json

## assuming parquet file with two rows and three columns:
## foo bar baz
## 1   2   3
## 4   5   6

with open("test.parquet") as fo:
   # prints:
   # {"foo": 1, "bar": 2}
   # {"foo": 4, "bar": 5}
   for row in parquet.DictReader(fo, columns=['foo', 'bar']):
       print(json.dumps(row))


with open("test.parquet") as fo:
   # prints:
   # 1,2
   # 4,5
   for row in parquet.reader(fo, columns=['foo', 'bar]):
       print(",".join([str(r) for r in row]))