def import_from_parquet(filename_or_fobj, *args, **kwargs):
    """Import data from a Parquet file and return with rows.Table."""

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    # TODO: should look into `schema.converted_type` also
    types = OrderedDict(
        [
            (schema.name, PARQUET_TO_ROWS[schema.type])
            for schema in parquet._read_footer(fobj).schema
            if schema.type is not None
        ]
    )
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {"imported_from": "parquet", "filename": filename}
    return create_table(
        [header] + table_rows, meta=meta, force_types=types, *args, **kwargs
    )
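# A minimal usage sketch of import_from_parquet(), assuming the helpers above
# (get_filename_and_fobj, PARQUET_TO_ROWS, create_table) are importable from the
# surrounding rows plugin module; "data.parquet" is a placeholder filename.
table = import_from_parquet("data.parquet")
for row in table:  # rows.Table instances are iterable, one row per record
    print(row)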
def read(self, path: Path):
    try:
        import parquet
    except ImportError as e:
        raise ImportError(
            f"{e}. HINT: You can install Parquet by running "
            "'pip install parquet'"
        )

    # Parquet is a binary format, so the file must be opened in binary mode.
    with path.open("rb") as tf:
        columns = [self.lhs_col, self.rhs_col]
        if self.rel_col is not None:
            columns.append(self.rel_col)
        for row in parquet.reader(tf, columns=columns):
            if self.rel_col is not None:
                yield row
            else:
                yield (row[0], row[1], None)
def import_from_parquet(filename_or_fobj, *args, **kwargs): """Import data from a Parquet file and return with rows.Table.""" filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb") # TODO: should look into `schema.converted_type` also types = OrderedDict([(schema.name, PARQUET_TO_ROWS[schema.type]) for schema in parquet._read_footer(fobj).schema if schema.type is not None]) header = list(types.keys()) table_rows = list(parquet.reader(fobj)) # TODO: be lazy meta = {"imported_from": "parquet", "filename": filename} return create_table([header] + table_rows, meta=meta, force_types=types, *args, **kwargs)
def import_from_parquet(filename_or_fobj, *args, **kwargs): """Import data from a Parquet file and return with rows.Table.""" filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb") # TODO: should look into `schema.converted_type` also types = OrderedDict( [ (schema.name, PARQUET_TO_ROWS[schema.type]) for schema in parquet._read_footer(fobj).schema if schema.type is not None ] ) header = list(types.keys()) table_rows = list(parquet.reader(fobj)) # TODO: be lazy meta = {"imported_from": "parquet", "filename": filename} return create_table( [header] + table_rows, meta=meta, force_types=types, *args, **kwargs )
def read(self, path: Path):
    try:
        import parquet
    except ImportError as e:
        raise ImportError(
            f"{e}. HINT: You can install Parquet by running "
            "'pip install parquet'"
        )

    with path.open("rb") as tf:
        columns = [self.lhs_col, self.rhs_col, self.rel_col, self.weight_col]
        # Only fetch the columns that are actually configured; re-insert None
        # placeholders afterwards so callers always receive a 4-tuple.
        fetch_columns = [c for c in columns if c is not None]
        for row in parquet.reader(tf, columns=fetch_columns):
            offset = 0
            ret = []
            for c in columns:
                if c is not None:
                    ret.append(row[offset])
                    offset += 1
                else:
                    ret.append(None)
            yield tuple(ret)
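# Both read() variants above reduce to a parquet.reader(fobj, columns=[...]) call;
# here is a minimal standalone sketch of that call. The file name and column
# names are assumptions, not values taken from the original code.
import parquet

with open("edges.parquet", "rb") as fobj:
    for lhs, rhs in parquet.reader(fobj, columns=["lhs", "rhs"]):
        print(lhs, rhs)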
#!/usr/bin/python
# Purpose: read Parquet data from stdin and emit CSV rows for a MemSQL pipeline.
import sys
from io import BytesIO

import parquet as pq

# Read binary input from stdin
# Python 2:
# file = sys.stdin.read()
# Python 3:
file = sys.stdin.buffer.read()

# Wrap the raw bytes in a file-like object
data = BytesIO(file)

# Read the rows in the parquet file and print CSV lines for MemSQL
for row in pq.reader(data):
    memsql_row = (str(row[0]) + ',' + str(row[1]) + ',' +
                  str(row[2]) + ',' + str(row[3]))
    print(memsql_row)
import json
import os

import pandas as pd
import parquet

home = os.path.expanduser("~")
dir = "/media/sumeyer/SSD_2/ML_DATA/"
filename = "part-r-00000-67ebd6f0-bfb4-42e0-b516-d7aaa77cbcb8.snappy.parquet"
datafile = dir + filename
print("open file : ", datafile)

## assuming parquet file with two rows and three columns:
## foo bar baz
## 1   2   3
## 4   5   6

# Parquet is a binary format, so open the file in binary mode.
with open(datafile, "rb") as fo:
    # prints:
    # {"foo": 1, "bar": 2, "baz": 3}
    # {"foo": 4, "bar": 5, "baz": 6}
    for row in parquet.DictReader(fo):
        print(json.dumps(row))

with open(datafile, "rb") as fo:
    # prints:
    # 1,2,3
    # 4,5,6
    for row in parquet.reader(fo):
        print(",".join([str(r) for r in row]))

# df is used below but was never defined in the original snippet; one way to
# build it (an assumption about the intent) is from the DictReader rows.
with open(datafile, "rb") as fo:
    df = pd.DataFrame(list(parquet.DictReader(fo)))

print(df.info())
print(df)
# https://github.com/jcrobak/parquet-python
# sudo pip3 install parquet
import parquet
import json

## assuming parquet file with two rows and three columns:
## foo bar baz
## 1   2   3
## 4   5   6

with open("test.parquet", "rb") as fo:
    # prints:
    # {"foo": 1, "bar": 2}
    # {"foo": 4, "bar": 5}
    for row in parquet.DictReader(fo, columns=['foo', 'bar']):
        print(json.dumps(row))

with open("test.parquet", "rb") as fo:
    # prints:
    # 1,2
    # 4,5
    for row in parquet.reader(fo, columns=['foo', 'bar']):
        print(",".join([str(r) for r in row]))