def _read_csv(self) -> "Table":
    import pyarrow as pa
    from pyarrow import csv

    if not self.output_location:
        raise ProgrammingError("OutputLocation is none or empty.")
    if not self.output_location.endswith((".csv", ".txt")):
        return pa.Table.from_pydict(dict())
    length = self._get_content_length()
    if length and self.output_location.endswith(".txt"):
        description = self.description if self.description else []
        column_names = [d[0] for d in description]
        read_opts = csv.ReadOptions(
            skip_rows=0,
            column_names=column_names,
            block_size=self._block_size,
            use_threads=True,
        )
        parse_opts = csv.ParseOptions(
            delimiter="\t",
            quote_char=False,
            double_quote=False,
            escape_char=False,
        )
    elif length and self.output_location.endswith(".csv"):
        read_opts = csv.ReadOptions(
            skip_rows=0, block_size=self._block_size, use_threads=True
        )
        parse_opts = csv.ParseOptions(
            delimiter=",",
            quote_char='"',
            double_quote=True,
            escape_char=False,
        )
    else:
        return pa.Table.from_pydict(dict())
    bucket, key = parse_output_location(self.output_location)
    try:
        return csv.read_csv(
            self._fs.open_input_stream(f"{bucket}/{key}"),
            read_options=read_opts,
            parse_options=parse_opts,
            convert_options=csv.ConvertOptions(
                quoted_strings_can_be_null=False,
                timestamp_parsers=self.timestamp_parsers,
                column_types=self.column_types,
            ),
        )
    except Exception as e:
        _logger.exception(f"Failed to read {bucket}/{key}.")
        raise OperationalError(*e.args) from e
def _read_table_arrow(self, source: tp.BinaryIO, schema: pa.Schema) -> pa.Table:
    try:
        read_options = pa_csv.ReadOptions()
        read_options.encoding = "utf-8"
        read_options.use_threads = False

        parse_options = pa_csv.ParseOptions()
        parse_options.newlines_in_values = True

        convert_options = pa_csv.ConvertOptions()
        convert_options.include_columns = schema.names
        convert_options.column_types = {
            n: t for (n, t) in zip(schema.names, schema.types)
        }
        convert_options.strings_can_be_null = True
        convert_options.quoted_strings_can_be_null = False

        return pa_csv.read_csv(source, read_options, parse_options, convert_options)

    except pa.ArrowInvalid as e:
        err = "CSV file decoding failed, content is garbled"
        self._log.exception(err)
        raise _ex.EDataCorruption(err) from e

    except pa.ArrowKeyError as e:
        err = "CSV file decoding failed, one or more columns is missing"
        self._log.error(err)
        self._log.exception(str(e))
        raise _ex.EDataCorruption(err) from e
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        arrow_schema = pa.schema(get_fields(table))
        column_names = [field_name for field_name, dtype in get_fields(table)]

        read_options = pcsv.ReadOptions(column_names=column_names,
                                        block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(
            column_types=arrow_schema,
            timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
            true_values=["1", "T"],
            false_values=["0", "F"],
            strings_can_be_null=True,
        )

        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema,
                                          compression="zstd", version="2.0",
                                          use_dictionary=True)
        stream_reader = pcsv.open_csv(extract_file,
                                      read_options=read_options,
                                      parse_options=parse_options,
                                      convert_options=convert_options)
        # Use a distinct name so the outer loop variable `table` is not rebound.
        for batch in stream_reader:
            batch_table = pa.Table.from_batches([batch])
            parquet_writer.write_table(batch_table)
        parquet_writer.close()
def pa_read_options(self):
    read_options = self.read_options or pac.ReadOptions()
    if self.skip_rows is not None:
        read_options.skip_rows = self.skip_rows
    if self.column_names is not None:
        read_options.column_names = self.column_names
    if self.autogenerate_column_names is not None:
        read_options.autogenerate_column_names = self.autogenerate_column_names
    return read_options
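# A minimal standalone sketch (not part of the snippet above) of the same
# pattern: ReadOptions attributes are mutable, so options can be adjusted
# after construction. The file name "data.csv" and its columns are
# hypothetical.
import pyarrow.csv as pac

opts = pac.ReadOptions()
opts.skip_rows = 1                    # skip the header row in the file
opts.column_names = ["id", "value"]   # supply explicit names instead
table = pac.read_csv("data.csv", read_options=opts)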
def csv_to_table(self, csv_path, table_name, read=None, parse=None,
                 convert=None, con=None, auto_infer=False):
    """Pyarrow CSV reader documentation:
    https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html
    """
    if not ARROW:
        return "Optional pyarrow dependency not found. To install: pip3 install pyarrow"

    sqream_to_pa = {
        'ftBool': pa.bool_(),
        'ftUByte': pa.uint8(),
        'ftShort': pa.int16(),
        'ftInt': pa.int32(),
        'ftLong': pa.int64(),
        'ftFloat': pa.float32(),
        'ftDouble': pa.float64(),
        'ftDate': pa.timestamp('ns'),
        'ftDateTime': pa.timestamp('ns'),
        'ftVarchar': pa.string(),
        'ftBlob': pa.utf8(),
    }

    start = time.time()
    # Get table metadata
    con = con or self
    con.execute(f'select * from {table_name} where 1=0')

    # Map column names to pyarrow types and set Arrow's CSV parameters.
    # ConvertOptions expects a dict (or schema), so materialize the zip.
    sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
    column_types = dict(zip(con.col_names,
                            [sqream_to_pa[col_type[0]] for col_type in con.col_type_tups]))
    read = read or csv.ReadOptions(column_names=con.col_names)
    parse = parse or csv.ParseOptions(delimiter='|')
    convert = convert or csv.ConvertOptions(
        column_types=None if auto_infer else column_types)

    # Read CSV to in-memory arrow format
    csv_arrow = csv.read_csv(csv_path, read_options=read, parse_options=parse,
                             convert_options=convert).combine_chunks()
    num_chunks = len(csv_arrow[0].chunks)
    numpy_cols = []

    # For each column, get the numpy representation for quick packing
    for col_type, col in zip(sqream_col_types, csv_arrow):
        # Only one chunk after combine_chunks()
        col = col.chunks[0]
        if col_type in ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
            col = col.to_pandas()
        else:
            col = col.to_numpy()
        numpy_cols.append(col)

    print(f'total loading csv: {time.time() - start}')
    start = time.time()

    # Insert columns into SQream
    col_num = csv_arrow.shape[1]
    con.executemany(f'insert into {table_name} values ({"?," * (col_num - 1)}?)',
                    numpy_cols)
    print(f'total inserting csv: {time.time() - start}')
def pa_read_options(self):
    if self.read_options is not None:
        read_options = self.read_options
    else:
        read_options = pac.ReadOptions(column_names=["text"])
    if self.encoding is not None:
        read_options.encoding = self.encoding
    if self.block_size is not None:
        read_options.block_size = self.block_size
    if self.use_threads is not None:
        read_options.use_threads = self.use_threads
    return read_options
def read_product(self, keep_groups=None, drop_groups=None,
                 keep_modules=None, drop_modules=None):
    prod_cols = [
        'upc', 'upc_ver_uc', 'upc_descr', 'product_module_code',
        'product_module_descr', 'product_group_code', 'product_group_descr',
        'brand_code_uc', 'brand_descr', 'multi', 'size1_code_uc',
        'size1_amount', 'size1_units', 'dataset_found_uc',
        'size1_change_flag_uc'
    ]
    prod_dict = {
        'upc': pa.int64(),
        'upc_ver_uc': pa.int8(),
        'product_module_code': pa.uint16(),
        'brand_code_uc': pa.uint32(),
        'multi': pa.uint16(),
        'size1_code_uc': pa.uint16(),
    }
    prod_df = csv.read_csv(
        self.product_file,
        read_options=csv.ReadOptions(encoding='latin'),
        parse_options=csv.ParseOptions(delimiter='\t'),
        convert_options=csv.ConvertOptions(column_types=prod_dict,
                                           include_columns=prod_cols),
    ).to_pandas()

    if keep_groups:
        prod_df = prod_df[prod_df['product_group_code'].isin(keep_groups)]
    if drop_groups:
        prod_df = prod_df[~prod_df['product_group_code'].isin(drop_groups)]
    if keep_modules:
        prod_df = prod_df[prod_df['product_module_code'].isin(keep_modules)]
    if drop_modules:
        prod_df = prod_df[~prod_df['product_module_code'].isin(drop_modules)]

    # dictionary encoding to save space
    prod_df['size1_units'] = prod_df['size1_units'].astype('category')
    prod_df['product_module_descr'] = prod_df['product_module_descr'].astype('category')
    prod_df['product_group_code'] = prod_df['product_group_code'].astype('category')

    # clean up product info
    prod_df['upc_descr'] = prod_df['upc_descr'].str.strip().str.replace('RTE', '')
    prod_df['brand_descr'] = prod_df['brand_descr'].str.strip().str.replace('CTL BR', 'Private Label')

    self.prod_df = prod_df.copy()
    return
def csv_read(read_paths: List[str]):
    logger.debug(f"Reading {len(read_paths)} files.")
    tables = []
    for read_path in read_paths:
        with filesystem.open_input_file(read_path) as f:
            tables.append(
                csv.read_csv(
                    f,
                    read_options=csv.ReadOptions(use_threads=False),
                    **arrow_csv_args))
    block = ArrowBlock(pa.concat_tables(tables))
    return block, block.get_metadata(input_files=read_paths)
def _read_options_from_dict(**kwargs):
    """Returns the read options for CSV.

    Returns:
        (object) A pyarrow ReadOptions object.
    """
    return csv.ReadOptions(
        use_threads=kwargs.pop('use_threads', True),
        block_size=kwargs.pop('block_size', 1073741824),  # 1 GiB
        skip_rows=kwargs.pop('skip_rows', 0),
        column_names=kwargs.pop('column_names', None),
        autogenerate_column_names=kwargs.pop('autogenerate_column_names', False),
        encoding=kwargs.pop('encoding', 'utf8'),
    )
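# A hedged usage sketch for the helper above: pop() consumes only the known
# keys, so any unrecognized entries stay in the caller's kwargs. The option
# values here are illustrative.
options = _read_options_from_dict(block_size=1 << 20, skip_rows=1)
assert options.skip_rows == 1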
def read_delta_file_envelopes(
        annotated_delta_manifests: List[Dict[str, Any]],
        column_names: List[str],
        primary_keys: List[str],
        sort_keys: List[str],
        deltacat_storage=unimplemented_deltacat_storage) \
        -> Optional[List[Dict[str, Any]]]:

    tables_and_annotations = []
    columns_to_read = list(chain(primary_keys, sort_keys))
    for annotated_delta_manifest in annotated_delta_manifests:
        tables = deltacat_storage.download_delta_manifest(
            annotated_delta_manifest,
            file_reader_kwargs={
                # The CSV kwargs must be a dict keyed by parameter name (the
                # original used a set literal, which the reader can't unpack).
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.CSV.value]: {
                    "read_options": pacsv.ReadOptions(column_names=column_names),
                    "convert_options": pacsv.ConvertOptions(
                        include_columns=columns_to_read),
                },
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.PARQUET.value]: {
                    "columns": columns_to_read
                },
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.FEATHER.value]: {
                    "columns": columns_to_read
                },
            },
        )
        annotations = dma.get_annotations(annotated_delta_manifest)
        # Assert on the condition itself; asserting a (condition, message)
        # tuple is always truthy and would never fire.
        assert len(tables) == len(annotations), (
            f"Unexpected Error: Length of downloaded delta manifest tables "
            f"({len(tables)}) doesn't match the length of delta manifest "
            f"annotations ({len(annotations)}).")
        tables_and_annotations.append((tables, annotations))
    if not tables_and_annotations:
        return None
    delta_file_envelopes = []
    for tables, annotations in tables_and_annotations:
        for i in range(len(tables)):
            delta_file = delta_file_envelope.of(
                dma.get_annotation_stream_position(annotations[i]),
                dma.get_annotation_file_index(annotations[i]),
                dma.get_annotation_delta_type(annotations[i]),
                tables[i],
            )
            delta_file_envelopes.append(delta_file)
    return delta_file_envelopes
def func(id: int, conn: str, query: str) -> Any:
    engine = create_engine(conn)
    conn = engine.connect()
    cur = conn.connection.cursor()
    store = io.BytesIO()
    with Timer() as timer:
        cur.copy_expert(f"COPY ({query}) TO STDOUT WITH CSV HEADER;", store)
    print(f"[Copy {id}] {timer.elapsed:.2f}s")
    store.seek(0)
    with Timer() as timer:
        df = csv.read_csv(store, read_options=csv.ReadOptions(use_threads=False))
    print(f"[Read CSV {id}] {timer.elapsed:.2f}s")
    return df
def _read_stream(self, f: "pyarrow.NativeFile", path: str,
                 **reader_args) -> Iterator[Block]:
    import pyarrow
    from pyarrow import csv

    read_options = reader_args.pop("read_options",
                                   csv.ReadOptions(use_threads=False))
    reader = csv.open_csv(f, read_options=read_options, **reader_args)
    schema = None
    while True:
        try:
            batch = reader.read_next_batch()
            table = pyarrow.Table.from_batches([batch], schema=schema)
            if schema is None:
                schema = table.schema
            yield table
        except StopIteration:
            return
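# A minimal standalone sketch of the same incremental pattern, outside any
# class; "big.csv" is a hypothetical path. In recent pyarrow versions the
# streaming reader is iterable and usable as a context manager, yielding
# RecordBatches without materializing the whole file.
from pyarrow import csv

with csv.open_csv("big.csv") as reader:
    for batch in reader:
        print(batch.num_rows)  # process each RecordBatch as it streams in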
def read_csv(self, filenames, delimiter=','):
    global parquet_writer
    for file in filenames:
        csv_reader = csv.open_csv(
            file,
            read_options=csv.ReadOptions(use_threads=True),
            parse_options=csv.ParseOptions(delimiter=delimiter),
            convert_options=csv.ConvertOptions(column_types=self.dtype))
        # Note: a new writer is opened per input file, targeting the same
        # output path; only the last writer is closed below.
        parquet_writer = pq.ParquetWriter(self.parquet_file, csv_reader.schema)
        nrow = 0
        for batch in csv_reader:
            batch_df = batch.to_pandas()
            nrow += batch_df.shape[0]
            parquet_writer.write_table(pa.Table.from_pandas(df=batch_df))
    parquet_writer.close()
    return ds.dataset(self.parquet_file, format="parquet")
def csv_to_parquet(
    csv_file: Path,
    parquet_file: Path,
    *,
    delimiter: str,
    column_names: List[str],
    quiet: bool = False,
) -> None:
    block_size = 1 << 24  # 16 MB
    read_options = csv.ReadOptions(column_names=column_names, block_size=block_size)
    parse_options = csv.ParseOptions(delimiter=delimiter)
    writer = None
    with csv.open_csv(
        csv_file, read_options=read_options, parse_options=parse_options
    ) as csv_reader:
        for batch in tqdm(csv_reader, disable=quiet):
            if writer is None:
                writer = pq.ParquetWriter(parquet_file, csv_reader.schema,
                                          compression="zstd")
            table = pa.Table.from_batches([batch])
            writer.write_table(table)
    if writer is not None:
        writer.close()
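# A hedged usage example for csv_to_parquet() above; the paths and column
# list are hypothetical, and the input is assumed to be headerless
# tab-separated data.
from pathlib import Path

csv_to_parquet(
    Path("events.tsv"),
    Path("events.parquet"),
    delimiter="\t",
    column_names=["ts", "user_id", "event"],
)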
def s3_file_to_table(s3_url: str,
                     content_type: str,
                     content_encoding: str,
                     pa_read_func_kwargs: Optional[Dict[str, Any]] = None,
                     **s3_client_kwargs) -> pa.Table:

    logger.debug(f"Reading {s3_url} to PyArrow. Content type: {content_type}. "
                 f"Encoding: {content_encoding}")
    s3_obj = s3_utils.get_object_at_url(s3_url, **s3_client_kwargs)
    logger.debug(f"Read S3 object from {s3_url}: {s3_obj}")
    pa_read_func = CONTENT_TYPE_TO_PA_READ_FUNC[content_type]
    input_file_init = ENCODING_TO_FILE_INIT[content_encoding]
    input_file = input_file_init(fileobj=io.BytesIO(s3_obj['Body'].read()))

    args = [input_file]
    kwargs = CONTENT_TYPE_TO_READER_KWARGS[content_type]
    if pa_read_func_kwargs is None:
        pa_read_func_kwargs = {}

    if content_type in DELIMITED_TEXT_CONTENT_TYPES:
        # ReadOptions can't be included in CONTENT_TYPE_TO_KWARGS because it
        # doesn't pickle:
        #   File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 563, in dump
        #     return Pickler.dump(self, obj)
        #   File "stringsource", line 2, in pyarrow._csv.ReadOptions.__reduce_cython__
        #   TypeError: self.options cannot be converted to a Python object for pickling
        logger.debug(f"{content_type} is a delimited text content type")
        kwargs["read_options"] = pacsv.ReadOptions(autogenerate_column_names=True)

    if pa_read_func_kwargs:
        kwargs.update(pa_read_func_kwargs.get(
            CONTENT_TYPE_TO_USER_KWARGS_KEY[content_type]))

    table, latency = timed_invocation(pa_read_func, *args, **kwargs)

    # pyarrow.orc is disabled in pyarrow 0.15, 0.16:
    # https://issues.apache.org/jira/browse/ARROW-7811
    # if content_type == DatasetConstants.ContentType.ORC:
    #     result = result.read()

    logger.debug(f"Time to read {s3_url} into PyArrow table: {latency}s")
    return table
def test_csv_options(in_type, pd_old_type, pd_new_type):
    schema = pa.schema([("string_col", pa.string())])
    read_options = csv.ReadOptions(skip_rows=1)
    parse_options = csv.ParseOptions(
        quote_char="'", escape_char="\\", delimiter=";", newlines_in_values=True
    )
    convert_options = csv.ConvertOptions(
        include_columns=["i", "my_string", "nonexistent_column"],
        include_missing_columns=True,
        null_values=["NULL_STRING"],
        strings_can_be_null=True,
    )
    df = pa_read_csv_to_pandas(
        "tests/data/csv_options_test.csv",
        schema,
        False,
        pd_string=False,
        parse_options=parse_options,
        convert_options=convert_options,
        read_options=read_options,
    )
    expected = [
        "dsfasd;dsffadsf",
        "dsfasd;dsffadsf",
        None,
        "this text\nhas a line break",
        "this text, like so, has commas",
    ]
    assert df.columns.tolist() == ["i", "my_string", "nonexistent_column"]
    assert df["nonexistent_column"].isnull().all()
    assert_series_equal(df["my_string"], Series(expected, name="my_string"))
def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
    from pyarrow import csv

    read_options = reader_args.pop("read_options",
                                   csv.ReadOptions(use_threads=False))
    return csv.read_csv(f, read_options=read_options, **reader_args)
if __name__ == "__main__":
    args = docopt(__doc__, version="1.0")
    conn = os.environ["POSTGRES_URL"]
    table = os.environ["POSTGRES_TABLE"]

    engine = create_engine(conn)
    conn = engine.connect()
    cur = conn.connection.cursor()
    store = io.BytesIO()

    with Timer() as timer:
        cur.copy_expert(
            f"COPY (SELECT * FROM {table}) TO STDOUT WITH CSV HEADER;", store)
    print(f"[Copy] {timer.elapsed:.2f}s")
    store.seek(0)

    with Timer() as timer:
        df = csv.read_csv(store, read_options=csv.ReadOptions(use_threads=False))
    print(f"[Read CSV] {timer.elapsed:.2f}s")

    with Timer() as timer:
        df = df.to_pandas()
    print(f"[To Pandas] {timer.elapsed:.2f}s")

    conn.close()
    print(df.head())
    # _, peak = tracemalloc.get_traced_memory()
    # print(f"memory peak: {peak/10**9:.2f}G")
import pyarrow as pa
import pyarrow.csv as pv
import pyarrow.parquet as pq
from datetime import datetime

csv_filename = "accumulated_data_300_million_rows_converted.csv"
parquet_filename = '../data/' + csv_filename.replace('csv', 'parquet')
parquet_partition_name = '../data/' + csv_filename.replace('.csv', '')

print("Start ", datetime.now())

# ReadOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions
csv_read_options = pv.ReadOptions(
    skip_rows=0,
    encoding="utf8",
    column_names=[
        "unit_id", "value", "start", "stop",
        "start_year", "start_unix_days", "stop_unix_days"
    ])

# ParseOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions
csv_parse_options = pv.ParseOptions(delimiter=';')

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
# TODO nullable parameter does not work as expected!
data_schema = pa.schema([
    pa.field(name='start_year', type=pa.string(), nullable=True),
    pa.field(name='unit_id', type=pa.uint64(), nullable=False),
    pa.field(name='value', type=pa.string(), nullable=False),
    pa.field(name='start_epoch_days', type=pa.int16(), nullable=True),
    pa.field(name='stop_epoch_days', type=pa.int16(), nullable=True),
])
from pyarrow import csv as pacsv

filename = "test_input.txt"

if __name__ == "__main__":
    read_options = pacsv.ReadOptions(
        column_names=["group_id", "seq_number", "data"])
    parse_options = pacsv.ParseOptions(delimiter="\t")
    table = pacsv.read_csv(filename,
                           read_options=read_options,
                           parse_options=parse_options)
from pyarrow import csv
from timeit import default_timer as timer
import sys

# Warm-up pass reads the script itself (sys.argv[0]); the file under test
# is the first command-line argument.
warmup_filename = sys.argv[0]
filename = sys.argv[1]

start = timer()
table = csv.read_csv(
    warmup_filename,
    read_options=csv.ReadOptions(use_threads=False)).to_pandas()
end = timer()
t1 = end - start

start = timer()
table = csv.read_csv(
    filename,
    read_options=csv.ReadOptions(use_threads=False)).to_pandas()
end = timer()
t2 = end - start

start = timer()
table = csv.read_csv(
    filename,
    read_options=csv.ReadOptions(use_threads=False)).to_pandas()
end = timer()
t3 = end - start

print(t1)
print(t2)
print(t3)
print('NaN')
print('NaN')
import pyarrow as pa
import pyarrow.csv as pv
import pyarrow.parquet as pq
from pyarrow.lib import Table

csv = 'accumulated_data_300_million_rows_id_filter.csv'
target_file = '../data/accumulated_data_300_million_rows_id_filter_1mill.parquet'

csv_read_options = pv.ReadOptions(skip_rows=0, encoding="utf8",
                                  column_names=["unit_id"])

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
data_schema = pa.schema([('unit_id', pa.uint64())])

# ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
csv_convert_options = pv.ConvertOptions(column_types=data_schema)

table: Table = pv.read_csv(input_file=csv,
                           read_options=csv_read_options,
                           convert_options=csv_convert_options)

pq.write_table(table, target_file)

print('Generated file with the following:')
print('Parquet metadata: ' + str(pq.read_metadata(target_file)))
print('Parquet schema: ' + pq.read_schema(target_file).to_string())
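# A hedged follow-up sketch: reading the generated file back verifies the
# schema survived the round trip. pq.read_table() loads the whole file into
# memory, so this is only sensible as a spot check.
roundtrip = pq.read_table(target_file)
assert roundtrip.schema.field('unit_id').type == pa.uint64()
print('Rows written: ' + str(roundtrip.num_rows))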
def pa_read_options(self):
    read_options = self.read_options or pac.ReadOptions()
    read_options.skip_rows = self.skip_rows
    read_options.autogenerate_column_names = not self.header_as_column_names
    return read_options
def convert_csv_to_parquet(csv_file: str, parquet_dir: str, partitioned: bool):
    print("Start ", datetime.now())
    print(csv_file)
    print(parquet_dir)
    print("Abs path of csv file: " + os.path.abspath(csv_file))

    # Remove old partitions
    if partitioned:
        if Path(parquet_dir).is_dir():
            shutil.rmtree(parquet_dir)

    # ReadOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions
    csv_read_options = pv.ReadOptions(
        skip_rows=0,
        encoding="utf8",
        column_names=[
            "unit_id", "value", "start", "stop",
            "start_year", "start_epoch_days", "stop_epoch_days"
        ])

    # ParseOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions
    csv_parse_options = pv.ParseOptions(delimiter=';')

    # Types: https://arrow.apache.org/docs/python/api/datatypes.html
    # TODO nullable parameter does not work as expected!
    data_schema = pa.schema([
        pa.field(name='start_year', type=pa.string(), nullable=True),
        pa.field(name='unit_id', type=pa.uint64(), nullable=False),
        pa.field(name='value', type=pa.string(), nullable=False),
        pa.field(name='start_epoch_days', type=pa.int16(), nullable=True),
        pa.field(name='stop_epoch_days', type=pa.int16(), nullable=True),
    ])

    # ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
    csv_convert_options = pv.ConvertOptions(column_types=data_schema)
    # include_columns=["start_year", "unit_id", "value", "start_epoch_days", "stop_epoch_days"])

    # read_csv: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html#pyarrow.csv.read_csv
    table = pv.read_csv(input_file=csv_file,
                        read_options=csv_read_options,
                        parse_options=csv_parse_options,
                        convert_options=csv_convert_options)

    # print('Bytes: ' + str(table.nbytes))
    # print('Rows: ' + str(table.num_rows))
    # print('Schema: ' + str(table.schema))
    # print('Column names: ' + str(table.column_names))
    # pandas.set_option('max_columns', None)  # print all columns
    # print(table.to_pandas().head(10))

    # write with partitions
    if partitioned:
        pq.write_to_dataset(table, root_path=parquet_dir,
                            partition_cols=['start_year'])
    else:
        pq.write_to_dataset(table, root_path=parquet_dir)

    print("End ", datetime.now())