def scan_file(self, bucket, key, schema):
    logging.info(f"delim is {self.delimiter}")
    uri = f"{bucket}/{key}"
    s3fs = fs.S3FileSystem()

    # Run column order validation by opening the CSV and inspecting its
    # schema without reading any data.
    filestream = s3fs.open_input_stream(uri)
    parse_opts = csv.ParseOptions(delimiter=self.delimiter)
    reader = csv.open_csv(filestream, parse_options=parse_opts)
    for index, col in enumerate(reader.schema):
        if col.name != schema[index].name:
            msg = "column {} is out of order".format(col.name)
            raise ColumnOrderException(msg)

    # Run the rest of the validations on a fresh stream.
    filestream = s3fs.open_input_stream(uri)
    opts = csv.ConvertOptions(column_types=schema)
    reader = csv.open_csv(filestream, convert_options=opts,
                          parse_options=parse_opts)

    # If the delimiter is wrong, every row is parsed as a single column, so a
    # one-column result against a multi-column schema means a bad delimiter.
    if len(schema) > 1 and len(reader.schema) == 1:
        raise WrongDelimiterException()

    # Parse through the file; pyarrow will raise exceptions if there is
    # invalid data.
    for batch in reader:
        # If the primary key is a string column, check it for empty strings.
        if schema.field(self.primary_key).type == "string":
            table = pyarrow.Table.from_batches([batch])
            for val in table[self.primary_key]:
                if val.as_py() == "":
                    raise EmptyPrimaryKeyException()
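# The validator above raises three application-specific exceptions that are
# not defined in the snippet. A minimal sketch of what they could look like
# (hypothetical definitions, not taken from the original source):
class ColumnOrderException(Exception):
    """A CSV column appears in a different position than the expected schema."""

class WrongDelimiterException(Exception):
    """The configured delimiter does not match the file contents."""

class EmptyPrimaryKeyException(Exception):
    """A string primary-key column contains an empty value."""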
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        arrow_schema = pa.schema(get_fields(table))
        column_names = [name for name, dtype in get_fields(table)]

        read_options = pcsv.ReadOptions(column_names=column_names,
                                        block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(column_types=arrow_schema,
                                              timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
                                              true_values=["1", "T"],
                                              false_values=["0", "F"],
                                              strings_can_be_null=True)

        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema,
                                          compression='zstd', version="2.0",
                                          use_dictionary=True)
        stream_reader = pcsv.open_csv(extract_file, read_options=read_options,
                                      parse_options=parse_options,
                                      convert_options=convert_options)
        for batch in stream_reader:
            table = pa.Table.from_batches([batch])
            parquet_writer.write_table(table)
        parquet_writer.close()
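# write_files depends on several module-level names that the snippet does not
# show (AlchemyMetadata, AlchemyTable and get_fields are project-specific
# helpers). A plausible setup for the rest, with placeholder paths:
from pathlib import Path
from typing import Iterator

import pyarrow as pa
import pyarrow.csv as pcsv
import pyarrow.parquet as pq

EXTRACT_PATH_PREFIX = Path("extracts")  # where the .csv.zst extracts live
PARQUET_PREFIX = Path("parquet")        # where the .parquet output is written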
def stream_records(
        self, file: Union[TextIO, BinaryIO]) -> Iterator[Mapping[str, Any]]:
    """
    https://arrow.apache.org/docs/python/generated/pyarrow.csv.open_csv.html
    PyArrow returns lists of values for each column, so we zip() these up into
    records which we then yield
    """
    streaming_reader = pa_csv.open_csv(
        file,
        pa.csv.ReadOptions(**self._read_options()),
        pa.csv.ParseOptions(**self._parse_options()),
        pa.csv.ConvertOptions(**self._convert_options(self._master_schema)),
    )
    still_reading = True
    while still_reading:
        try:
            batch = streaming_reader.read_next_batch()
        except StopIteration:
            still_reading = False
        else:
            batch_dict = batch.to_pydict()
            batch_columns = [col_info.name for col_info in batch.schema]
            # this gives us a list of lists where each nested list holds
            # ordered values for a single column
            # e.g. [ [1,2,3], ["a", "b", "c"], [True, True, False] ]
            columnwise_record_values = [
                batch_dict[column] for column in batch_columns
            ]
            # we zip this to get row-by-row values,
            # e.g. [ [1, "a", True], [2, "b", True], [3, "c", False] ]
            for record_values in zip(*columnwise_record_values):
                # create our record of {col: value, col: value} by dict
                # comprehension, iterating through all cols in batch_columns
                yield {
                    batch_columns[i]: record_values[i]
                    for i in range(len(batch_columns))
                }
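# Usage sketch (assumes `parser` is an instance of the class that defines
# stream_records together with the _read_options/_parse_options/
# _convert_options helpers and _master_schema, and that data.csv exists):
with open("data.csv", "rb") as f:
    for record in parser.stream_records(f):
        print(record)  # e.g. {"id": 1, "name": "a", "active": True}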
def csv_reader_infer_nb_arrow_type(
        filepath_or_buffer, delimiter=',', names=None, usecols=None,
        dtype=None, skiprows=None, parse_dates=False):
    read_opts, parse_opts, convert_opts = get_pyarrow_read_csv_options(
        delimiter, names, usecols, dtype, skiprows, parse_dates)
    csv_reader = csv.open_csv(filepath_or_buffer,
                              read_options=read_opts,
                              parse_options=parse_opts,
                              convert_options=convert_opts)
    table_schema = csv_reader.schema

    nb_arrow_column_types = []
    for i, pa_data_type in enumerate(table_schema.types):
        nb_type = numpy_support.from_dtype(pa_data_type.to_pandas_dtype())
        if isinstance(nb_type, types.PyObject):
            if pa_data_type == pa.string():
                nb_type = StdStringViewType()
            else:
                raise TypingError(
                    f"Cannot infer numba type for: {pa_data_type} "
                    f"of column={table_schema.names[i]}")
        nb_arrow_column_types.append(nb_type)

    table_column_names = table_schema.names if not names else (
        names if usecols is None else usecols)
    arrow_table_type = ArrowTableType(nb_arrow_column_types, table_column_names)
    return arrow_table_type
def _csv_to_table(file, metadata=None, lat=None, lon=None, geom=None, crs=None,
                  **kwargs):
    """Yields an arrow table from a stream of CSV data.

    Parameters:
        file (string): The full path of the input file.
        metadata (dict): Metadata to be written in the arrow table.
        lat (string): The column name of latitude (applies only to CSV).
        lon (string): The column name of longitude (applies only to CSV).
        geom (string): The column name of WKT geometry (applies only to CSV).
        crs (string): The dataset native crs (default: read from file).
        **kwargs: Extra keyword arguments used by the CSV reader (see
            https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html).

    Yields:
        (object) Arrow table with spatial features.
    """
    parse_options = _parse_options_from_dict(**kwargs)
    read_options = _read_options_from_dict(**kwargs)
    convert_options = _convert_options_from_dict(**kwargs)

    if lat is not None and lon is not None:
        type_of_geom = 'latlon'
    elif geom is not None:
        type_of_geom = 'wkt'
    else:
        type_of_geom = None

    batches = csv.open_csv(file, read_options=read_options,
                           parse_options=parse_options,
                           convert_options=convert_options)
    if type_of_geom is None:
        type_of_geom, geom, lat, lon = _get_geom_info(batches.schema.names)
    print('Opened file %s, using pyarrow CSV reader.' % (os.path.basename(file)))

    eof = False
    while not eof:
        try:
            batch = batches.read_next_batch()
        except StopIteration:
            eof = True
        else:
            table = pa.Table.from_batches([batch])
            try:
                if type_of_geom == 'latlon':
                    table = _geometry_from_latlon(table, lat, lon, crs=crs)
                else:
                    table = _geometry_from_wkt(table, geom, crs=crs)
            # Not a spatial file
            except TypeError:
                pass
            except KeyError:
                pass
            else:
                table = table.replace_schema_metadata(metadata=metadata)
                yield table
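# Sketch of consuming the generator above, writing each yielded table to a
# Parquet file (the file name and lat/lon column names are assumptions used
# only for illustration):
import pyarrow.parquet as pq

writer = None
for tbl in _csv_to_table("points.csv", lat="latitude", lon="longitude"):
    if writer is None:
        writer = pq.ParquetWriter("points.parquet", tbl.schema)
    writer.write_table(tbl)
if writer is not None:
    writer.close()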
def _read_stream(self, f: "pyarrow.NativeFile", path: str,
                 **reader_args) -> Iterator[Block]:
    import pyarrow
    from pyarrow import csv

    read_options = reader_args.pop("read_options",
                                   csv.ReadOptions(use_threads=False))
    reader = csv.open_csv(f, read_options=read_options, **reader_args)
    schema = None
    while True:
        try:
            batch = reader.read_next_batch()
            table = pyarrow.Table.from_batches([batch], schema=schema)
            if schema is None:
                schema = table.schema
            yield table
        except StopIteration:
            return
def read_csv(self, filenames, delimiter=','):
    global parquet_writer
    parquet_writer = None
    for file in filenames:
        csv_reader = csv.open_csv(
            file,
            read_options=csv.ReadOptions(use_threads=True),
            parse_options=csv.ParseOptions(delimiter=delimiter),
            convert_options=csv.ConvertOptions(column_types=self.dtype))
        # Create the writer once, from the first file's schema; recreating it
        # for every file would reopen and truncate the output.
        if parquet_writer is None:
            parquet_writer = pq.ParquetWriter(self.parquet_file,
                                              csv_reader.schema)
        nrow = 0
        for batch in csv_reader:
            batch_df = batch.to_pandas()
            nrow += batch_df.shape[0]
            parquet_writer.write_table(pa.Table.from_pandas(df=batch_df))
    parquet_writer.close()
    return ds.dataset(self.parquet_file, format="parquet")
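# Hypothetical call of the helper above (assumes the enclosing object was
# constructed with self.parquet_file and self.dtype already set):
dataset = loader.read_csv(["part-000.csv", "part-001.csv"], delimiter="|")
print(dataset.count_rows())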
def csv_to_parquet(
    csv_file: Path,
    parquet_file: Path,
    *,
    delimiter: str,
    column_names: List[str],
    quiet: bool = False,
) -> None:
    block_size = 1 << 24  # 16 MB
    read_options = csv.ReadOptions(column_names=column_names,
                                   block_size=block_size)
    parse_options = csv.ParseOptions(delimiter=delimiter)

    writer = None
    with csv.open_csv(
        csv_file, read_options=read_options, parse_options=parse_options
    ) as csv_reader:
        for batch in tqdm(csv_reader, disable=quiet):
            if writer is None:
                writer = pq.ParquetWriter(parquet_file, csv_reader.schema,
                                          compression="zstd")
            table = pa.Table.from_batches([batch])
            writer.write_table(table)

    if writer is not None:
        writer.close()
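# Example invocation (file names and column list are illustrative only):
csv_to_parquet(
    Path("trips.csv"),
    Path("trips.parquet"),
    delimiter=",",
    column_names=["id", "pickup", "dropoff"],
)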
v.collapse([2, 3])
v.levels
v.expand(2)

x = np.random.choice(list("abdcde"), size=10000, replace=True)
v.to_index(pd.Series(x))
v.to_categorical(pd.Series(x))
v.to_sparse(pd.Series(x))

## cycler

import io

data = """
v1,v2,v3
1,2,3
4,5,6
7,8,9
"""

with open('test.csv', 'w') as fout:
    fout.write(data)

from pyarrow import csv

opts = csv.ConvertOptions()
csv.read_csv('test.csv', convert_options=opts)

import pyarrow.dataset as ds

for chunk in csv.open_csv('test.csv'):
    ds.dataset(chunk)
def open_csv(self, *args, **kwargs):
    # Force use_threads=False on whatever ReadOptions the caller supplied
    # (or on a default ReadOptions if none was given), then delegate to the
    # module-level open_csv.
    read_options = kwargs.setdefault('read_options', ReadOptions())
    read_options.use_threads = False
    return open_csv(*args, **kwargs)
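# Usage sketch (assumes `wrapper` is an instance of the class defining the
# method above, and that ReadOptions/open_csv are imported from pyarrow.csv
# at module level):
reader = wrapper.open_csv("data.csv")  # streaming reader with threads disabled
for batch in reader:
    print(batch.num_rows)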