Example #1
    def scan_file(self, bucket, key, schema):
        logging.info(f"delim is {self.delimiter}")
        uri = f"{bucket}/{key}"
        s3fs = fs.S3FileSystem()
        # Run column order validation by opening and not reading anything.
        filestream = s3fs.open_input_stream(uri)
        parse_opts = csv.ParseOptions(delimiter=self.delimiter)
        reader = csv.open_csv(filestream, parse_options=parse_opts)
        for index, col in enumerate(reader.schema):
            if col.name != schema[index].name:
                msg = "column {} is out of order".format(col.name)
                raise ColumnOrderException(msg)
        # Run the rest of the validations.
        filestream = s3fs.open_input_stream(uri)
        opts = csv.ConvertOptions(column_types=schema)
        reader = csv.open_csv(filestream,
                              convert_options=opts,
                              parse_options=parse_opts)
        # Kind of a hack, but it works: if the delimiter is wrong, everything
        # is read as one column.
        if len(schema) > 1 and len(reader.schema) == 1:
            raise WrongDelimiterException()
        # Parse through the file; pyarrow will throw exceptions
        # if there's invalid data.
        for batch in reader:
            # If the primary key is a string, check the column
            # for empty strings.
            if schema.field(self.primary_key).type == "string":
                table = pyarrow.Table.from_batches([batch])
                for val in table[self.primary_key]:
                    if val.as_py() == "":
                        raise EmptyPrimaryKeyException()
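
The "everything is read as one column" heuristic above is easy to see in isolation. A minimal sketch, not from the original project, using an in-memory buffer instead of S3: when the configured delimiter does not match the data, pyarrow parses each row into a single column.

import io

from pyarrow import csv

# Semicolon-delimited data read with a comma delimiter (illustrative data only).
data = io.BytesIO(b"a;b;c\n1;2;3\n")
reader = csv.open_csv(data, parse_options=csv.ParseOptions(delimiter=","))
print(reader.schema.names)  # ['a;b;c'] -- a single column, so the delimiter was wrong
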
Example #2
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        arrow_schema = pa.schema(get_fields(table))
        column_names = [name for name, dtype in get_fields(table)]

        read_options = pcsv.ReadOptions(column_names=column_names, block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(column_types=arrow_schema, timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
                                              true_values=["1", "T"], false_values=["0", "F"], strings_can_be_null=True)

        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='zstd',
                                          version="2.0", use_dictionary=True)
        stream_reader = pcsv.open_csv(extract_file, read_options=read_options, parse_options=parse_options,
                                      convert_options=convert_options)
        for batch in stream_reader:
            table = pa.Table.from_batches([batch])
            parquet_writer.write_table(table)
        parquet_writer.close()
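
get_fields() above is project-specific; it presumably returns (name, pyarrow type) pairs for a SQLAlchemy table, which pa.schema() accepts directly. A minimal stand-in with made-up columns, just to show the shapes involved:

import pyarrow as pa

def get_fields(table):
    # Hypothetical stand-in for the project-specific helper used above.
    return [("id", pa.int64()), ("name", pa.string()), ("created", pa.timestamp("s"))]

arrow_schema = pa.schema(get_fields(None))
column_names = [name for name, dtype in get_fields(None)]
print(arrow_schema.names)  # ['id', 'name', 'created']
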
Example #3
    def stream_records(
            self, file: Union[TextIO, BinaryIO]) -> Iterator[Mapping[str, Any]]:
        """
        https://arrow.apache.org/docs/python/generated/pyarrow.csv.open_csv.html
        PyArrow returns lists of values for each column, so we zip() these up into records, which we then yield.
        """
        streaming_reader = pa_csv.open_csv(
            file,
            pa.csv.ReadOptions(**self._read_options()),
            pa.csv.ParseOptions(**self._parse_options()),
            pa.csv.ConvertOptions(
                **self._convert_options(self._master_schema)),
        )
        still_reading = True
        while still_reading:
            try:
                batch = streaming_reader.read_next_batch()
            except StopIteration:
                still_reading = False
            else:
                batch_dict = batch.to_pydict()
                batch_columns = [col_info.name for col_info in batch.schema]
                # This gives us a list of lists where each nested list holds ordered values for a single column,
                # e.g. [ [1,2,3], ["a", "b", "c"], [True, True, False] ]
                columnwise_record_values = [
                    batch_dict[column] for column in batch_columns
                ]
                # We zip this to get row-by-row values, e.g. [ [1, "a", True], [2, "b", True], [3, "c", False] ]
                for record_values in zip(*columnwise_record_values):
                    # Build each record of {col: value, ...} by dict comprehension over all cols in batch_columns.
                    yield {
                        batch_columns[i]: record_values[i]
                        for i in range(len(batch_columns))
                    }
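
A tiny illustration, with made-up data, of the transposition the loop above performs (columnar RecordBatch layout to per-row dicts):

batch_columns = ["id", "name", "active"]
columnwise_record_values = [[1, 2, 3], ["a", "b", "c"], [True, True, False]]
rows = [dict(zip(batch_columns, values)) for values in zip(*columnwise_record_values)]
print(rows)  # [{'id': 1, 'name': 'a', 'active': True}, {'id': 2, 'name': 'b', ...}, ...]
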
Example #4
def csv_reader_infer_nb_arrow_type(
    filepath_or_buffer, delimiter=',', names=None, usecols=None, dtype=None, skiprows=None, parse_dates=False
):

    read_opts, parse_opts, convert_opts = get_pyarrow_read_csv_options(
                                                delimiter, names, usecols, dtype, skiprows, parse_dates)
    csv_reader = csv.open_csv(filepath_or_buffer,
                              read_options=read_opts,
                              parse_options=parse_opts,
                              convert_options=convert_opts)

    table_schema = csv_reader.schema

    nb_arrow_column_types = []
    for i, pa_data_type in enumerate(table_schema.types):
        nb_type = numpy_support.from_dtype(pa_data_type.to_pandas_dtype())

        if isinstance(nb_type, types.PyObject):
            if pa_data_type == pa.string():
                nb_type = StdStringViewType()
            else:
                raise TypingError(f"Cannot infer numba type for {pa_data_type} of column={table_schema.names[i]}")

        nb_arrow_column_types.append(nb_type)

    table_column_names = table_schema.names if not names else (names if usecols is None else usecols)

    arrow_table_type = ArrowTableType(nb_arrow_column_types, table_column_names)
    return arrow_table_type
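
The property this function relies on is that csv.open_csv() infers column types from the first block of the file, so reader.schema is available without scanning everything. A standalone sketch, assuming some local data.csv exists:

from pyarrow import csv

reader = csv.open_csv("data.csv")  # hypothetical local file
for name, pa_type in zip(reader.schema.names, reader.schema.types):
    print(name, pa_type, pa_type.to_pandas_dtype())
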
Example #5
def _csv_to_table(file,
                  metadata=None,
                  lat=None,
                  lon=None,
                  geom=None,
                  crs=None,
                  **kwargs):
    """Yields an arrow table from a stream of CSV data.
    Parameters:
        file (string): The full path of the input file.
        metadata (dict): Metadata to be written in the arrow table.
        lat (string): The column name of latitude (applies only to CSV).
        lon (string): The column name of longitude (applies only to CSV).
        geom (string): The column name of WKT geometry (applies only to CSV).
        crs (string): The dataset native crs (default: read from file).
        **kwargs: Extra keyword arguments used by the CSV reader
            (see https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html).
    Yields:
        (object) Arrow table with spatial features.
    """
    parse_options = _parse_options_from_dict(**kwargs)
    read_options = _read_options_from_dict(**kwargs)
    convert_options = _convert_options_from_dict(**kwargs)
    if lat is not None and lon is not None:
        type_of_geom = 'latlon'
    elif geom is not None:
        type_of_geom = 'wkt'
    else:
        type_of_geom = None
    batches = csv.open_csv(file,
                           read_options=read_options,
                           parse_options=parse_options,
                           convert_options=convert_options)
    if type_of_geom is None:
        type_of_geom, geom, lat, lon = _get_geom_info(batches.schema.names)
    print('Opened file %s, using pyarrow CSV reader.' %
          (os.path.basename(file)))

    eof = False
    while not eof:
        try:
            batch = batches.read_next_batch()
        except StopIteration:
            eof = True
        else:
            table = pa.Table.from_batches([batch])
            try:
                if type_of_geom == 'latlon':
                    table = _geometry_from_latlon(table, lat, lon, crs=crs)
                else:
                    table = _geometry_from_wkt(table, geom, crs=crs)
            # Not spatial file
            except TypeError:
                pass
            except KeyError:
                pass
            else:
                table = table.replace_schema_metadata(metadata=metadata)
            yield table
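
Hypothetical usage of the generator above, assuming a points.csv whose coordinate columns are named "lat" and "lon"; each yielded table corresponds to one block read by open_csv:

for table in _csv_to_table("points.csv", lat="lat", lon="lon", crs="EPSG:4326"):
    print(table.num_rows, table.schema.names)
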
Example #6
    def _read_stream(self, f: "pyarrow.NativeFile", path: str,
                     **reader_args) -> Iterator[Block]:
        import pyarrow
        from pyarrow import csv

        read_options = reader_args.pop("read_options",
                                       csv.ReadOptions(use_threads=False))
        reader = csv.open_csv(f, read_options=read_options, **reader_args)
        schema = None
        while True:
            try:
                batch = reader.read_next_batch()
                table = pyarrow.Table.from_batches([batch], schema=schema)
                if schema is None:
                    schema = table.schema
                yield table
            except StopIteration:
                return
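
The f argument above is a pyarrow.NativeFile that the datasource machinery normally supplies. A sketch of obtaining one by hand from the local filesystem (file name assumed):

import pyarrow.fs as pafs
from pyarrow import csv

local = pafs.LocalFileSystem()
with local.open_input_stream("data.csv") as f:  # hypothetical local file
    reader = csv.open_csv(f, read_options=csv.ReadOptions(use_threads=False))
    for batch in reader:
        print(batch.num_rows)
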
Example #7
    def read_csv(self, filenames, delimiter=','):
        global parquet_writer
        parquet_writer = None
        for file in filenames:
            csv_reader = csv.open_csv(
                file,
                read_options=csv.ReadOptions(use_threads=True),
                parse_options=csv.ParseOptions(delimiter=delimiter),
                convert_options=csv.ConvertOptions(column_types=self.dtype))
            # Open the Parquet writer once, from the first file's schema;
            # re-creating it per input file would truncate the output file.
            if parquet_writer is None:
                parquet_writer = pq.ParquetWriter(self.parquet_file,
                                                  csv_reader.schema)

            nrow = 0
            for batch in csv_reader:
                batch_df = batch.to_pandas()
                nrow += batch_df.shape[0]
                parquet_writer.write_table(pa.Table.from_pandas(df=batch_df))

        parquet_writer.close()
        return ds.dataset(self.parquet_file, format="parquet")
Example #8
def csv_to_parquet(
    csv_file: Path,
    parquet_file: Path,
    *,
    delimiter: str,
    column_names: List[str],
    quiet: bool = False,
) -> None:
    block_size = 1 << 24  # 16 MB
    read_options = csv.ReadOptions(column_names=column_names, block_size=block_size)
    parse_options = csv.ParseOptions(delimiter=delimiter)
    writer = None
    with csv.open_csv(
        csv_file, read_options=read_options, parse_options=parse_options
    ) as csv_reader:
        for batch in tqdm(csv_reader, disable=quiet):
            if writer is None:
                writer = pq.ParquetWriter(parquet_file, csv_reader.schema, compression="zstd")
            table = pa.Table.from_batches([batch])
            writer.write_table(table)
    if writer is not None:
        writer.close()
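
Hypothetical invocation of the helper above; because ReadOptions(column_names=...) is set, the input is treated as headerless and its first line is parsed as data:

from pathlib import Path

csv_to_parquet(
    Path("events.tsv"),        # hypothetical tab-separated, headerless input
    Path("events.parquet"),
    delimiter="\t",
    column_names=["ts", "user_id", "event"],
    quiet=True,
)
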
Example #9
# NOTE: `v`, numpy (np) and pandas (pd) are presumably defined/imported earlier in this scratch file.
v.collapse([2, 3])
v.levels

v.expand(2)

x = np.random.choice(list("abdcde"), size=10000, replace=True)
v.to_index(pd.Series(x))
v.to_categorical(pd.Series(x))
v.to_sparse(pd.Series(x))



## cycler
import io
data = """
v1,v2,v3
1,2,3
4,5,6
7,8,9
"""

with open('test.csv', 'w') as fout:
    fout.write(data)

from pyarrow import csv
opts = csv.ConvertOptions()  # instantiate the options (the original left the class uncalled)
table = csv.read_csv('test.csv', convert_options=opts)

import pyarrow.dataset as ds
# open_csv() yields RecordBatches; newer pyarrow lets ds.dataset() wrap in-memory data like these.
for chunk in csv.open_csv('test.csv'):
    ds.dataset(chunk)
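
A more conventional pattern for the last loop, as a sketch: drain the streaming reader into a single Table and wrap that as one in-memory dataset, rather than creating a dataset per batch:

import pyarrow.dataset as ds
from pyarrow import csv

with csv.open_csv('test.csv') as reader:
    table = reader.read_all()
dataset = ds.dataset(table)
print(dataset.count_rows())
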
Example #10
    def open_csv(self, *args, **kwargs):
        # Wrapper around pyarrow.csv.open_csv (assumes `from pyarrow.csv import ReadOptions, open_csv`)
        # that forces single-threaded reads, whether or not the caller supplied its own ReadOptions.
        read_options = kwargs.setdefault('read_options', ReadOptions())
        read_options.use_threads = False
        return open_csv(*args, **kwargs)