def test_open_dataset_partitioned_directory(tempdir):
    """Open a directory laid out as ``part=<n>/test.parquet`` with ``ds.dataset``.

    The same table is written into three ``part=...`` sub-directories. The
    directory is then opened without partitioning, with a discovered hive
    partitioning (factory and ``"hive"`` string short-cut), and with an
    explicit hive partitioning schema.
    """
    import pyarrow.parquet as pq

    table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5})
    root = str(tempdir)

    for part in range(3):
        part_dir = tempdir / f"part={part}"
        part_dir.mkdir()
        pq.write_table(table, part_dir / "test.parquet")

    # no partitioning specified, just read all individual files
    full_table = pa.concat_tables([table] * 3)
    _check_dataset_from_path(tempdir, full_table)

    # specify partition scheme with discovery
    discovered_schema = table.schema.append(pa.field("part", pa.int32()))
    dataset = ds.dataset(root, partitioning=ds.partitioning(flavor="hive"))
    assert dataset.schema.equals(discovered_schema, check_metadata=False)

    # specify partition scheme with string short-cut
    dataset = ds.dataset(root, partitioning="hive")
    assert dataset.schema.equals(discovered_schema, check_metadata=False)

    # specify partition scheme with explicit scheme
    explicit_partitioning = ds.partitioning(
        pa.schema([("part", pa.int8())]), flavor="hive")
    dataset = ds.dataset(root, partitioning=explicit_partitioning)
    explicit_schema = table.schema.append(pa.field("part", pa.int8()))
    assert dataset.schema.equals(explicit_schema, check_metadata=False)

    # every file contributes 9 rows, so each partition value repeats 9 times
    result = dataset.new_scan().finish().to_table()
    part_column = pa.array(np.repeat([0, 1, 2], 9), type=pa.int8())
    expected = full_table.append_column("part", part_column)
    assert result.replace_schema_metadata().equals(expected)
def test_partitioning_function():
    """Check what ``ds.partitioning`` returns or raises per argument combination."""
    schema = pa.schema([("year", pa.int16()), ("month", pa.int8())])
    names = ["year", "month"]

    # default DirectoryPartitioning
    from_schema = ds.partitioning(schema)
    assert isinstance(from_schema, ds.DirectoryPartitioning)

    from_names = ds.partitioning(names)
    assert isinstance(from_names, ds.PartitioningFactory)

    # needs schema or names
    with pytest.raises(ValueError):
        ds.partitioning()

    # Hive partitioning
    hive_from_schema = ds.partitioning(schema, flavor="hive")
    assert isinstance(hive_from_schema, ds.HivePartitioning)

    hive_discovered = ds.partitioning(flavor="hive")
    assert isinstance(hive_discovered, ds.PartitioningFactory)

    # cannot pass list of names
    with pytest.raises(ValueError):
        ds.partitioning(names, flavor="hive")

    # unsupported flavor
    with pytest.raises(ValueError):
        ds.partitioning(schema, flavor="unsupported")
def to_pyarrow_dataset(
    self,
    partitions: Optional[List[Tuple[str, str, Any]]] = None,
    filesystem: Optional[FileSystem] = None,
) -> pyarrow.dataset.Dataset:
    """
    Build a PyArrow Dataset using data from the DeltaTable.

    :param partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax
    :param filesystem: A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem
    :return: the PyArrow dataset in PyArrow
    """
    uris = (
        self._table.files_by_partitions(partitions)
        if partitions
        else self._table.file_uris()
    )
    parsed = [urlparse(uri) for uri in uris]

    # A table without any files still yields a (schema-only) dataset.
    if not parsed:
        return dataset(
            [],
            schema=self.pyarrow_schema(),
            partitioning=partitioning(flavor="hive"),
        )

    # Decide based on the first file, if the file is on cloud storage or local
    first = parsed[0]
    if not first.netloc:
        return dataset(
            uris,
            schema=self.pyarrow_schema(),
            format="parquet",
            filesystem=filesystem,
            partitioning=partitioning(flavor="hive"),
        )

    # pyarrow doesn't properly support the AWS_ENDPOINT_URL environment variable
    # for non-AWS S3 like resources. This is a slight hack until such a
    # point when pyarrow learns about AWS_ENDPOINT_URL
    endpoint_url = os.environ.get("AWS_ENDPOINT_URL")
    if endpoint_url:
        endpoint = urlparse(endpoint_url)
        # This format specific to the URL schema inference done inside
        # of pyarrow, consult their tests/dataset.py for examples
        query_str = f"?scheme={endpoint.scheme}&endpoint_override={endpoint.netloc}"
    else:
        query_str = ""

    # Only derive a filesystem URI when the caller did not supply one.
    if not filesystem:
        filesystem = f"{first.scheme}://{first.netloc}{query_str}"

    return dataset(
        [entry.path for entry in parsed],
        schema=self.pyarrow_schema(),
        filesystem=filesystem,
        partitioning=partitioning(flavor="hive"),
    )
def to_pyarrow_dataset(
    self, partitions: Optional[List[Tuple]] = None
) -> pyarrow.dataset.Dataset:
    """
    Build a PyArrow Dataset using data from the DeltaTable.

    :param partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax
    :return: the PyArrow dataset in PyArrow
    """
    if partitions is None:
        file_paths = self._table.file_paths()
    else:
        table_path = self._table.table_path()
        file_paths = [
            f"{table_path}/{file_name}"
            for file_name in self._table.files_by_partitions(partitions)
        ]
    paths = [urlparse(curr_file) for curr_file in file_paths]

    # An empty table (or a filter matching no files) has no first path to
    # inspect; return an empty dataset instead of failing on paths[0].
    if not paths:
        return dataset(
            [],
            schema=self.pyarrow_schema(),
            partitioning=partitioning(flavor="hive"),
        )

    # Decide based on the first file, if the file is on cloud storage or local
    if paths[0].netloc:
        query_str = ""
        # pyarrow doesn't properly support the AWS_ENDPOINT_URL environment variable
        # for non-AWS S3 like resources. This is a slight hack until such a
        # point when pyarrow learns about AWS_ENDPOINT_URL
        endpoint_url = os.environ.get("AWS_ENDPOINT_URL")
        # Truthiness check: an empty variable must not produce an empty
        # "?scheme=&endpoint_override=" override.
        if endpoint_url:
            endpoint = urlparse(endpoint_url)
            # This format specific to the URL schema inference done inside
            # of pyarrow, consult their tests/dataset.py for examples
            query_str += (
                f"?scheme={endpoint.scheme}&endpoint_override={endpoint.netloc}"
            )

        keys = [curr_file.path for curr_file in paths]
        return dataset(
            keys,
            schema=self.pyarrow_schema(),
            filesystem=f"{paths[0].scheme}://{paths[0].netloc}{query_str}",
            partitioning=partitioning(flavor="hive"),
        )
    else:
        return dataset(
            file_paths,
            schema=self.pyarrow_schema(),
            format="parquet",
            partitioning=partitioning(flavor="hive"),
        )
def from_path(cls, name: str, path: str, trial, sinks=None, partitioning=None) -> 'ClientFunctionModel':
    """
    Create a command line interface client for a meillionen model

    :param name: name of the model, used for storage
    :param path: local path to the model
    :param trial: passed through unchanged to the constructor
    :param sinks: optional sinks, defaults to an empty dict; handed to
        ``cls.infer_sinks`` together with the model interface
    :param partitioning: optional partitioning, defaults to a partitioning
        over an empty schema
    :return: A client to call a meillionen model
    """
    sinks = {} if sinks is None else sinks
    if partitioning is None:
        partitioning = dataset.partitioning(pa.schema([]))

    # Ask the model's CLI for its interface description and wrap it.
    interface = FuncInterfaceClient.from_recordbatch(
        name=name,
        recordbatch=client_create_interface_from_cli(path),
    )
    resolved_sinks = cls.infer_sinks(interface, sinks)
    return cls(
        interface=interface,
        path=path,
        sinks=resolved_sinks,
        trial=trial,
        partitioning=partitioning,
    )
def test_with_partition_pruning():
    """Compare a filtered, projected read via RadosParquetFileFormat with plain parquet.

    Both datasets point at the same hive-partitioned location; the resulting
    pandas frames must be equal.
    """
    if skip:
        return

    location = "file:///mnt/cephfs/nyc/"
    projection_cols = ['payment_type', 'tip_amount', 'VendorID']
    filter_expression = (
        (ds.field('tip_amount') > 10)
        & (ds.field('payment_type') > 2)
        & (ds.field('VendorID') > 1)
    )
    partition_schema = pa.schema(
        [("payment_type", pa.int32()), ("VendorID", pa.int32())])
    partitioning = ds.partitioning(partition_schema, flavor="hive")

    rados_parquet_dataset = ds.dataset(
        location,
        partitioning=partitioning,
        format=ds.RadosParquetFileFormat("/etc/ceph/ceph.conf"))
    parquet_dataset = ds.dataset(
        location, partitioning=partitioning, format="parquet")

    rados_table = rados_parquet_dataset.to_table(
        columns=projection_cols, filter=filter_expression)
    rados_parquet_df = rados_table.to_pandas()

    parquet_table = parquet_dataset.to_table(
        columns=projection_cols, filter=filter_expression)
    parquet_df = parquet_table.to_pandas()

    assert rados_parquet_df.equals(parquet_df) == 1
def test_fragments_implicit_cast(tempdir):
    """Filter fragments on an int8 partition field using a plain Python int.

    Regression test for ARROW-8693: of the two ``part`` partitions written
    (values 1 and 2), exactly one fragment must match ``part >= 2``.
    """
    # ARROW-8693
    import pyarrow.parquet as pq

    table = pa.table([range(8), [1] * 4 + [2] * 4], names=['col', 'part'])
    dataset_path = str(tempdir / "test_parquet_dataset")
    pq.write_to_dataset(table, dataset_path, partition_cols=["part"])

    hive_part = ds.partitioning(pa.schema([('part', 'int8')]), flavor="hive")
    dataset = ds.dataset(dataset_path, format="parquet", partitioning=hive_part)

    matching = list(dataset.get_fragments(filter=ds.field("part") >= 2))
    assert len(matching) == 1
def write_parquet(
    fs: fsspec.AbstractFileSystem,
    path: str,
    df: pd.DataFrame,
    partition_cols: Optional[List[str]],
    schema: pa.Schema,
    **kwargs,
):
    """
    Write a single dataframe to parquet.

    The frame is converted to a pyarrow table using ``schema``, written under
    ``path`` with ``ds.write_dataset`` (hive-partitioned when
    ``partition_cols`` is given), followed by a ``_common_metadata`` file and,
    if any were produced, the partition column mappings. Extra ``kwargs`` are
    forwarded to ``ds.write_dataset``.
    """
    # Check partition values are valid before writing to parquet
    mappings = check_partition_columns(df=df, partition_columns=partition_cols)
    df = clean_partition_cols(df=df, mappings=mappings)

    # Dataframe -> pyarrow Table
    table = pa.Table.from_pandas(df, schema=schema)

    # Default file name: first and last ``ts_init`` value of the frame
    if "basename_template" not in kwargs and "ts_init" in df.columns:
        ts_init = df["ts_init"]
        kwargs["basename_template"] = (
            f"{ts_init.iloc[0]}-{ts_init.iloc[-1]}" + "-{i}.parquet"
        )

    # Write the actual file
    if partition_cols:
        partition_fields = [table.schema.field(col) for col in partition_cols]
        partitions = ds.partitioning(
            schema=pa.schema(fields=partition_fields),
            flavor="hive",
        )
    else:
        partitions = None

    ds.write_dataset(
        data=table,
        base_dir=path,
        filesystem=fs,
        partitioning=partitions,
        format="parquet",
        **kwargs,
    )

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(
        table.schema,
        f"{path}/_common_metadata",
        version="2.0",
        filesystem=fs,
    )

    # Write out any partition columns we had to modify due to filesystem requirements
    if mappings:
        write_partition_column_mappings(fs=fs, path=path, mappings=mappings)
def partitioning(self) -> t.Optional[pds.Partitioning]:
    """Build the partitioning for ``self.partition_cols``.

    Returns ``None`` when no partition columns are configured. Otherwise the
    partition fields are taken from ``self.config.schema``; a missing schema
    raises ``e.code.CodingError``.
    """
    if self.partition_cols is None:
        return None

    table_schema = self.config.schema
    if table_schema is not None:
        # Select the partition columns from an empty table to obtain a
        # schema restricted to exactly those fields.
        # noinspection PyUnresolvedReferences
        partition_view = table_schema.empty_table().select(self.partition_cols)
        return pds.partitioning(partition_view.schema)

    raise e.code.CodingError(
        msgs=[
            f"Ideally by now this should be set by now if not "
            f"available",
            f"That is possible on first write where table schema not "
            f"provided it is estimated while writing and stored in "
            f"config file"
        ]
    )
def write_deltalake(
    table_or_uri: Union[str, DeltaTable],
    data: Union[
        pd.DataFrame,
        pa.Table,
        pa.RecordBatch,
        Iterable[pa.RecordBatch],
        RecordBatchReader,
    ],
    schema: Optional[pa.Schema] = None,
    partition_by: Optional[List[str]] = None,
    filesystem: Optional[pa_fs.FileSystem] = None,
    mode: Literal["error", "append", "overwrite", "ignore"] = "error",
    file_options: Optional[ds.ParquetFileWriteOptions] = None,
    max_open_files: int = 1024,
    max_rows_per_file: int = 0,
    min_rows_per_group: int = 0,
    max_rows_per_group: int = 1048576,
    name: Optional[str] = None,
    description: Optional[str] = None,
    configuration: Optional[Mapping[str, Optional[str]]] = None,
) -> None:
    """Write to a Delta Lake table (Experimental)

    If the table does not already exist, it will be created.

    This function only supports protocol version 1 currently. If attempting
    to write to an existing table with a higher min_writer_version, this
    function will throw DeltaTableProtocolError.

    Note that this function does NOT register this table in a data catalog.

    :param table_or_uri: URI of a table or a DeltaTable object.
    :param data: Data to write. If passing iterable, the schema must also be given.
    :param schema: Optional schema to write.
    :param partition_by: List of columns to partition the table by. Only required
        when creating a new table. When writing to an existing table it must
        equal the table's partition columns.
    :param filesystem: Optional filesystem to pass to PyArrow. If not provided will
        be inferred from uri. Currently not passed through (see TODO below).
    :param mode: How to handle existing data. Default is to error if table already exists.
        If 'append', will add new data.
        If 'overwrite', will replace table with new data.
        If 'ignore', will not write anything if table already exists.
    :param file_options: Optional write options for Parquet (ParquetFileWriteOptions).
        Can be provided with defaults using ParquetFileWriteOptions().make_write_options().
        Please refer to https://github.com/apache/arrow/blob/master/python/pyarrow/_dataset_parquet.pyx#L492-L533
        for the list of available options
    :param max_open_files: Limits the maximum number of
        files that can be left open while writing. If an attempt is made to open
        too many files then the least recently used file will be closed.
        If this setting is set too low you may end up fragmenting your
        data into many small files.
    :param max_rows_per_file: Maximum number of rows per file.
        If greater than 0 then this will limit how many rows are placed in any single file.
        Otherwise there will be no limit and one file will be created in each output directory
        unless files need to be closed to respect max_open_files
    :param min_rows_per_group: Minimum number of rows per group. When the value is set,
        the dataset writer will batch incoming data and only write the row groups to the disk
        when sufficient rows have accumulated.
    :param max_rows_per_group: Maximum number of rows per group.
        If the value is set, then the dataset writer may split up large incoming batches into multiple row groups.
        If this value is set, then min_rows_per_group should also be set.
    :param name: User-provided identifier for this table.
    :param description: User-provided description for this table.
    :param configuration: A map containing configuration options for the metadata action.
    :raises ValueError: if ``data`` is a plain iterable and no ``schema`` is given.
    :raises AssertionError: if the table already exists and ``mode`` is "error", or if
        ``partition_by`` does not match the existing table's partition columns.
    :raises DeltaTableProtocolError: if the existing table requires a writer version above 1.
    """
    if isinstance(data, pd.DataFrame):
        data = pa.Table.from_pandas(data)

    if schema is None:
        if isinstance(data, RecordBatchReader):
            schema = data.schema
        elif isinstance(data, Iterable):
            raise ValueError("You must provide schema if data is Iterable")
        else:
            schema = data.schema

    if isinstance(table_or_uri, str):
        table = try_get_deltatable(table_or_uri)
        table_uri = table_or_uri
    else:
        table = table_or_uri
        table_uri = table._table.table_uri()

    # TODO: Pass through filesystem once it is complete
    # if filesystem is None:
    #    filesystem = pa_fs.PyFileSystem(DeltaStorageHandler(table_uri))

    if table:  # already exists
        if mode == "error":
            raise AssertionError("DeltaTable already exists.")
        elif mode == "ignore":
            return

        current_version = table.version()

        # Raised explicitly (not via ``assert``) so the check also runs under
        # ``python -O``; the exception type is unchanged.
        if partition_by and partition_by != table.metadata().partition_columns:
            raise AssertionError(
                f"partition_by {partition_by} does not match the partition "
                "columns of the existing table."
            )

        if table.protocol().min_writer_version > 1:
            raise DeltaTableProtocolError(
                "This table's min_writer_version is "
                f"{table.protocol().min_writer_version}, "
                "but this method only supports version 1."
            )
    else:  # creating a new table
        current_version = -1

        # TODO: Don't allow writing to non-empty directory
        # Blocked on: Finish filesystem implementation in fs.py
        #   assert len(filesystem.get_file_info(pa_fs.FileSelector(table_uri, allow_not_found=True))) == 0

    if partition_by:
        partition_schema = pa.schema(
            [schema.field(column) for column in partition_by]
        )
        partitioning = ds.partitioning(partition_schema, flavor="hive")
    else:
        partitioning = None

    add_actions: List[AddAction] = []

    def visitor(written_file: Any) -> None:
        # Called by write_dataset for each written file; records one add action.
        partition_values = get_partitions_from_path(table_uri, written_file.path)
        stats = get_file_stats_from_metadata(written_file.metadata)

        add_actions.append(
            AddAction(
                written_file.path,
                written_file.metadata.serialized_size,
                partition_values,
                int(datetime.now().timestamp()),
                True,
                json.dumps(stats, cls=DeltaJSONEncoder),
            )
        )

    ds.write_dataset(
        data,
        base_dir=table_uri,
        basename_template=f"{current_version + 1}-{uuid.uuid4()}-{{i}}.parquet",
        format="parquet",
        partitioning=partitioning,
        # It will not accept a schema if using a RBR
        schema=schema if not isinstance(data, RecordBatchReader) else None,
        file_visitor=visitor,
        existing_data_behavior="overwrite_or_ignore",
        file_options=file_options,
        max_open_files=max_open_files,
        max_rows_per_file=max_rows_per_file,
        min_rows_per_group=min_rows_per_group,
        max_rows_per_group=max_rows_per_group,
    )

    if table is None:
        _write_new_deltalake(  # type: ignore[call-arg]
            table_uri,
            schema,
            add_actions,
            mode,
            partition_by or [],
            name,
            description,
            configuration,
        )
    else:
        table._table.create_write_transaction(
            add_actions,
            mode,
            partition_by or [],
        )
def write_parquet(
    fs: fsspec.AbstractFileSystem,
    path: str,
    df: pd.DataFrame,
    partition_cols: Optional[List[str]],
    schema: pa.Schema,
    **kwargs,
):
    """
    Write a single dataframe to parquet.

    The frame is converted to a pyarrow table using ``schema`` and written
    under ``path`` with ``ds.write_dataset`` (hive-partitioned when
    ``partition_cols`` is given). Every parquet file that appeared during the
    write is then re-read, sorted by ``ts_init`` when that column exists, and
    rewritten in place. Finally the ``_common_metadata`` file and any
    partition column mappings are written. Extra ``kwargs`` are forwarded to
    ``ds.write_dataset``.
    """
    # Check partition values are valid before writing to parquet
    mappings = check_partition_columns(df=df, partition_columns=partition_cols)
    df = clean_partition_cols(df=df, mappings=mappings)

    # Dataframe -> pyarrow Table
    table = pa.Table.from_pandas(df, schema=schema)

    if "basename_template" not in kwargs and "ts_init" in df.columns:
        kwargs["basename_template"] = (
            f"{df['ts_init'].min()}-{df['ts_init'].max()}" + "-{i}.parquet"
        )

    # Write the actual file
    partitions = (
        ds.partitioning(
            schema=pa.schema(fields=[table.schema.field(c) for c in partition_cols]),
            flavor="hive",
        )
        if partition_cols
        else None
    )

    # Compare the numeric major version: a plain string comparison orders
    # "10.0.0" before "6.0.0" and would skip this for pyarrow >= 10.
    if int(pa.__version__.split(".")[0]) >= 6:
        kwargs.update(existing_data_behavior="overwrite_or_ignore")

    # Snapshot existing entries so the files created by this call can be found
    files = set(fs.glob(f"{path}/**"))
    ds.write_dataset(
        data=table,
        base_dir=path,
        filesystem=fs,
        partitioning=partitions,
        format="parquet",
        **kwargs,
    )

    # Ensure data written by write_dataset is sorted
    new_files = set(fs.glob(f"{path}/**/*.parquet")) - files
    del df  # drop our reference to the input frame before re-reading files
    for fn in new_files:
        # Close the read handle before the same path is rewritten below.
        with fs.open(fn) as handle:
            ndf = pd.read_parquet(handle)
        # assert ndf.shape[0] == shape
        if "ts_init" in ndf.columns:
            ndf = ndf.sort_values("ts_init").reset_index(drop=True)
        pq.write_table(
            table=pa.Table.from_pandas(ndf),
            where=fn,
            filesystem=fs,
        )

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(table.schema, f"{path}/_common_metadata", version="2.6", filesystem=fs)

    # Write out any partition columns we had to modify due to filesystem requirements
    if mappings:
        write_partition_column_mappings(fs=fs, path=path, mappings=mappings)
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Example: read a hive-partitioned dataset through RadosParquetFileFormat,
# projecting three columns and keeping only rows with payment_type > 2.

# Ceph config path plus a second argument (presumably the data pool name;
# confirm against the RadosParquetFileFormat signature).
format_ = ds.RadosParquetFileFormat("/etc/ceph/ceph.conf", "cephfs-data0")

# Both partition keys are read as int32 from the directory names.
partitioning_ = ds.partitioning(
    pa.schema([
        ("payment_type", pa.int32()),
        ("VendorID", pa.int32()),
    ]),
    flavor="hive",
)

dataset_ = ds.dataset(
    "file:///mnt/cephfs/nyc",
    partitioning=partitioning_,
    format=format_,
)

print(
    dataset_.to_table(
        columns=['total_amount', 'DOLocationID', 'payment_type'],
        filter=(ds.field('payment_type') > 2),
    ).to_pandas()
)