Example #1
def test_open_dataset_partitioned_directory(tempdir):
    import pyarrow.parquet as pq
    table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5})
    for part in range(3):
        path = tempdir / "part={0}".format(part)
        path.mkdir()
        pq.write_table(table, path / "test.parquet")

    # no partitioning specified, just read all individual files
    full_table = pa.concat_tables([table] * 3)
    _check_dataset_from_path(tempdir, full_table)

    # specify partition scheme with discovery
    dataset = ds.dataset(str(tempdir),
                         partitioning=ds.partitioning(flavor="hive"))
    expected_schema = table.schema.append(pa.field("part", pa.int32()))
    assert dataset.schema.equals(expected_schema, check_metadata=False)

    # specify partition scheme with string short-cut
    dataset = ds.dataset(str(tempdir), partitioning="hive")
    assert dataset.schema.equals(expected_schema, check_metadata=False)

    # specify partition scheme with explicit scheme
    dataset = ds.dataset(str(tempdir),
                         partitioning=ds.partitioning(pa.schema([("part",
                                                                  pa.int8())]),
                                                      flavor="hive"))
    expected_schema = table.schema.append(pa.field("part", pa.int8()))
    assert dataset.schema.equals(expected_schema, check_metadata=False)

    result = dataset.new_scan().finish().to_table()
    expected = full_table.append_column(
        "part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8()))
    assert result.replace_schema_metadata().equals(expected)
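The test above relies on pyarrow's test helpers and the older new_scan() scanner API. A hedged, self-contained sketch of the same idea using only public pyarrow calls (the temporary directory and column names here are chosen for illustration):

import pathlib, tempfile
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

tmp = pathlib.Path(tempfile.mkdtemp())
table = pa.table({'a': range(3)})
for part in range(2):
    part_dir = tmp / f"part={part}"            # hive-style key=value directory names
    part_dir.mkdir()
    pq.write_table(table, part_dir / "data.parquet")

# Hive-style discovery infers a "part" column from the directory names
dataset = ds.dataset(str(tmp), partitioning="hive")
print(dataset.to_table())                      # in recent pyarrow, to_table() replaces new_scan()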
Example #2
def test_partitioning_function():
    schema = pa.schema([("year", pa.int16()), ("month", pa.int8())])
    names = ["year", "month"]

    # default DirectoryPartitioning

    part = ds.partitioning(schema)
    assert isinstance(part, ds.DirectoryPartitioning)
    part = ds.partitioning(names)
    assert isinstance(part, ds.PartitioningFactory)
    # needs schema or names
    with pytest.raises(ValueError):
        ds.partitioning()

    # Hive partitioning

    part = ds.partitioning(schema, flavor="hive")
    assert isinstance(part, ds.HivePartitioning)
    part = ds.partitioning(flavor="hive")
    assert isinstance(part, ds.PartitioningFactory)
    # cannot pass list of names
    with pytest.raises(ValueError):
        ds.partitioning(names, flavor="hive")

    # unsupported flavor
    with pytest.raises(ValueError):
        ds.partitioning(schema, flavor="unsupported")
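As a hedged aside, the practical difference between the default DirectoryPartitioning and the Hive flavor tested above is how path segments are interpreted. A minimal sketch (the parse() call mirrors the pyarrow documentation examples):

import pyarrow as pa
import pyarrow.dataset as ds

schema = pa.schema([("year", pa.int16()), ("month", pa.int8())])

directory_part = ds.partitioning(schema)               # expects paths like .../2009/11/...
hive_part = ds.partitioning(schema, flavor="hive")     # expects .../year=2009/month=11/...

print(directory_part.parse("/2009/11"))                # an expression like (year == 2009) & (month == 11)
print(hive_part.parse("/year=2009/month=11"))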
Example #3
    def to_pyarrow_dataset(
        self,
        partitions: Optional[List[Tuple[str, str, Any]]] = None,
        filesystem: Optional[FileSystem] = None,
    ) -> pyarrow.dataset.Dataset:
        """
        Build a PyArrow Dataset using data from the DeltaTable.

        :param partitions: A list of partition filters; see help(DeltaTable.files_by_partitions) for the filter syntax
        :param filesystem: A concrete implementation of the PyArrow FileSystem or an fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem
        :return: the PyArrow dataset
        """
        if not partitions:
            file_paths = self._table.file_uris()
        else:
            file_paths = self._table.files_by_partitions(partitions)
        paths = [urlparse(curr_file) for curr_file in file_paths]

        empty_delta_table = len(paths) == 0
        if empty_delta_table:
            return dataset(
                [],
                schema=self.pyarrow_schema(),
                partitioning=partitioning(flavor="hive"),
            )

        # Decide based on the first file, if the file is on cloud storage or local
        if paths[0].netloc:
            query_str = ""
            # pyarrow doesn't properly support the AWS_ENDPOINT_URL environment variable
            # for non-AWS, S3-like resources. This is a slight hack until
            # pyarrow learns about AWS_ENDPOINT_URL
            endpoint_url = os.environ.get("AWS_ENDPOINT_URL")
            if endpoint_url:
                endpoint = urlparse(endpoint_url)
                # This format is specific to the URL scheme inference done inside
                # of pyarrow; consult their tests/dataset.py for examples
                query_str += (
                    f"?scheme={endpoint.scheme}&endpoint_override={endpoint.netloc}"
                )
            if not filesystem:
                filesystem = f"{paths[0].scheme}://{paths[0].netloc}{query_str}"

            keys = [curr_file.path for curr_file in paths]
            return dataset(
                keys,
                schema=self.pyarrow_schema(),
                filesystem=filesystem,
                partitioning=partitioning(flavor="hive"),
            )
        else:
            return dataset(
                file_paths,
                schema=self.pyarrow_schema(),
                format="parquet",
                filesystem=filesystem,
                partitioning=partitioning(flavor="hive"),
            )
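A hedged usage sketch for the method above (the table URI is hypothetical; DeltaTable is assumed to be importable from the deltalake package this snippet belongs to):

from deltalake import DeltaTable

dt = DeltaTable("s3://my-bucket/my-delta-table")       # hypothetical URI
dataset = dt.to_pyarrow_dataset()
# The result is an ordinary pyarrow.dataset.Dataset, so the usual scan API applies
table = dataset.to_table()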
Example #4
    def to_pyarrow_dataset(
            self,
            partitions: Optional[List[Tuple]] = None
    ) -> pyarrow.dataset.Dataset:
        """
        Build a PyArrow Dataset using data from the DeltaTable.

        :param partitions: A list of partition filters; see help(DeltaTable.files_by_partitions) for the filter syntax
        :return: the PyArrow dataset
        """
        if partitions is None:
            file_paths = self._table.file_paths()
        else:
            table_path = self._table.table_path()
            file_paths = [
                f"{table_path}/{file_name}"
                for file_name in self._table.files_by_partitions(partitions)
            ]
        paths = [urlparse(curr_file) for curr_file in file_paths]

        # Decide based on the first file, if the file is on cloud storage or local
        if paths[0].netloc:
            query_str = ""
            # pyarrow doesn't properly support the AWS_ENDPOINT_URL environment variable
            # for non-AWS, S3-like resources. This is a slight hack until
            # pyarrow learns about AWS_ENDPOINT_URL
            endpoint_url = os.environ.get("AWS_ENDPOINT_URL")
            if endpoint_url is not None:
                endpoint = urlparse(endpoint_url)
                # This format is specific to the URL scheme inference done inside
                # of pyarrow; consult their tests/dataset.py for examples
                query_str += (
                    f"?scheme={endpoint.scheme}&endpoint_override={endpoint.netloc}"
                )

            keys = [curr_file.path for curr_file in paths]
            return dataset(
                keys,
                schema=self.pyarrow_schema(),
                filesystem=f"{paths[0].scheme}://{paths[0].netloc}{query_str}",
                partitioning=partitioning(flavor="hive"),
            )
        else:
            return dataset(
                file_paths,
                schema=self.pyarrow_schema(),
                format="parquet",
                partitioning=partitioning(flavor="hive"),
            )
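For this older variant, a hedged sketch of the partitions argument; the tuple syntax mirrors files_by_partitions, and the path, column name, and value here are hypothetical:

from deltalake import DeltaTable

dt = DeltaTable("/data/my-delta-table")                          # hypothetical local path
# Only read files belonging to the year=2021 partition (column name is hypothetical)
dataset = dt.to_pyarrow_dataset(partitions=[("year", "=", "2021")])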
Example #5
    def from_path(cls,
                  name: str,
                  path: str,
                  trial,
                  sinks=None,
                  partitioning=None) -> 'ClientFunctionModel':
        """
        Create a command line interface client for a meillionen model

        :param name: name of the model, used for storage
        :param path: local path to the model. Supports finding models
            accessible through the PATH environment variable
        :return: A client to call a meillionen model
        """
        if sinks is None:
            sinks = {}
        if partitioning is None:
            partitioning = dataset.partitioning(pa.schema([]))

        response = client_create_interface_from_cli(path)
        interface = FuncInterfaceClient.from_recordbatch(name=name,
                                                         recordbatch=response)
        sinks = cls.infer_sinks(interface, sinks)
        return cls(interface=interface,
                   path=path,
                   sinks=sinks,
                   trial=trial,
                   partitioning=partitioning)
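The default used above builds a partitioning over an empty schema; stripped of the meillionen wrapper, this is simply the following in plain pyarrow (a minimal sketch):

import pyarrow as pa
import pyarrow.dataset as ds

# A DirectoryPartitioning over no fields, i.e. effectively "no partitioning"
empty_partitioning = ds.partitioning(pa.schema([]))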
Example #6
def test_with_partition_pruning():
    if skip:
        return
    filter_expression = ((ds.field('tip_amount') > 10) &
                         (ds.field('payment_type') > 2) &
                         (ds.field('VendorID') > 1))
    projection_cols = ['payment_type', 'tip_amount', 'VendorID']
    partitioning = ds.partitioning(pa.schema([("payment_type", pa.int32()),
                                              ("VendorID", pa.int32())]),
                                   flavor="hive")

    rados_parquet_dataset = ds.dataset(
        "file:///mnt/cephfs/nyc/",
        partitioning=partitioning,
        format=ds.RadosParquetFileFormat("/etc/ceph/ceph.conf"))
    parquet_dataset = ds.dataset("file:///mnt/cephfs/nyc/",
                                 partitioning=partitioning,
                                 format="parquet")

    rados_parquet_df = rados_parquet_dataset.to_table(
        columns=projection_cols, filter=filter_expression).to_pandas()

    parquet_df = parquet_dataset.to_table(
        columns=projection_cols, filter=filter_expression).to_pandas()

    assert rados_parquet_df.equals(parquet_df)
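A hedged aside on the explicit partition schema used above: declaring payment_type and VendorID as int32 pins the types of the partition columns. With plain flavor="hive" discovery the types are inferred from the directory names and may differ (same ds alias and hypothetical CephFS path as in the test):

inferred = ds.dataset("file:///mnt/cephfs/nyc/", partitioning="hive", format="parquet")
print(inferred.schema.field("payment_type").type)
print(inferred.schema.field("VendorID").type)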
Example #7
def test_fragments_implicit_cast(tempdir):
    # ARROW-8693
    import pyarrow.parquet as pq

    table = pa.table([range(8), [1] * 4 + [2] * 4], names=['col', 'part'])
    path = str(tempdir / "test_parquet_dataset")
    pq.write_to_dataset(table, path, partition_cols=["part"])

    part = ds.partitioning(pa.schema([('part', 'int8')]), flavor="hive")
    dataset = ds.dataset(path, format="parquet", partitioning=part)
    fragments = dataset.get_fragments(filter=ds.field("part") >= 2)
    assert len(list(fragments)) == 1
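Continuing from the dataset and imports above, a hedged sketch of what get_fragments() yields; each fragment carries the partition expression derived from its directory:

for fragment in dataset.get_fragments(filter=ds.field("part") >= 2):
    # e.g. a path ending in ".../part=2/..." and an expression equivalent to (part == 2)
    print(fragment.path, fragment.partition_expression)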
Example #8
def write_parquet(
    fs: fsspec.AbstractFileSystem,
    path: str,
    df: pd.DataFrame,
    partition_cols: Optional[List[str]],
    schema: pa.Schema,
    **kwargs,
):
    """
    Write a single dataframe to parquet.
    """
    # Check partition values are valid before writing to parquet
    mappings = check_partition_columns(df=df, partition_columns=partition_cols)
    df = clean_partition_cols(df=df, mappings=mappings)

    # Dataframe -> pyarrow Table
    table = pa.Table.from_pandas(df, schema=schema)

    if "basename_template" not in kwargs and "ts_init" in df.columns:
        kwargs["basename_template"] = (
            f"{df['ts_init'].iloc[0]}-{df['ts_init'].iloc[-1]}" +
            "-{i}.parquet")

    # Write the actual file
    partitions = (ds.partitioning(
        schema=pa.schema(
            fields=[table.schema.field(c) for c in (partition_cols)]),
        flavor="hive",
    ) if partition_cols else None)
    ds.write_dataset(
        data=table,
        base_dir=path,
        filesystem=fs,
        partitioning=partitions,
        format="parquet",
        **kwargs,
    )
    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(table.schema,
                      f"{path}/_common_metadata",
                      version="2.0",
                      filesystem=fs)

    # Write out any partition columns we had to modify due to filesystem requirements
    if mappings:
        write_partition_column_mappings(fs=fs, path=path, mappings=mappings)
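A hedged usage sketch for write_parquet above (local fsspec filesystem; the dataframe, path, and column names are hypothetical, and the helper functions referenced in the body are assumed to come from the same codebase):

import fsspec
import pandas as pd
import pyarrow as pa

fs = fsspec.filesystem("file")
df = pd.DataFrame({"ts_init": [1, 2, 3], "venue": ["A", "A", "B"], "price": [1.0, 2.0, 3.0]})
write_parquet(
    fs=fs,
    path="/tmp/demo_catalog",                  # hypothetical output directory
    df=df,
    partition_cols=["venue"],
    schema=pa.Schema.from_pandas(df),
)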
Example #9
 def partitioning(self) -> t.Optional[pds.Partitioning]:
     if self.partition_cols is None:
         return None
     _schema = self.config.schema
     if _schema is None:
         raise e.code.CodingError(
             msgs=[
                 f"Ideally by now this should be set by now if not "
                 f"available",
                 f"That is possible on first write where table schema not "
                 f"provided it is estimated while writing and stored in "
                 f"config file"
             ]
         )
     # noinspection PyUnresolvedReferences
     return pds.partitioning(
         _schema.empty_table().select(self.partition_cols).schema
     )
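Stripped of the surrounding config handling, the core of the property above is selecting the partition columns out of an existing schema and building a partitioning from them; a minimal sketch in plain pyarrow (column names are hypothetical):

import pyarrow as pa
import pyarrow.dataset as pds

schema = pa.schema([("year", pa.int16()), ("region", pa.string()), ("value", pa.float64())])
partition_cols = ["year", "region"]            # hypothetical choice of partition columns
part = pds.partitioning(schema.empty_table().select(partition_cols).schema)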
Example #10
def write_deltalake(
    table_or_uri: Union[str, DeltaTable],
    data: Union[pd.DataFrame, pa.Table, pa.RecordBatch,
                Iterable[pa.RecordBatch], RecordBatchReader, ],
    schema: Optional[pa.Schema] = None,
    partition_by: Optional[List[str]] = None,
    filesystem: Optional[pa_fs.FileSystem] = None,
    mode: Literal["error", "append", "overwrite", "ignore"] = "error",
    file_options: Optional[ds.ParquetFileWriteOptions] = None,
    max_open_files: int = 1024,
    max_rows_per_file: int = 0,
    min_rows_per_group: int = 0,
    max_rows_per_group: int = 1048576,
    name: Optional[str] = None,
    description: Optional[str] = None,
    configuration: Optional[Mapping[str, Optional[str]]] = None,
) -> None:
    """Write to a Delta Lake table (Experimental)

    If the table does not already exist, it will be created.

    This function currently only supports protocol version 1. If attempting
    to write to an existing table with a higher min_writer_version, this
    function will raise DeltaTableProtocolError.

    Note that this function does NOT register this table in a data catalog.

    :param table_or_uri: URI of a table or a DeltaTable object.
    :param data: Data to write. If passing iterable, the schema must also be given.
    :param schema: Optional schema to write.
    :param partition_by: List of columns to partition the table by. Only required
        when creating a new table.
    :param filesystem: Optional filesystem to pass to PyArrow. If not provided will
        be inferred from uri.
    :param mode: How to handle existing data. Default is to error if table
        already exists. If 'append', will add new data. If 'overwrite', will
        replace table with new data. If 'ignore', will not write anything if
        table already exists.
    :param file_options: Optional write options for Parquet (ParquetFileWriteOptions).
        Can be provided with defaults using ParquetFileWriteOptions().make_write_options().
        Please refer to https://github.com/apache/arrow/blob/master/python/pyarrow/_dataset_parquet.pyx#L492-L533
        for the list of available options
    :param max_open_files: Limits the maximum number of
        files that can be left open while writing. If an attempt is made to open
        too many files then the least recently used file will be closed.
        If this setting is set too low you may end up fragmenting your
        data into many small files.
    :param max_rows_per_file: Maximum number of rows per file.
        If greater than 0 then this will limit how many rows are placed in any single file.
        Otherwise there will be no limit and one file will be created in each output directory
        unless files need to be closed to respect max_open_files
    :param min_rows_per_group: Minimum number of rows per group. When the value is set,
        the dataset writer will batch incoming data and only write the row groups to the disk
        when sufficient rows have accumulated.
    :param max_rows_per_group: Maximum number of rows per group.
        If the value is set, then the dataset writer may split up large incoming batches into multiple row groups.
        If this value is set, then min_rows_per_group should also be set.
    :param name: User-provided identifier for this table.
    :param description: User-provided description for this table.
    :param configuration: A map containing configuration options for the metadata action.
    """
    if isinstance(data, pd.DataFrame):
        data = pa.Table.from_pandas(data)

    if schema is None:
        if isinstance(data, RecordBatchReader):
            schema = data.schema
        elif isinstance(data, Iterable):
            raise ValueError("You must provide schema if data is Iterable")
        else:
            schema = data.schema

    if isinstance(table_or_uri, str):
        table = try_get_deltatable(table_or_uri)
        table_uri = table_or_uri
    else:
        table = table_or_uri
        table_uri = table._table.table_uri()

    # TODO: Pass through filesystem once it is complete
    # if filesystem is None:
    #    filesystem = pa_fs.PyFileSystem(DeltaStorageHandler(table_uri))

    if table:  # already exists
        if mode == "error":
            raise AssertionError("DeltaTable already exists.")
        elif mode == "ignore":
            return

        current_version = table.version()

        if partition_by:
            assert partition_by == table.metadata().partition_columns

        if table.protocol().min_writer_version > 1:
            raise DeltaTableProtocolError(
                "This table's min_writer_version is "
                f"{table.protocol().min_writer_version}, "
                "but this method only supports version 1.")
    else:  # creating a new table
        current_version = -1

        # TODO: Don't allow writing to non-empty directory
        # Blocked on: Finish filesystem implementation in fs.py
        # assert len(filesystem.get_file_info(pa_fs.FileSelector(table_uri, allow_not_found=True))) == 0

    if partition_by:
        partition_schema = pa.schema(
            [schema.field(name) for name in partition_by])
        partitioning = ds.partitioning(partition_schema, flavor="hive")
    else:
        partitioning = None

    add_actions: List[AddAction] = []

    def visitor(written_file: Any) -> None:
        partition_values = get_partitions_from_path(table_uri,
                                                    written_file.path)
        stats = get_file_stats_from_metadata(written_file.metadata)

        add_actions.append(
            AddAction(
                written_file.path,
                written_file.metadata.serialized_size,
                partition_values,
                int(datetime.now().timestamp()),
                True,
                json.dumps(stats, cls=DeltaJSONEncoder),
            ))

    ds.write_dataset(
        data,
        base_dir=table_uri,
        basename_template=f"{current_version + 1}-{uuid.uuid4()}-{{i}}.parquet",
        format="parquet",
        partitioning=partitioning,
        # It will not accept a schema if using a RBR
        schema=schema if not isinstance(data, RecordBatchReader) else None,
        file_visitor=visitor,
        existing_data_behavior="overwrite_or_ignore",
        file_options=file_options,
        max_open_files=max_open_files,
        max_rows_per_file=max_rows_per_file,
        min_rows_per_group=min_rows_per_group,
        max_rows_per_group=max_rows_per_group,
    )

    if table is None:
        _write_new_deltalake(  # type: ignore[call-arg]
            table_uri,
            schema,
            add_actions,
            mode,
            partition_by or [],
            name,
            description,
            configuration,
        )
    else:
        table._table.create_write_transaction(
            add_actions,
            mode,
            partition_by or [],
        )
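A hedged usage sketch for write_deltalake above (hypothetical path and columns); partition_by is what drives the ds.partitioning(..., flavor="hive") call inside the function:

import pandas as pd

df = pd.DataFrame({"year": [2021, 2021, 2022], "value": [1.0, 2.0, 3.0]})
write_deltalake("/tmp/my_delta_table", df, partition_by=["year"], mode="overwrite")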
Example #11
def write_parquet(
    fs: fsspec.AbstractFileSystem,
    path: str,
    df: pd.DataFrame,
    partition_cols: Optional[List[str]],
    schema: pa.Schema,
    **kwargs,
):
    """
    Write a single dataframe to parquet.
    """
    # Check partition values are valid before writing to parquet
    mappings = check_partition_columns(df=df, partition_columns=partition_cols)
    df = clean_partition_cols(df=df, mappings=mappings)

    # Dataframe -> pyarrow Table
    table = pa.Table.from_pandas(df, schema=schema)

    if "basename_template" not in kwargs and "ts_init" in df.columns:
        kwargs["basename_template"] = (
            f"{df['ts_init'].min()}-{df['ts_init'].max()}" + "-{i}.parquet"
        )

    # Write the actual file
    partitions = (
        ds.partitioning(
            schema=pa.schema(fields=[table.schema.field(c) for c in partition_cols]),
            flavor="hive",
        )
        if partition_cols
        else None
    )
    if pa.__version__ >= "6.0.0":
        kwargs.update(existing_data_behavior="overwrite_or_ignore")
    files = set(fs.glob(f"{path}/**"))
    ds.write_dataset(
        data=table,
        base_dir=path,
        filesystem=fs,
        partitioning=partitions,
        format="parquet",
        **kwargs,
    )

    # Ensure data written by write_dataset is sorted
    new_files = set(fs.glob(f"{path}/**/*.parquet")) - files
    del df
    for fn in new_files:
        ndf = pd.read_parquet(fs.open(fn))
        # assert ndf.shape[0] == shape
        if "ts_init" in ndf.columns:
            ndf = ndf.sort_values("ts_init").reset_index(drop=True)
        pq.write_table(
            table=pa.Table.from_pandas(ndf),
            where=fn,
            filesystem=fs,
        )

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(table.schema, f"{path}/_common_metadata", version="2.6", filesystem=fs)

    # Write out any partition columns we had to modify due to filesystem requirements
    if mappings:
        write_partition_column_mappings(fs=fs, path=path, mappings=mappings)
Example #12
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

format_ = ds.RadosParquetFileFormat("/etc/ceph/ceph.conf", "cephfs-data0")
partitioning_ = ds.partitioning(pa.schema([("payment_type", pa.int32()),
                                           ("VendorID", pa.int32())]),
                                flavor="hive")
dataset_ = ds.dataset("file:///mnt/cephfs/nyc",
                      partitioning=partitioning_,
                      format=format_)
print(
    dataset_.to_table(columns=['total_amount', 'DOLocationID', 'payment_type'],
                      filter=(ds.field('payment_type') > 2)).to_pandas())
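For comparison, a hedged variant of the same query using the stock parquet format, which needs no Rados plugin (same hypothetical CephFS mount and partitioning_ as above):

plain_dataset = ds.dataset("file:///mnt/cephfs/nyc",
                           partitioning=partitioning_,
                           format="parquet")
print(
    plain_dataset.to_table(columns=['total_amount', 'DOLocationID', 'payment_type'],
                           filter=(ds.field('payment_type') > 2)).to_pandas())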