Example #1
    def _partition_data(self, partition_on):
        existing_indices, base_label = decode_key("uuid/table/{}".format(
            self.label))[2:]
        dct = dict()
        df = self.data

        # Check that data sizes do not change. This might happen if the
        # groupby below drops data, e.g. nulls
        size_after = 0
        size_before = len(df)

        # Implementation from pyarrow
        # See https://github.com/apache/arrow/blob/b33dfd9c6bd800308bb1619b237dbf24dea159be/python/pyarrow/parquet.py#L1030  # noqa: E501

        # column sanity checks
        data_cols = set(df.columns).difference(partition_on)
        missing_po_cols = set(partition_on).difference(df.columns)
        if missing_po_cols:
            raise ValueError("Partition column(s) missing: {}".format(
                ", ".join(sorted(missing_po_cols))))
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        # To stay aligned with open source tooling, we drop the index columns and
        # recreate them upon reading, as fastparquet and pyarrow do
        partition_keys = [df[col] for col in partition_on]

        # # The handling of empty dfs is not part of the arrow implementation
        # if df.empty:
        #     return {}

        data_df = df.drop(partition_on, axis="columns")
        for value, group in data_df.groupby(by=partition_keys, sort=False):
            partitioning_info = []
            if pd.api.types.is_scalar(value):
                value = [value]
            if existing_indices:
                partitioning_info.extend(quote_indices(existing_indices))
            partitioning_info.extend(quote_indices(zip(partition_on, value)))
            partitioning_info.append(base_label)
            new_label = "/".join(partitioning_info)

            # groupby yields each partition label exactly once, so a plain
            # assignment is sufficient here
            dct[new_label] = group
            size_after += len(group)

        if size_before != size_after:
            raise ValueError(
                f"Original dataframe size ({size_before} rows) does not "
                f"match new dataframe size ({size_after} rows). "
                f"Hint: you may see this if you are trying to use `partition_on` on a column with null values."
            )

        return dct
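
For illustration, here is a minimal, self-contained sketch of the mapping this method builds: a small dataframe is grouped on one partition column and each group is stored under a hive-style label. The base label "data.parquet" is made up for the sketch, and unlike quote_indices the label parts below are not percent-encoded.

import pandas as pd

df = pd.DataFrame(
    {"location": ["Muenchen", "Muenchen", "Berlin"], "sales": [1, 2, 3]}
)
partition_on = ["location"]
base_label = "data.parquet"  # made-up base label for this sketch

partitions = {}
data_df = df.drop(partition_on, axis="columns")
for value, group in data_df.groupby(by=[df[c] for c in partition_on], sort=False):
    if pd.api.types.is_scalar(value):
        value = [value]
    # quote_indices would additionally percent-encode the column/value pairs.
    label = "/".join(
        ["{}={}".format(col, val) for col, val in zip(partition_on, value)]
        + [base_label]
    )
    partitions[label] = group

print(sorted(partitions))
# ['location=Berlin/data.parquet', 'location=Muenchen/data.parquet']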
Example #2
from typing import List, Tuple

# quote_indices is imported from the surrounding kartothek module (not shown here).


def create_partition_key(
    dataset_uuid: str,
    table: str,
    index_values: List[Tuple[str, str]],
    filename: str = "data",
) -> str:
    """
    Create partition key for a kartothek partition

    Parameters
    ----------
    dataset_uuid
    table
    index_values
    filename

    Example:
        create_partition_key('my-uuid', 'testtable',
            [('index1', 'value1'), ('index2', 'value2')])

        returns 'my-uuid/testtable/index1=value1/index2=value2/data'
    """
    key_components = [dataset_uuid, table]
    index_path = quote_indices(index_values)
    key_components.extend(index_path)
    key_components.append(filename)
    key = "/".join(key_components)
    return key
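
A quick usage check of the documented example, assuming quote_indices is available in scope (its import is not shown in the snippet above):

key = create_partition_key(
    "my-uuid",
    "testtable",
    [("index1", "value1"), ("index2", "value2")],
)
assert key == "my-uuid/testtable/index1=value1/index2=value2/data"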
Example #3
def test_index_quote_roundtrip():
    # Quoting then unquoting must round-trip: non-string keys are stringified
    # and bytes values are decoded to str in the process.
    indices = [
        (1, b"Muenchen"),
        ("location", b"Muenchen"),
        ("location", "München"),
        ("product", "å\\ øß"),
    ]
    expected = [
        ("1", "Muenchen"),
        ("location", "Muenchen"),
        ("location", "München"),
        ("product", "å\\ øß"),
    ]
    assert expected == unquote_indices(quote_indices(indices))
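
The expected values show what the quoting helpers are responsible for: keys and values come back as strings (bytes decoded as UTF-8, non-string keys stringified), and characters such as "=", "/" or spaces must survive the round trip. Below is a minimal sketch of a quote/unquote pair with that behaviour, using percent-encoding; it is only an illustration, not kartothek's actual implementation.

import urllib.parse


def quote_indices_sketch(indices):
    # Coerce keys and values to str (bytes decoded as UTF-8), then
    # percent-encode so "=" and "/" cannot break the "key=value" layout.
    def _as_str(obj):
        return obj.decode("utf-8") if isinstance(obj, bytes) else str(obj)

    return [
        "{}={}".format(
            urllib.parse.quote(_as_str(k), safe=""),
            urllib.parse.quote(_as_str(v), safe=""),
        )
        for k, v in indices
    ]


def unquote_indices_sketch(quoted):
    # Split each "key=value" segment once and undo the percent-encoding.
    return [
        tuple(urllib.parse.unquote(part) for part in segment.split("=", 1))
        for segment in quoted
    ]


indices = [(1, b"Muenchen"), ("product", "å\\ øß")]
assert unquote_indices_sketch(quote_indices_sketch(indices)) == [
    ("1", "Muenchen"),
    ("product", "å\\ øß"),
]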
Example #4
def _get_partition_label(indices, filename, metadata_version):
    # Join the quoted index pairs with the filename, stripped of its parquet suffix.
    return "/".join(
        quote_indices(indices) + [filename.replace(PARQUET_FILE_SUFFIX, "")])
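
A hypothetical call for reference; it assumes PARQUET_FILE_SUFFIX is ".parquet" and that quote_indices is in scope, both of which are assumptions about the surrounding test module:

# Hypothetical usage, not taken from the test suite.
label = _get_partition_label(
    indices=[("location", "Muenchen")],
    filename="part-0.parquet",
    metadata_version=4,
)
# With PARQUET_FILE_SUFFIX == ".parquet" this yields "location=Muenchen/part-0".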