Example no. 1
def _get_partition_keys_from_partitions(partitions):
    if len(partitions):
        part = next(iter(partitions.values()))
        files_dct = part["files"]
        if files_dct:
            key = next(iter(files_dct.values()))
            _, _, indices, _ = decode_key(key)
            if indices:
                return [tup[0] for tup in indices]
    return None
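
For context, a minimal sketch of the data shapes this helper expects, assuming the storage-key layout `<uuid>/<table>/<column>=<value>/<label>.parquet` documented in Example no. 3. The `_decode_key_sketch` helper below is a simplified, hypothetical stand-in for `decode_key`, not the library function; the partition and column names are made up.

# Hypothetical, simplified stand-in for decode_key (illustration only).
def _decode_key_sketch(key):
    dataset_uuid, table, *index_parts, file_ = key.split("/")
    indices = [tuple(p.split("=", 1)) for p in index_parts if "=" in p]
    return dataset_uuid, table, indices, file_

partitions = {
    "location=Berlin/part_0": {
        "files": {"table": "uuid/table/location=Berlin/part_0.parquet"}
    }
}

part = next(iter(partitions.values()))
key = next(iter(part["files"].values()))
_, _, indices, _ = _decode_key_sketch(key)
print([col for col, _ in indices])  # ['location']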
Example no. 2
    def _partition_data(self, partition_on):
        existing_indices, base_label = decode_key("uuid/table/{}".format(
            self.label))[2:]
        dct = dict()
        df = self.data

        # Check that data sizes do not change. This might happen if the
        # groupby below drops data, e.g. nulls
        size_after = 0
        size_before = len(df)

        # Implementation from pyarrow
        # See https://github.com/apache/arrow/blob/b33dfd9c6bd800308bb1619b237dbf24dea159be/python/pyarrow/parquet.py#L1030  # noqa: E501

        # column sanity checks
        data_cols = set(df.columns).difference(partition_on)
        missing_po_cols = set(partition_on).difference(df.columns)
        if missing_po_cols:
            raise ValueError("Partition column(s) missing: {}".format(
                ", ".join(sorted(missing_po_cols))))
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        # To be aligned with open source tooling we drop the index columns and recreate
        # them upon reading, as fastparquet and pyarrow do
        partition_keys = [df[col] for col in partition_on]

        # # The handling of empty dfs is not part of the arrow implementation
        # if df.empty:
        #     return {}

        data_df = df.drop(partition_on, axis="columns")
        for value, group in data_df.groupby(by=partition_keys, sort=False):
            partitioning_info = []
            if pd.api.types.is_scalar(value):
                value = [value]
            if existing_indices:
                partitioning_info.extend(quote_indices(existing_indices))
            partitioning_info.extend(quote_indices(zip(partition_on, value)))
            partitioning_info.append(base_label)
            new_label = "/".join(partitioning_info)

            dct[new_label] = group
            size_after += len(group)

        if size_before != size_after:
            raise ValueError(
                f"Original dataframe size ({size_before} rows) does not "
                f"match new dataframe size ({size_after} rows). "
                f"Hint: you may see this if you are trying to use `partition_on` on a column with null values."
            )

        return dct
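
A minimal, self-contained sketch of the grouping loop above, assuming a single partition column, skipping `quote_indices` (i.e. no URL-quoting of the values) and the existing-indices prefix; the column names and `part_0` label are made up for illustration.

import pandas as pd

df = pd.DataFrame({"location": ["Berlin", "Berlin", "Paris"], "value": [1, 2, 3]})
partition_on = ["location"]
base_label = "part_0"

dct = {}
partition_keys = [df[col] for col in partition_on]
data_df = df.drop(partition_on, axis="columns")
for value, group in data_df.groupby(by=partition_keys, sort=False):
    # Older pandas yields a scalar group key for a single grouper.
    if pd.api.types.is_scalar(value):
        value = [value]
    prefix = "/".join("{}={}".format(col, val) for col, val in zip(partition_on, value))
    dct["{}/{}".format(prefix, base_label)] = group

print(sorted(dct))  # ['location=Berlin/part_0', 'location=Paris/part_0']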
Example no. 3
def _load_partitions_from_filenames(store, storage_keys, metadata_version):
    partitions = defaultdict(_get_empty_partition)
    depth_indices = None
    for key in storage_keys:
        dataset_uuid, table, indices, file_ = decode_key(key)
        if file_ is not None and file_.endswith(PARQUET_FILE_SUFFIX):
            # valid key example:
            # <uuid>/<table>/<column_0>=<value_0>/.../<column_n>=<value_n>/part_label.parquet
            depth_indices = _check_index_depth(indices, depth_indices)
            partition_label = _get_partition_label(indices, file_, metadata_version)
            partitions[partition_label]["files"][table] = key
    return partitions
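
A hypothetical sketch of the mapping this function produces, again with a simplified inline decoder instead of `decode_key` and without the `_check_index_depth` / metadata-version handling; key values are invented.

from collections import defaultdict

storage_keys = [
    "uuid/table/location=Berlin/part_0.parquet",
    "uuid/table/location=Paris/part_0.parquet",
]

partitions = defaultdict(lambda: {"files": {}})
for key in storage_keys:
    dataset_uuid, table, *index_parts, file_ = key.split("/")
    if file_.endswith(".parquet"):
        # Partition label: the index path plus the file name without suffix.
        label = "/".join(index_parts + [file_[: -len(".parquet")]])
        partitions[label]["files"][table] = key

print(dict(partitions))
# {'location=Berlin/part_0': {'files': {'table': 'uuid/table/location=Berlin/part_0.parquet'}},
#  'location=Paris/part_0': {'files': {'table': 'uuid/table/location=Paris/part_0.parquet'}}}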
Example no. 4
def _construct_dynamic_index_from_partitions(
    partitions: Dict[str, Partition],
    table_meta: TableMetaType,
    default_dtype: pa.DataType,
    partition_keys: List[str],
) -> Dict[str, PartitionIndex]:
    if len(partitions) == 0:
        return _empty_partition_indices(partition_keys, table_meta, default_dtype)

    def _get_files(part):
        if isinstance(part, dict):
            return part["files"]
        else:
            return part.files

    # We exploit the fact that all tables are partitioned equally.
    first_partition = next(
        iter(partitions.values())
    )  # partitions is NOT empty here, see check above
    first_partition_files = _get_files(first_partition)
    if not first_partition_files:
        return _empty_partition_indices(partition_keys, table_meta, default_dtype)
    key_table = next(iter(first_partition_files.keys()))
    storage_keys = (
        (key, _get_files(part)[key_table]) for key, part in partitions.items()
    )

    _key_indices: Dict[str, Dict[str, Set[str]]] = defaultdict(_get_empty_index)
    depth_indices = None
    for partition_label, key in storage_keys:
        _, _, indices, file_ = decode_key(key)
        if (
            file_ is not None
            and key.endswith(PARQUET_FILE_SUFFIX)
            and not key.endswith(EXTERNAL_INDEX_SUFFIX)
        ):
            depth_indices = _check_index_depth(indices, depth_indices)
            for column, value in indices:
                _key_indices[column][value].add(partition_label)
    new_indices = {}
    for col, index_dct in _key_indices.items():
        arrow_type = _get_type_from_meta(table_meta, col, default_dtype)

        # convert defaultdicts into dicts
        new_indices[col] = PartitionIndex(
            column=col,
            index_dct={k1: list(v1) for k1, v1 in index_dct.items()},
            dtype=arrow_type,
        )
    return new_indices
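
The inverted index built here maps column -> value -> set of partition labels. A small self-contained sketch of that structure, using plain dicts in place of `PartitionIndex` and the same simplified key decoding as the earlier sketches:

from collections import defaultdict

partitions = {
    "location=Berlin/part_0": {"files": {"table": "uuid/table/location=Berlin/part_0.parquet"}},
    "location=Paris/part_1": {"files": {"table": "uuid/table/location=Paris/part_1.parquet"}},
}

key_indices = defaultdict(lambda: defaultdict(set))
for label, part in partitions.items():
    key = part["files"]["table"]
    _, _, *index_parts, _ = key.split("/")
    for piece in index_parts:
        column, value = piece.split("=", 1)
        key_indices[column][value].add(label)

print({col: {val: sorted(labels) for val, labels in dct.items()}
       for col, dct in key_indices.items()})
# {'location': {'Berlin': ['location=Berlin/part_0'], 'Paris': ['location=Paris/part_1']}}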
Example no. 5
    def load_dataframes(
        self,
        store: KeyValueStore,
        columns: Optional[Sequence[str]] = None,
        predicate_pushdown_to_io: bool = True,
        categoricals: Optional[Sequence[str]] = None,
        dates_as_object: bool = True,
        predicates: PredicatesType = None,
    ) -> "MetaPartition":
        """
        Load the dataframes of the partitions from store into memory.

        Parameters
        ----------
        tables
            If a list is supplied, only the given tables of the partition are
            loaded. If the given table does not exist it is ignored.

            Examples

            .. code::

                >>> part = MetaPartition(
                ...     label='part_label'
                ...     files={
                ...         'core': 'core_key_in_store',
                ...         'helper': 'helper_key_in_store'
                ...     }
                ...  )
                >>> part.data
                    {}
                >>> part = part.load_dataframes(store, ['core'])
                >>> part.data
                    {
                        'core': pd.DataFrame()
                    }

        """

        if categoricals is None:
            categoricals = []
        if not dates_as_object:
            warnings.warn(
                "The argument `date_as_object` is set to False. This argument will be deprecated and the future behaviour will be as if the paramere was set to `True`. Please migrate your code accordingly ahead of time.",
                DeprecationWarning,
            )

        LOGGER.debug("Loading internal dataframes of %s", self.label)
        if not self.file:
            # This used to raise, but the specs do not require it, so this is simply a no-op
            LOGGER.debug("Partition %s is empty and has no data.", self.label)
            return self
        predicates = _combine_predicates(predicates, self.logical_conjunction)
        predicates = _predicates_to_named(predicates)

        dataset_uuid, _, indices, _ = decode_key(self.file)

        # In case the columns only refer to the partition indices, we need to load at least a single column to
        # determine the length of the required dataframe.
        table_columns_to_io = columns

        filtered_predicates = predicates

        self = self.load_schema(dataset_uuid=dataset_uuid, store=store)

        # Filter predicates that would apply to this partition and remove the partition columns
        if predicates:
            # Check if there are predicates that match to the partition columns.
            # For these we need to check if the partition columns already falsify
            # the condition.
            #
            # We separate these predicates into their index and their Parquet part.
            (
                split_predicates,
                has_index_condition,
            ) = self._split_predicates_in_index_and_content(predicates)

            if has_index_condition:
                filtered_predicates = self._apply_partition_key_predicates(
                    indices, split_predicates
                )
            else:
                filtered_predicates = [
                    pred.content_part for pred in split_predicates
                ]

        # Remove partition_keys from table_columns_to_io
        if self.partition_keys and table_columns_to_io is not None:
            keys_to_remove = set(
                self.partition_keys) & set(table_columns_to_io)
            # This is done to not change the ordering of the list
            table_columns_to_io = [
                c for c in table_columns_to_io if c not in keys_to_remove
            ]

        start = time.time()
        df = DataFrameSerializer.restore_dataframe(
            key=self.file,
            store=store,
            columns=table_columns_to_io,
            categories=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            predicates=filtered_predicates,
            date_as_object=dates_as_object,
        )
        LOGGER.debug("Loaded dataframe %s in %s seconds.", self.file,
                     time.time() - start)
        # For metadata version >= 4, parse the index columns and add them back to the dataframe

        df = self._reconstruct_index_columns(
            df=df,
            key_indices=indices,
            columns=columns,
            categories=categoricals,
            date_as_object=dates_as_object,
        )

        df.columns = df.columns.map(ensure_string_type)
        if columns is not None:
            # TODO: When the write-path ensures that all partitions have the same column set, this check can be
            #       moved before `DataFrameSerializer.restore_dataframe`. At the position of the current check we
            #       may want to double check the columns of the loaded DF and raise an exception indicating an
            #       inconsistent dataset state instead.
            missing_cols = set(columns).difference(df.columns)
            if missing_cols:
                raise ValueError(
                    "Columns cannot be found in stored dataframe: {}".format(
                        ", ".join(sorted(missing_cols))))

            if list(df.columns) != columns:
                df = df.reindex(columns=columns, copy=False)

        return self.copy(data=df)
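
Conceptually, the partition columns are not stored in the Parquet file itself (see the comment in Example no. 2); `_reconstruct_index_columns` adds them back from the key indices decoded above. A minimal sketch of that idea, ignoring categoricals, dtypes and date handling, with invented column names and values:

import pandas as pd

df = pd.DataFrame({"value": [1, 2]})      # what restore_dataframe returned
key_indices = [("location", "Berlin")]    # decoded from the storage key
for column, value in key_indices:
    # Re-add each partition column as a constant column for this partition.
    df[column] = value
print(df)  # 'value' comes from the file, 'location' from the key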