Example #1
    def as_flat_series(
        self,
        compact: bool = False,
        partitions_as_index: bool = False,
        date_as_object: bool = False,
        predicates: PredicatesType = None,
    ):
        """
        Convert the Index object to a pandas.Series

        Parameters
        ----------
        compact:
            If True, ensures that the index will be unique. If there are
            multiple partition values per index value, these are compacted
            into a list (see the Examples section).
        partitions_as_index:
            If True, the relation between index values and partitions is
            inverted in the output: partition labels are used as the index
            and the index values are mapped to them.
        date_as_object:
            Cast dates to objects when converting to pandas instead of
            using the datetime64[ns] dtype.
        predicates:
            A list of predicates. If a literal within the provided predicates
            references a column which is not part of this index, this literal
            is interpreted as True.

        Examples
        --------

        >>> index1 = ExplicitSecondaryIndex(
        ...     column="col", index_dct={1: ["part_1", "part_2"]}, dtype=pa.int64()
        ... )
        >>> index1.as_flat_series()
        col
        1    part_1
        1    part_2
        >>> index1.as_flat_series(compact=True)
        col
        1    [part_1, part_2]
        >>> index1.as_flat_series(partitions_as_index=True)
        partition
        part_1    1
        part_2    1

        """
        check_predicates(predicates)
        table = _index_dct_to_table(self.index_dct,
                                    column=self.column,
                                    dtype=self.dtype)
        df = table.to_pandas(date_as_object=date_as_object)

        if predicates is not None:
            # If any conjunction contains no reference to the index column,
            # the entire predicates expression evaluates to True and the
            # dataframe does not need to be filtered at all. The for/else
            # below only applies the filter if no such conjunction exists.
            for conjunction in predicates:
                new_conjunction = filter_predicates_by_column([conjunction],
                                                              [self.column])
                if new_conjunction is None:
                    break
            else:
                filtered_predicates = filter_predicates_by_column(
                    predicates, [self.column])
                df = filter_df_from_predicates(df,
                                               predicates=filtered_predicates)

        result_column = _PARTITION_COLUMN_NAME
        # The index dictionary translates directly to this layout:
        # value: [part_1, part_2, ...]
        if compact and not partitions_as_index:
            return df.set_index(self.column)[result_column]

        # In all other circumstances we need a flat series first
        # value: part_1
        # value: part_2
        # value2: part_1
        if partitions_as_index or not compact:
            if len(df) == 0:
                keys = np.array([],
                                dtype=df[_PARTITION_COLUMN_NAME].values.dtype)
            else:
                keys = np.concatenate(df[_PARTITION_COLUMN_NAME].values)

            lengths = df[_PARTITION_COLUMN_NAME].apply(len).values
            lengths = lengths.astype(int)
            values_index = np.repeat(np.arange(len(df)), lengths)
            values = df[self.column].values[values_index]

            df = pd.DataFrame({
                _PARTITION_COLUMN_NAME: keys,
                self.column: values
            })

        # If neither inverted nor compact, the flat frame above only needs
        # its index set; otherwise invert and optionally re-compact below.
        if partitions_as_index:
            result_index = _PARTITION_COLUMN_NAME
            if compact:
                df = df.groupby(
                    df[result_index]).apply(lambda x: x[self.column].tolist())
                df.name = self.column
            else:
                df = df.set_index(result_index)[self.column]
        else:
            df = df.set_index(self.column)[_PARTITION_COLUMN_NAME]
        return df
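
The flattening above rests on a compact NumPy idiom: np.concatenate merges the per-value partition lists into one flat array, while np.repeat duplicates each index value once per partition it maps to. Below is a minimal standalone sketch of that idiom; the frame mirrors the docstring example, and the data is made up:

import numpy as np
import pandas as pd

# One row per index value; the partition column holds lists of labels.
df = pd.DataFrame({
    "partition": [["part_1", "part_2"], ["part_1"]],
    "col": [1, 2],
})

# Flatten all partition lists into a single array ...
keys = np.concatenate(df["partition"].values)
# ... and repeat each index value once per partition label it maps to.
lengths = df["partition"].apply(len).values.astype(int)
values = df["col"].values[np.repeat(np.arange(len(df)), lengths)]

flat = pd.DataFrame({"partition": keys, "col": values})
# flat:
#   partition  col
# 0    part_1    1
# 1    part_2    1
# 2    part_1    2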
Example #2
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    label_filter: Optional[Callable] = None,
    concat_partitions_on_primary_index: bool = False,
    predicates: PredicatesType = None,
    store: Optional[StoreInput] = None,
    dispatch_by: Optional[List[str]] = None,
    dispatch_metadata: bool = False,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """

    :meta private:
    """

    if dispatch_by is not None and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. "
        )
    if concat_partitions_on_primary_index:
        dispatch_by = dataset_factory.partition_keys

    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if label_filter:
        base_df = base_df[base_df.index.map(label_filter)]

    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [("", base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata
                        if dispatch_metadata
                        else None,
                        indices=indices_to_dispatch if dispatch_metadata else None,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata
                if dispatch_metadata
                else None,
                indices=indices_to_dispatch if dispatch_metadata else None,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )
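
A minimal usage sketch, under assumptions: store_factory stands in for a callable returning a store, the dataset is assumed to have a secondary index on a column named "country", and the attribute access on the yielded MetaPartitions mirrors the constructor arguments above:

from kartothek.core.factory import DatasetFactory

# store_factory and the dataset layout are placeholders for this sketch.
factory = DatasetFactory(dataset_uuid="my_dataset", store_factory=store_factory)

# With dispatch_by, the generator yields one list of MetaPartitions per
# distinct "country" value; each MetaPartition carries the matching
# logical_conjunction, e.g. [("country", "==", "DE")].
for mps in dispatch_metapartitions_from_factory(factory, dispatch_by=["country"]):
    for mp in mps:
        print(mp.label, mp.logical_conjunction)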
Example #3
def dispatch_metapartitions_from_factory(
    dataset_factory,
    label_filter=None,
    concat_partitions_on_primary_index=False,
    predicates=None,
    store=None,
    dispatch_by=None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    if not callable(dataset_factory) and not isinstance(
            dataset_factory, DatasetFactory):
        raise TypeError("Need to supply a dataset factory!")

    if dispatch_by and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. ")
    if concat_partitions_on_primary_index:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. Use `dispatch_by=dataset_factory.partition_keys` to achieve the same behavior instead.",
            DeprecationWarning,
        )
        dispatch_by = dataset_factory.partition_keys

    if dispatch_by and not set(dispatch_by).issubset(
            set(dataset_factory.index_columns)):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(
            dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(list(index_cols),
                                                       predicates=predicates)

    if label_filter:
        base_df = base_df[base_df.index.map(label_filter)]

    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by:
        base_df = cast(pd.DataFrame, base_df)

        # Group the resulting MetaPartitions by partition keys or a subset of those keys
        merged_partitions = base_df.groupby(by=list(dispatch_by),
                                            sort=False,
                                            as_index=False)
        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name, )
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name))
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata,
                        indices=indices_to_dispatch,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    ))
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata,
                indices=indices_to_dispatch,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )
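
This older variant still accepts the deprecated flag and merely warns. A sketch of the migration the warning message asks for, reusing the hypothetical factory from the previous sketch:

# Deprecated: emits a DeprecationWarning and internally rewrites the call
# to dispatch_by=dataset_factory.partition_keys.
mps = dispatch_metapartitions_from_factory(
    factory, concat_partitions_on_primary_index=True
)

# Equivalent, forward-compatible spelling:
mps = dispatch_metapartitions_from_factory(
    factory, dispatch_by=factory.partition_keys
)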
Example #4
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """

    :meta private:
    """

    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [("", base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        metadata_version=dataset_factory.metadata_version,
                        schema=dataset_factory.schema,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                        table_name=dataset_factory.table_name,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                metadata_version=dataset_factory.metadata_version,
                schema=dataset_factory.schema,
                partition_keys=dataset_factory.partition_keys,
                table_name=dataset_factory.table_name,
            )
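
The logical_conjunction attached to each group is nothing more than the groupby key zipped into predicate triples. A small self-contained illustration of that construction (values made up):

dispatch_by = ["country", "date"]
group_name = ("DE", "2021-01-01")  # one groupby key, for illustration only

logical_conjunction = list(
    zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
)
# [('country', '==', 'DE'), ('date', '==', '2021-01-01')]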