Esempio n. 1
0
def test_filter_df_from_predicates_or_predicates():
    """Rows matching any one of several single-literal conjunctions are kept."""
    df = pd.DataFrame(
        {"A": range(10), "B": ["A", "B"] * 5, "C": range(-10, 0)}
    )

    # Case 1: the third conjunction matches nothing, so only the rows with
    # A < 3 or A > 5 survive.
    kept_rows = [0, 1, 2, 6, 7, 8, 9]
    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 6, 7, 8, 9],
            "B": ["A", "B", "A", "A", "B", "A", "B"],
            "C": [-10, -9, -8, -4, -3, -2, -1],
        },
        index=kept_rows,
    )
    actual = filter_df_from_predicates(
        df, [[("A", "<", 3)], [("A", ">", 5)], [("B", "==", "non-existent")]]
    )
    pdt.assert_frame_equal(actual, expected)

    # Case 2: the third conjunction additionally lets every B == "B" row
    # through. Only the row with A == 4 (where B == "A") is filtered out.
    kept_rows = [0, 1, 2, 3, 5, 6, 7, 8, 9]
    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 3, 5, 6, 7, 8, 9],
            "B": ["A", "B", "A", "B", "B", "A", "B", "A", "B"],
            "C": [-10, -9, -8, -7, -5, -4, -3, -2, -1],
        },
        index=kept_rows,
    )
    actual = filter_df_from_predicates(
        df, [[("A", "<", 3)], [("A", ">", 5)], [("B", "==", "B")]]
    )
    pdt.assert_frame_equal(actual, expected)
Esempio n. 2
0
    def _apply_partition_key_predicates(self, indices, split_predicates):
        """
        Evaluate the partition-key part of every conjunction against the
        partition-key values and collect the parts that still have to be
        pushed to the DataFrame serialiser.

        Parameters
        ----------
        indices:
            Iterable of ``(column, value)`` pairs giving the partition-key
            values.
        split_predicates:
            Iterable of conjunctions, each exposing a ``key_part`` (literals
            on partition-key columns) and a ``content_part`` (the remaining
            literals).

        Returns
        -------
        list or None
            The ``content_part`` of every conjunction whose ``key_part`` is
            empty or matches the partition-key values. ``None`` is returned
            as soon as such a conjunction has an empty ``content_part``,
            because then all data must be loaded.
        """
        schema = self.schema

        # Build a one-row DataFrame holding the partition-key values, typed
        # according to the schema.
        key_columns = {}
        for column, raw_value in indices:
            pa_dtype = schema[schema.get_field_index(column)].type
            value = IndexBase.normalize_value(pa_dtype, raw_value)
            if not pa.types.is_date(pa_dtype):
                key_columns[column] = pd.Series(
                    [value], dtype=pa_dtype.to_pandas_dtype()
                )
                continue
            # Date columns are stored as `datetime.date` objects.
            timestamps = pd.to_datetime([value], infer_datetime_format=True)
            key_columns[column] = pd.Series(timestamps).dt.date
        index_df = pd.DataFrame(key_columns)

        # We assume that indices on the partition level have been filtered out
        # already in `dispatch_metapartitions`. The result should only contain
        # predicates that can be evaluated on parquet level.
        remaining = []
        for conjunction in split_predicates:
            key_part = conjunction.key_part
            if len(key_part) != 0:
                matching = filter_df_from_predicates(
                    index_df, [key_part], strict_date_types=True
                )
                if len(matching) == 0:
                    # The key part rules this conjunction out entirely.
                    continue

            content_part = conjunction.content_part
            if len(content_part) == 0:
                # A condition applies to the whole DataFrame, so we need to
                # load all data.
                return None
            remaining.append(content_part)
        return remaining
Esempio n. 3
0
def test_filter_df_from_predicates_empty_in(value):
    """An ``in`` predicate with an empty list of candidates selects no rows."""
    frame = pd.DataFrame({"A": [value]})
    frame["B"] = range(len(frame))

    result = filter_df_from_predicates(frame, [[("A", "in", [])]])

    # An empty positional selection keeps the columns but drops every row.
    no_rows = frame.iloc[[]]
    pdt.assert_frame_equal(result, no_rows, check_categorical=False)
Esempio n. 4
0
def test_filter_df_from_predicates_bool(op, col):
    """
    Compare `filter_df_from_predicates` with plain pandas boolean indexing
    for a single ``(col, op, True)`` literal on boolean data.

    `op` and `col` are presumably supplied by pytest parametrization that is
    outside this snippet.
    """
    df = pd.DataFrame(
        {"A": [True, False] * 5, "B": [True, False, None, True, False] * 2}
    )

    value = True
    predicates = [[(col, op, value)]]
    actual = filter_df_from_predicates(df, predicates)
    # `pd.api.types.is_categorical` was removed in pandas 2.0; checking the
    # dtype directly works on old and new pandas versions alike.
    if isinstance(df[col].dtype, pd.CategoricalDtype):
        # Make the categories ordered before building the reference result.
        df[col] = df[col].astype(df[col].cat.as_ordered().dtype)
    # NOTE: `eval` only ever sees the test's own `op` string here; never feed
    # it externally controlled input.
    expected = eval(f"df[df[col] {op} value]")
    pdt.assert_frame_equal(actual, expected, check_categorical=False)
Esempio n. 5
0
def test_filter_df_from_predicates(op, data, value):
    """
    Compare `filter_df_from_predicates` with plain pandas boolean indexing
    for a single ``("A", op, value)`` literal.

    `op`, `data` and `value` are presumably supplied by pytest
    parametrization that is outside this snippet.
    """
    df = pd.DataFrame({"A": data})
    df["B"] = range(len(df))

    predicates = [[("A", op, value)]]
    actual = filter_df_from_predicates(df, predicates)
    # `pd.api.types.is_categorical` was removed in pandas 2.0; checking the
    # dtype directly works on old and new pandas versions alike.
    if isinstance(df["A"].dtype, pd.CategoricalDtype):
        # Make the categories ordered before building the reference result.
        df["A"] = df["A"].astype(df["A"].cat.as_ordered().dtype)
    if isinstance(value, datetime.date) and (df["A"].dtype
                                             == "datetime64[ns]"):
        # silence pandas warning
        value = pd.Timestamp(value)
    # NOTE: `eval` only ever sees the test's own `op` string here; never feed
    # it externally controlled input.
    expected = eval(f"df[df['A'] {op} value]")
    pdt.assert_frame_equal(actual, expected, check_categorical=False)
Esempio n. 6
0
def test_filter_df_from_predicates(op, col):
    """
    Compare `filter_df_from_predicates` with plain pandas boolean indexing
    for a single ``(col, op, value)`` literal on integer, string,
    categorical, date and datetime columns.

    The comparison value is taken from row 4 of the tested column. `op` and
    `col` are presumably supplied by pytest parametrization that is outside
    this snippet.
    """
    df = pd.DataFrame(
        {
            "A": range(10),
            "B": ["A", "B"] * 5,
            "C": pd.Series(["X", "Y"] * 5).astype("category"),
            "D": pd.Series([datetime.date(2019, 1, 1), datetime.date(2019, 1, 2)] * 5),
            "E": [datetime.datetime(2019, 1, 1), datetime.datetime(2019, 1, 2)] * 5,
        }
    )

    ix = 4
    value = df[col][ix]
    predicates = [[(col, op, value)]]
    actual = filter_df_from_predicates(df, predicates)
    # `pd.api.types.is_categorical` was removed in pandas 2.0; checking the
    # dtype directly works on old and new pandas versions alike.
    if isinstance(df[col].dtype, pd.CategoricalDtype):
        # Make the categories ordered before building the reference result.
        df[col] = df[col].astype(df[col].cat.as_ordered().dtype)
    # NOTE: `eval` only ever sees the test's own `op` string here; never feed
    # it externally controlled input.
    expected = eval(f"df[df[col] {op} value]")
    pdt.assert_frame_equal(actual, expected, check_categorical=False)
Esempio n. 7
0
    def filter_df(self, df):
        """
        Filter the given DataFrame with this conjunction.

        NULL values are always treated as non-matching: rows with a NULL in
        any of the conjunction's columns are dropped before the predicate is
        evaluated.

        Parameters
        ----------
        df: pandas.DataFrame
            DataFrame to evaluate on, must contain the required columns.

        Returns
        -------
        result: pandas.DataFrame
            Part of the DataFrame for which the conjunction holds.
        """
        required_columns = list(self.columns)
        has_no_nulls = df[required_columns].notnull().all(axis=1)
        df = df.loc[has_no_nulls]

        predicate = self.predicate
        if predicate is None:
            # kartothek does not support empty predicate lists
            return df
        return filter_df_from_predicates(df, [predicate])
Esempio n. 8
0
    def as_flat_series(
        self,
        compact: bool = False,
        partitions_as_index: bool = False,
        date_as_object: bool = False,
        predicates: PredicatesType = None,
    ):
        """
        Convert the Index object to a pandas.Series

        Parameters
        ----------
        compact:
            If True, ensures that the index will be unique. If there are multiple partition values per index value,
            these values will be compacted into a list (see Examples section).
        partitions_as_index:
            If True, the relation between index values and partitions will be reverted for the output:
            partition values will be used as index and the indices will be mapped to the partitions.
        date_as_object:
            Passed on unchanged to the ``to_pandas`` call that turns the intermediate index table into a DataFrame.
        predicates:
            A list of predicates. If a literal within the provided predicates
            references a column which is not part of this index, this literal is
            interpreted as True.

        Returns
        -------
        pandas.Series
            Index values mapped to partitions, or partitions mapped to index
            values if ``partitions_as_index`` is True.

        Examples:

        .. code::
        >>> index1 = ExplicitSecondaryIndex(
        ...     column="col", index_dct={1: ["part_1", "part_2"]}, dtype=pa.int64()
        ... )
        >>> index1
            col
            1    part_1
            1    part_2
        >>> index1.as_flat_series(compact=True)
            col
            1    [part_1, part_2]
        >>> index1.as_flat_series(partitions_as_index=True)
            partition
            part_1    1
            part_2    1

        """
        check_predicates(predicates)
        index_table = _index_dct_to_table(
            self.index_dct, column=self.column, dtype=self.dtype
        )
        df = index_table.to_pandas(date_as_object=date_as_object)

        if predicates is not None:
            # A conjunction without any reference to the index column makes
            # the entire predicates expression evaluate to True, in which
            # case the dataframe does not need to be filtered at all.
            always_true = any(
                filter_predicates_by_column([conjunction], [self.column]) is None
                for conjunction in predicates
            )
            if not always_true:
                df = filter_df_from_predicates(
                    df,
                    predicates=filter_predicates_by_column(
                        predicates, [self.column]
                    ),
                )

        partition_col = _PARTITION_COLUMN_NAME

        # The table already has the compact, non-inverted layout
        # (value: [partition, ...]), so it can be returned directly.
        if compact and not partitions_as_index:
            return df.set_index(self.column)[partition_col]

        # Every other combination needs one row per (value, partition) pair:
        # value: part_1
        # value: part_2
        # value2: part_1
        partition_lists = df[partition_col]
        if len(df) == 0:
            keys = np.array([], dtype=partition_lists.values.dtype)
        else:
            keys = np.concatenate(partition_lists.values)

        # Repeat each index value once per partition it is mapped to.
        lengths = partition_lists.apply(len).values.astype(int)
        row_positions = np.repeat(np.arange(len(df)), lengths)
        values = df[self.column].values[row_positions]

        df = pd.DataFrame({partition_col: keys, self.column: values})

        # Not inverted (and, at this point, not compact): flat mapping
        # value -> partition.
        if not partitions_as_index:
            return df.set_index(self.column)[partition_col]

        # Inverted, flat: partition -> value.
        if not compact:
            return df.set_index(partition_col)[self.column]

        # Inverted, compact: partition -> list of values.
        grouped = df.groupby(df[partition_col]).apply(
            lambda x: x[self.column].tolist()
        )
        grouped.name = self.column
        return grouped