Example #1
    def append(self, obj: Any) -> "Schema":  # noqa: C901
        """Append schema like object to the current schema. Only new columns
        are allowed.

        :raises SchemaError: if a column exists or is invalid or obj is not convertible
        :return: the Schema object itself
        """
        try:
            if obj is None:
                return self
            elif isinstance(obj, pa.Field):
                self[obj.name] = obj.type
            elif isinstance(obj, str):
                self._append_pa_schema(expression_to_schema(obj))
            elif isinstance(obj, Dict):
                for k, v in obj.items():
                    self[k] = v
            elif isinstance(obj, pa.Schema):
                self._append_pa_schema(obj)
            elif isinstance(obj, pd.DataFrame):
                self._append_pa_schema(PD_UTILS.to_schema(obj))
            elif isinstance(obj, Tuple):  # type: ignore
                self[obj[0]] = obj[1]
            elif isinstance(obj, List):
                for x in obj:
                    self.append(x)
            else:
                raise SchemaError(f"Invalid schema to add {obj}")
            return self
        except SchemaError:
            raise
        except Exception as e:
            raise SchemaError(str(e)) from e
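A hedged usage sketch of the dispatch above, assuming triad's Schema class and
expression strings such as "a:int" (the names and values here are illustrative):

import pyarrow as pa
from triad.collections.schema import Schema

s = Schema("a:int,b:str")
s.append("c:double")                 # expression string
s.append(pa.field("d", pa.bool_()))  # pyarrow Field
s.append({"e": pa.int64()})          # dict of name -> type
s.append(("f", pa.string()))         # (name, type) tuple
s.append([("g", pa.float64())])      # list of any of the above
print(s)  # expected: a:int,b:str,c:double,d:bool,e:long,f:str,g:double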
Example #2
 def as_array(self, cols=None, type_safe=False, null_schema=False):
     schema = None if null_schema else self.schema
     return list(
         PD_UTILS.as_array_iterable(self.native,
                                    schema=schema,
                                    columns=cols,
                                    type_safe=type_safe))
Example #3
 def _apply_schema(self, pdf: pd.DataFrame,
                   schema: Optional[Schema]) -> Tuple[pd.DataFrame, Schema]:
     PD_UTILS.ensure_compatible(pdf)
     if pdf.columns.dtype == "object":  # pdf has named schema
         pschema = _input_schema(pdf)
         if schema is None or pschema == schema:
             return pdf, pschema.assert_not_empty()
         pdf = pdf[schema.assert_not_empty().names]
     else:  # pdf has no named schema
         schema = _input_schema(schema).assert_not_empty()
         assert_or_throw(
             pdf.shape[1] == len(schema),
             ValueError(
                 f"Pandas datafame column count doesn't match {schema}"),
         )
         pdf.columns = schema.names
     return _enforce_type(pdf, schema), schema
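The branch on pdf.columns.dtype is what separates frames with real column
names from frames built positionally; a minimal pandas-only sketch of that
check:

import pandas as pd

named = pd.DataFrame([[1, "x"]], columns=["a", "b"])
unnamed = pd.DataFrame([[1, "x"]])  # columns default to a RangeIndex (0, 1)

print(named.columns.dtype)    # object -> treated as having a named schema
print(unnamed.columns.dtype)  # int64  -> a schema must be supplied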
Example #4
def test_safe_group_by_apply():
    df = DF([["a", 1], ["a", 2], [None, 3]], "b:str,c:long", True)

    def _m1(df):
        PD_UTILS.ensure_compatible(df)
        df["ct"] = df.shape[0]
        return df

    res = PD_UTILS.safe_groupby_apply(df.native, ["b"], _m1)
    PD_UTILS.ensure_compatible(res)
    assert 3 == res.shape[0]
    assert 3 == res.shape[1]
    assert [["a", 1, 2], ["a", 2, 2], [None, 3, 1]] == res.values.tolist()

    res = PD_UTILS.safe_groupby_apply(df.native, [], _m1)
    PD_UTILS.ensure_compatible(res)
    assert 3 == res.shape[0]
    assert 3 == res.shape[1]
    assert [["a", 1, 3], ["a", 2, 3], [None, 3, 3]] == res.values.tolist()

    df = DF([[1.0, "a"], [1.0, "b"], [None, "c"], [None, "d"]],
            "b:double,c:str", True)
    res = PD_UTILS.safe_groupby_apply(df.native, ["b"], _m1)
    assert [
        [1.0, "a", 2],
        [1.0, "b", 2],
        [float("nan"), "c", 2],
        [float("nan"), "d", 2],
    ].__repr__() == res.values.tolist().__repr__()
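The "safe" part of safe_groupby_apply matters because plain pandas groupby
drops rows with null keys by default, while the test above expects the None
group to survive; a sketch of the underlying pitfall (dropna needs
pandas >= 1.1):

import pandas as pd

pdf = pd.DataFrame([["a", 1], ["a", 2], [None, 3]], columns=["b", "c"])
print(pdf.groupby("b").size())                # the None group is dropped
print(pdf.groupby("b", dropna=False).size())  # the None group is kept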
Example #5
 def as_array_iterable(self,
                       columns: Optional[List[str]] = None,
                       type_safe: bool = False) -> Iterable[Any]:
     for row in PD_UTILS.as_array_iterable(
             self.native,
             schema=self.schema.pa_schema,
             columns=columns,
             type_safe=type_safe,
     ):
         yield row
Example #6
 def __init__(  # noqa: C901
     self,
     df: Any = None,
     schema: Any = None,
     metadata: Any = None,
     pandas_df_wrapper: bool = False,
 ):
     try:
         apply_schema = True
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
             df = []
         if isinstance(df, PandasDataFrame):
              # TODO: this branch is redundant as written and likely wrong
             pdf = df.native
             schema = None
         elif isinstance(df, (pd.DataFrame, pd.Series)):
             if isinstance(df, pd.Series):
                 df = df.to_frame()
             pdf = df
             schema = None if schema is None else _input_schema(schema)
             if pandas_df_wrapper and schema is not None:
                 apply_schema = False
         elif isinstance(df, Iterable):
             schema = _input_schema(schema).assert_not_empty()
             pdf = pd.DataFrame(df, columns=schema.names)
             pdf = PD_UTILS.enforce_type(pdf,
                                         schema.pa_schema,
                                         null_safe=True)
             if PD_UTILS.empty(pdf):
                 for k, v in schema.items():
                     pdf[k] = pdf[k].astype(v.type.to_pandas_dtype())
             apply_schema = False
         else:
             raise ValueError(f"{df} is incompatible with PandasDataFrame")
         if apply_schema:
             pdf, schema = self._apply_schema(pdf, schema)
         super().__init__(schema, metadata)
         self._native = pdf
     except Exception as e:
         raise FugueDataFrameInitError from e
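A hedged usage sketch of the constructor paths above, assuming
fugue.dataframe.PandasDataFrame as in the Fugue project:

import pandas as pd
from fugue.dataframe import PandasDataFrame

df1 = PandasDataFrame([[0, "a"], [1, "b"]], "x:long,y:str")  # iterable + schema
df2 = PandasDataFrame(pd.DataFrame({"x": [0, 1]}))           # schema inferred
print(df1.schema, df2.schema)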
Example #7
def test_fillna_default():
    df = pd.DataFrame([["a"], [None]], columns=["x"])
    s = PD_UTILS.fillna_default(df["x"])
    assert ["a", 0] == s.tolist()

    df = pd.DataFrame([["a"], ["b"]], columns=["x"])
    s = PD_UTILS.fillna_default(df["x"].astype(np.str))
    assert ["a", "b"] == s.tolist()

    dt = datetime.now()
    df = pd.DataFrame([[dt], [None]], columns=["x"])
    s = PD_UTILS.fillna_default(df["x"])
    assert [dt, _DEFAULT_DATETIME] == s.tolist()

    df = pd.DataFrame([[True], [None]], columns=["x"])
    s = PD_UTILS.fillna_default(df["x"])
    assert [True, 0] == s.tolist()

    df = pd.DataFrame([[True], [False]], columns=["x"])
    s = PD_UTILS.fillna_default(df["x"].astype(bool))
    assert [True, False] == s.tolist()
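The [True, 0] expectation above follows from a pandas detail: a bool column
containing None is stored as object dtype, so filling with 0 keeps the 0
as-is instead of coercing it to False:

import pandas as pd

s = pd.Series([True, None])
print(s.dtype)               # object
print(s.fillna(0).tolist())  # [True, 0]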
Example #8
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        if partition_spec.num_partitions != "0":
            self.log.warning(
                f"{self} doesn't respect num_partitions {partition_spec.num_partitions}"
            )
        cursor = partition_spec.get_cursor(df.schema, 0)
        if on_init is not None:
            on_init(0, df)
        if len(partition_spec.partition_by) == 0:  # no partition
            df = to_local_df(df)
            cursor.set(df.peek_array(), 0, 0)
            output_df = map_func(cursor, df)
            assert_or_throw(
                output_df.schema == output_schema,
                f"map output {output_df.schema} mismatches given {output_schema}",
            )
            output_df._metadata = ParamDict(metadata, deep=True)
            output_df._metadata.set_readonly()
            return self.to_df(output_df)
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)

        def _map(pdf: pd.DataFrame) -> pd.DataFrame:
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(pdf.reset_index(drop=True),
                                       df.schema,
                                       pandas_df_wrapper=True)
            cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        result = PD_UTILS.safe_groupby_apply(df.as_pandas(),
                                             partition_spec.partition_by, _map)
        return PandasDataFrame(result, output_schema, metadata)
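A minimal pandas sketch of the presort step inside _map: the presort spec maps
each key to an ascending flag, and the two parallel lists feed sort_values:

import pandas as pd

pdf = pd.DataFrame({"a": [1, 1, 2], "b": [3, 1, 2]})
presort = {"a": True, "b": False}  # column -> ascending?
print(pdf.sort_values(list(presort.keys()), ascending=list(presort.values())))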
Example #9
 def as_pandas(self) -> pd.DataFrame:
     """Convert to pandas DataFrame"""
     pdf = pd.DataFrame(self.as_array(), columns=self.schema.names)
     return PD_UTILS.enforce_type(pdf, self.schema.pa_schema, null_safe=True)
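The null-safe cast is needed because numpy-backed integer columns cannot hold
None; pandas silently upcasts them to float, which is the behavior
enforce_type(..., null_safe=True) has to compensate for:

import pandas as pd

pdf = pd.DataFrame({"x": [1, None]})
print(pdf["x"].dtype)  # float64 -- int columns with nulls are upcast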
Example #10
    def to_df(self,
              df: Any,
              schema: Any = None,
              metadata: Any = None) -> SparkDataFrame:
        """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

        :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
          :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
          pandas DataFrame or list or iterable of arrays
        :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`,
          defaults to None
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: engine compatible dataframe

        :Notice:

        * if the input is already :class:`~fugue_spark.dataframe.SparkDataFrame`,
          it should return itself
        * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
          ``schema`` must be specified
        * When ``schema`` is not None, a type cast may happen to make the
          dataframe conform to the given schema
        * all other methods in the engine can take arbitrary dataframes and
          call this method to convert them before doing anything else
        """
        if isinstance(df, DataFrame):
            assert_or_throw(
                schema is None and metadata is None,
                ValueError(
                    "schema and metadata must be None when df is a DataFrame"),
            )
            if isinstance(df, SparkDataFrame):
                return df
            if isinstance(df, ArrowDataFrame):
                sdf = self.spark_session.createDataFrame(
                    df.as_array(), to_spark_schema(df.schema))
                return SparkDataFrame(sdf, df.schema, df.metadata)
            if isinstance(df, (ArrayDataFrame, IterableDataFrame)):
                adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema)
                sdf = self.spark_session.createDataFrame(
                    adf.as_array(), to_spark_schema(df.schema))
                return SparkDataFrame(sdf, df.schema, df.metadata)
            if any(pa.types.is_struct(t) for t in df.schema.types):
                sdf = self.spark_session.createDataFrame(
                    df.as_array(type_safe=True), to_spark_schema(df.schema))
            else:
                sdf = self.spark_session.createDataFrame(
                    df.as_pandas(), to_spark_schema(df.schema))
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, ps.DataFrame):
            return SparkDataFrame(
                df, None if schema is None else to_schema(schema), metadata)
        if isinstance(df, RDD):
            assert_arg_not_none(schema, "schema")
            sdf = self.spark_session.createDataFrame(df,
                                                     to_spark_schema(schema))
            return SparkDataFrame(sdf, to_schema(schema), metadata)
        if isinstance(df, pd.DataFrame):
            if PD_UTILS.empty(df):
                temp_schema = to_spark_schema(PD_UTILS.to_schema(df))
                sdf = self.spark_session.createDataFrame([], temp_schema)
            else:
                sdf = self.spark_session.createDataFrame(df)
            return SparkDataFrame(sdf, schema, metadata)

        # use arrow dataframe here to handle nulls in int cols
        assert_or_throw(schema is not None,
                        FugueDataFrameInitError("schema can't be None"))
        adf = ArrowDataFrame(df, to_schema(schema))
        sdf = self.spark_session.createDataFrame(adf.as_array(),
                                                 to_spark_schema(adf.schema))
        return SparkDataFrame(sdf, adf.schema, metadata)
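A hedged usage sketch of to_df, assuming fugue_spark's SparkExecutionEngine
(import path and no-argument construction may vary across Fugue versions):

from fugue_spark import SparkExecutionEngine

engine = SparkExecutionEngine()  # assumed to create or reuse a SparkSession
sdf = engine.to_df([[0, "a"], [1, None]], "x:long,y:str")
print(sdf.schema)  # x:long,y:str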
Example #11
 def __init__(self, data, schema, enforce=False):
     s = expression_to_schema(schema)
     df = pd.DataFrame(data, columns=s.names)
     self.native = PD_UTILS.enforce_type(df, s, enforce)
     self.schema = s
Example #12
 def _m1(df):
     PD_UTILS.ensure_compatible(df)
     df["ct"] = df.shape[0]
     return df
Example #13
def test_safe_group_by_apply_special_types():
    def _m1(df):
        PD_UTILS.ensure_compatible(df)
        df["ct"] = df.shape[0]
        return df

    df = DF([["a", 1.0], [None, 3.0], [None, 3.0], [None, None]],
            "a:str,b:double", True)
    res = PD_UTILS.safe_groupby_apply(df.native, ["a", "b"], _m1)
    PD_UTILS.ensure_compatible(res)
    assert 4 == res.shape[0]
    assert 3 == res.shape[1]
    DF(
        [["a", 1.0, 1], [None, 3.0, 2], [None, 3.0, 2], [None, None, 1]],
        "a:str,b:double,ct:int",
        True,
    ).assert_eq(res)

    dt = datetime.now()
    df = DF([["a", dt], [None, dt], [None, dt], [None, None]],
            "a:str,b:datetime", True)
    res = PD_UTILS.safe_groupby_apply(df.native, ["a", "b"], _m1)
    PD_UTILS.ensure_compatible(res)
    assert 4 == res.shape[0]
    assert 3 == res.shape[1]
    DF(
        [["a", dt, 1], [None, dt, 2], [None, dt, 2], [None, None, 1]],
        "a:str,b:datetime,ct:int",
        True,
    ).assert_eq(res)

    dt = date(2020, 1, 1)
    df = DF([["a", dt], [None, dt], [None, dt], [None, None]], "a:str,b:date",
            True)
    res = PD_UTILS.safe_groupby_apply(df.native, ["a", "b"], _m1)
    PD_UTILS.ensure_compatible(res)
    assert 4 == res.shape[0]
    assert 3 == res.shape[1]
    DF(
        [["a", dt, 1], [None, dt, 2], [None, dt, 2], [None, None, 1]],
        "a:str,b:date,ct:int",
        True,
    ).assert_eq(res)

    dt = date(2020, 1, 1)
    df = DF([["a", dt], ["b", dt], ["b", dt], ["b", None]], "a:str,b:date",
            True)
    res = PD_UTILS.safe_groupby_apply(df.native, ["a", "b"], _m1)
    PD_UTILS.ensure_compatible(res)
    assert 4 == res.shape[0]
    assert 3 == res.shape[1]
    DF(
        [["a", dt, 1], ["b", dt, 2], ["b", dt, 2], ["b", None, 1]],
        "a:str,b:date,ct:int",
        True,
    ).assert_eq(res)
Example #14
def test_to_schema():
    df = pd.DataFrame([[1.0, 2], [2.0, 3]])
    raises(ValueError, lambda: PD_UTILS.to_schema(df))
    df = pd.DataFrame([[1.0, 2], [2.0, 3]], columns=["x", "y"])
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df = pd.DataFrame([["a", 2], ["b", 3]], columns=["x", "y"])
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df = pd.DataFrame([], columns=["x", "y"])
    df = df.astype(dtype={"x": np.int32, "y": np.dtype("object")})
    assert [pa.field("x", pa.int32()),
            pa.field("y", pa.string())] == list(PD_UTILS.to_schema(df))
    df = pd.DataFrame([[1, "x"], [2, "y"]], columns=["x", "y"])
    df = df.astype(dtype={"x": np.int32, "y": np.dtype("object")})
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df = pd.DataFrame([[1, "x"], [2, "y"]], columns=["x", "y"])
    df = df.astype(dtype={"x": np.int32, "y": np.dtype(str)})
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df = pd.DataFrame([[1, "x"], [2, "y"]], columns=["x", "y"])
    df = df.astype(dtype={"x": np.int32, "y": np.dtype("str")})
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))

    # test index
    df = pd.DataFrame([[3.0, 2], [2.0, 3]], columns=["x", "y"])
    df = df.sort_values(["x"])
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df.index.name = "x"
    raises(ValueError, lambda: PD_UTILS.to_schema(df))
    df = df.reset_index(drop=True)
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df["p"] = "p"
    df = df.set_index(["p"])
    df.index.name = None
    raises(ValueError, lambda: PD_UTILS.to_schema(df))
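The index assertions above guard against pyarrow treating a meaningful index
as a column; a pyarrow-only sketch of the behavior being defended against:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"x": [1.0, 2.0], "y": [2, 3]}).set_index("y")
print(pa.Schema.from_pandas(df))                        # picks up "y" from the index
print(pa.Schema.from_pandas(df, preserve_index=False))  # "x" only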