Ejemplo n.º 1
0
 def as_array_iterable(self,
                       columns: Optional[List[str]] = None,
                       type_safe: bool = False) -> Iterable[Any]:
     """Iterate over the rows of this dataframe.

     The work is delegated to ``DASK_UTILS.as_array_iterable``, which
     receives the native frame together with this dataframe's
     ``schema.pa_schema``.

     :param columns: optional subset of column names, forwarded unchanged;
       defaults to None
     :param type_safe: forwarded unchanged to the utility, defaults to False
     :return: whatever iterable ``DASK_UTILS.as_array_iterable`` produces
     """
     native_df = self.native
     arrow_schema = self.schema.pa_schema
     return DASK_UTILS.as_array_iterable(
         native_df, schema=arrow_schema, columns=columns, type_safe=type_safe
     )
Ejemplo n.º 2
0
 def _apply_schema(
     self, pdf: pd.DataFrame, schema: Optional[Schema], type_safe: bool = True
 ) -> Tuple[pd.DataFrame, Schema]:
     """Reconcile ``pdf`` with ``schema`` and return both.

     :param pdf: the dataframe to validate and, when needed, convert
     :param schema: the expected schema; may be None only when ``pdf``
       has named columns from which a schema can be derived
     :param type_safe: when False, ``pdf`` and ``schema`` are only checked
       for being non-None and returned untouched; defaults to True
     :return: the (possibly re-ordered and type-enforced) dataframe and
       the schema that describes it
     :raises ValueError: if ``pdf`` has unnamed columns and their count
       differs from the length of ``schema``
     """
     if not type_safe:
         # The caller vouches for the data; only require both arguments.
         assert_arg_not_none(pdf, "pdf")
         assert_arg_not_none(schema, "schema")
         return pdf, schema

     DASK_UTILS.ensure_compatible(pdf)
     has_named_columns = pdf.columns.dtype == "object"
     if not has_named_columns:
         # Columns carry no names: take them positionally from the schema.
         schema = _input_schema(schema).assert_not_empty()
         column_count = pdf.shape[1]
         assert_or_throw(
             column_count == len(schema),
             ValueError(f"Pandas datafame column count doesn't match {schema}"),
         )
         pdf.columns = schema.names
     else:
         inferred = Schema(DASK_UTILS.to_schema(pdf))
         if schema is None or inferred == schema:
             # Nothing to convert: the frame already matches.
             return pdf, inferred.assert_not_empty()
         # Select the columns in the order given by the requested schema.
         pdf = pdf[schema.assert_not_empty().names]

     typed = DASK_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True)
     return typed, schema
Ejemplo n.º 3
0
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        """Apply ``map_func`` to every partition of ``df``.

        :param df: the input dataframe
        :param map_func: called with a partition cursor and the partition
          wrapped as a ``PandasDataFrame``; its result is converted with
          ``as_pandas``
        :param output_schema: anything accepted by ``Schema(...)``
          describing the result
        :param partition_spec: supplies the presort, the partition keys and
          the partition count
        :param metadata: passed to the resulting ``DaskDataFrame``,
          defaults to None
        :param on_init: optional callback wrapped in ``RunOnce`` and invoked
          with ``(0, partition_df)`` before ``map_func``, defaults to None
        :return: a ``DaskDataFrame`` holding the mapped result
        """
        presort = partition_spec.presort
        sort_columns = list(presort.keys())
        sort_ascending = list(presort.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema

        on_init_once: Any = None
        if on_init is not None:
            on_init_once = RunOnce(
                on_init,
                lambda *args, **kwargs: to_uuid(id(on_init), id(args[0])),
            )

        def _map(pdf: Any) -> pd.DataFrame:
            # An empty partition yields an empty frame of the output schema
            # without invoking the user function.
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if sort_columns:
                pdf = pdf.sort_values(sort_columns, ascending=sort_ascending)
            partition_df = PandasDataFrame(
                pdf.reset_index(drop=True),
                input_schema,
                pandas_df_wrapper=True,
            )
            if on_init_once is not None:
                on_init_once(0, partition_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(partition_df.peek_array(), 0, 0)
            return map_func(cursor, partition_df).as_pandas()

        df = self.to_df(df)
        if len(partition_spec.partition_by) > 0:
            # Only the partition count is applied up front; the keys are
            # handed to the group-by helper below.
            count_only_spec = PartitionSpec(num=partition_spec.num_partitions)
            df = self.repartition(df, count_only_spec)
            result = DASK_UTILS.safe_groupby_apply(
                df.native,
                partition_spec.partition_by,
                _map,
                meta=output_schema.pandas_dtype,
            )
        else:
            repartitioned = self.repartition(df, partition_spec)
            result = repartitioned.native.map_partitions(
                _map, meta=output_schema.pandas_dtype
            )
        return DaskDataFrame(result, output_schema, metadata)
Ejemplo n.º 4
0
 def empty(self) -> bool:
     """Return the result of ``DASK_UTILS.empty`` for the native frame."""
     native_df = self.native
     return DASK_UTILS.empty(native_df)
Ejemplo n.º 5
0
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     """Join ``df1`` with ``df2``.

     ``how`` is normalised by lower-casing and removing underscores and
     spaces. The handled values are ``cross``, ``semi``/``leftsemi``,
     ``anti``/``leftanti``, ``leftouter``, ``rightouter`` and
     ``fullouter``; any other value is passed to ``merge`` unchanged.

     :param df1: the left dataframe
     :param df2: the right dataframe
     :param how: the join type, see above
     :param on: the join keys, forwarded to ``get_join_schemas``
     :param metadata: passed to the resulting ``DaskDataFrame``,
       defaults to None
     :return: a ``DaskDataFrame`` with the schema computed by
       ``get_join_schemas``
     """
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     how = how.lower().replace("_", "").replace(" ", "")
     keys = key_schema.names

     if how == "cross":
         # ``assign`` builds new frames. Item assignment would rebind the
         # native frames in place, and those may be the very objects held
         # by the caller's df1/df2, leaking the helper column into them.
         d1 = self.to_df(df1).native.assign(__cross_join_index__=1)
         d2 = self.to_df(df2).native.assign(__cross_join_index__=1)
         d = d1.merge(d2, on="__cross_join_index__").drop(
             "__cross_join_index__", axis=1)
         return DaskDataFrame(d.reset_index(drop=True), output_schema,
                              metadata)

     if how in ["semi", "leftsemi"]:
         d1 = self.to_df(df1).native
         # De-duplicate the right-hand keys so a left row matching several
         # right rows is emitted once instead of once per match.
         d2 = self.to_df(df2).native[keys].drop_duplicates()
         d = d1.merge(d2, on=keys, how="inner")
         return DaskDataFrame(d.reset_index(drop=True), output_schema,
                              metadata)

     if how in ["anti", "leftanti"]:
         d1 = self.to_df(df1).native
         d2 = self.to_df(df2).native[keys]
         if DASK_UTILS.empty(d1) or DASK_UTILS.empty(d2):
             # Nothing can be excluded: the result is all of df1. Wrap it
             # like every other path so ``metadata`` is honoured.
             return DaskDataFrame(d1.reset_index(drop=True), output_schema,
                                  metadata)
         # Rows of d1 without a partner keep a null marker after the left
         # merge; those are the rows to keep.
         d2 = d2.assign(__anti_join_dummy__=1.0)
         d = d1.merge(d2, on=keys, how="left")
         d = d[d["__anti_join_dummy__"].isnull()]
         return DaskDataFrame(
             d.drop(["__anti_join_dummy__"], axis=1).reset_index(drop=True),
             output_schema,
             metadata,
         )

     # Outer joins: the side that may be missing is validated first and
     # its non-shared columns are passed to ``_fix_nan`` after the merge.
     fix_left, fix_right = False, False
     if how == "leftouter":
         how = "left"
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_right = True
     elif how == "rightouter":
         how = "right"
         self._validate_outer_joinable(df1.schema, key_schema)
         fix_left = True
     elif how == "fullouter":
         how = "outer"
         self._validate_outer_joinable(df1.schema, key_schema)
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_left, fix_right = True, True

     d1 = self.to_df(df1).native
     d2 = self.to_df(df2).native
     d = d1.merge(d2, on=keys, how=how)
     if fix_left:
         d = self._fix_nan(
             d, output_schema,
             df1.schema.exclude(list(df2.schema.keys())).keys())
     if fix_right:
         d = self._fix_nan(
             d, output_schema,
             df2.schema.exclude(list(df1.schema.keys())).keys())
     return DaskDataFrame(d.reset_index(drop=True), output_schema, metadata)