def as_array_iterable(
    self, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
    return DASK_UTILS.as_array_iterable(
        self.native,
        schema=self.schema.pa_schema,
        columns=columns,
        type_safe=type_safe,
    )
def _apply_schema(
    self, pdf: pd.DataFrame, schema: Optional[Schema], type_safe: bool = True
) -> Tuple[pd.DataFrame, Schema]:
    if not type_safe:
        # trust the caller: validate presence only, skip type enforcement
        assert_arg_not_none(pdf, "pdf")
        assert_arg_not_none(schema, "schema")
        return pdf, schema
    DASK_UTILS.ensure_compatible(pdf)
    if pdf.columns.dtype == "object":  # pdf has named schema
        pschema = Schema(DASK_UTILS.to_schema(pdf))
        if schema is None or pschema == schema:
            return pdf, pschema.assert_not_empty()
        pdf = pdf[schema.assert_not_empty().names]
    else:  # pdf has no named schema, assign names positionally
        schema = _input_schema(schema).assert_not_empty()
        assert_or_throw(
            pdf.shape[1] == len(schema),
            ValueError(f"Pandas dataframe column count doesn't match {schema}"),
        )
        pdf.columns = schema.names
    return DASK_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema
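# A sketch of the two `_apply_schema` branches (illustrative only, assuming an
# instance `dask_df` of the surrounding class; `named` and `positional` are
# hypothetical names): string column labels are matched against the target
# schema, while default RangeIndex columns are renamed positionally.
#
#     named = pd.DataFrame({"a": [1], "b": ["x"]})  # columns dtype is "object"
#     positional = pd.DataFrame([[1, "x"]])         # columns dtype is int64
#     # dask_df._apply_schema(named, Schema("a:long,b:str"))
#     # dask_df._apply_schema(positional, Schema("a:long,b:str"))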
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)
    input_schema = df.schema
    # ensure on_init runs at most once per mapper instance
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _map(pdf: Any) -> pd.DataFrame:
        if pdf.shape[0] == 0:
            # Dask may hand over empty partitions; return an empty frame
            # that still carries the output schema
            return PandasDataFrame([], output_schema).as_pandas()
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
        )
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    df = self.to_df(df)
    if len(partition_spec.partition_by) == 0:
        # no partition keys: map over the physical partitions directly
        pdf = self.repartition(df, partition_spec)
        result = pdf.native.map_partitions(_map, meta=output_schema.pandas_dtype)
    else:
        # group by the partition keys and apply the mapper per group
        df = self.repartition(df, PartitionSpec(num=partition_spec.num_partitions))
        result = DASK_UTILS.safe_groupby_apply(
            df.native,
            partition_spec.partition_by,
            _map,
            meta=output_schema.pandas_dtype,
        )
    return DaskDataFrame(result, output_schema, metadata)
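# A minimal usage sketch for `map` (illustrative only): assuming an engine
# instance `engine` and an input DaskDataFrame `df` with schema "a:int,b:int",
# this keeps the first row of each partition keyed on `a`, presorted by `b`
# descending. `engine`, `df`, and `keep_first` are hypothetical names, not
# part of this module.
#
#     def keep_first(cursor: PartitionCursor, data: LocalDataFrame) -> LocalDataFrame:
#         # `data` holds one logical partition, already presorted per the spec
#         return PandasDataFrame(data.as_pandas().head(1), data.schema)
#
#     result = engine.map(
#         df,
#         keep_first,
#         output_schema="a:int,b:int",
#         partition_spec=PartitionSpec(by=["a"], presort="b desc"),
#     )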
def empty(self) -> bool:
    return DASK_UTILS.empty(self.native)
def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
    # normalize variants such as "left_outer" / "LEFT OUTER" to "leftouter"
    how = how.lower().replace("_", "").replace(" ", "")
    if how == "cross":
        # Dask merge has no cross join, so join on a constant dummy key
        d1 = self.to_df(df1).native
        d2 = self.to_df(df2).native
        d1["__cross_join_index__"] = 1
        d2["__cross_join_index__"] = 1
        d = d1.merge(d2, on="__cross_join_index__").drop(
            "__cross_join_index__", axis=1
        )
        return DaskDataFrame(d.reset_index(drop=True), output_schema, metadata)
    if how in ["semi", "leftsemi"]:
        # semi join: keep df1 rows whose keys exist in df2
        d1 = self.to_df(df1).native
        d2 = self.to_df(df2).native[key_schema.names]
        d = d1.merge(d2, on=key_schema.names, how="inner")
        return DaskDataFrame(d.reset_index(drop=True), output_schema, metadata)
    if how in ["anti", "leftanti"]:
        # anti join: keep df1 rows whose keys do NOT exist in df2
        d1 = self.to_df(df1).native
        d2 = self.to_df(df2).native[key_schema.names]
        if DASK_UTILS.empty(d1) or DASK_UTILS.empty(d2):
            # either side empty means nothing can be excluded from df1
            return df1
        d2["__anti_join_dummy__"] = 1.0
        d = d1.merge(d2, on=key_schema.names, how="left")
        d = d[d["__anti_join_dummy__"].isnull()]
        return DaskDataFrame(
            d.drop(["__anti_join_dummy__"], axis=1).reset_index(drop=True),
            output_schema,
            metadata,
        )
    # outer joins can introduce NaN into non-nullable columns; mark the
    # unmatched side so the values can be fixed after the merge
    fix_left, fix_right = False, False
    if how in ["leftouter"]:
        how = "left"
        self._validate_outer_joinable(df2.schema, key_schema)
        fix_right = True
    if how in ["rightouter"]:
        how = "right"
        self._validate_outer_joinable(df1.schema, key_schema)
        fix_left = True
    if how in ["fullouter"]:
        how = "outer"
        self._validate_outer_joinable(df1.schema, key_schema)
        self._validate_outer_joinable(df2.schema, key_schema)
        fix_left, fix_right = True, True
    d1 = self.to_df(df1).native
    d2 = self.to_df(df2).native
    d = d1.merge(d2, on=key_schema.names, how=how)
    if fix_left:
        d = self._fix_nan(
            d, output_schema, df1.schema.exclude(list(df2.schema.keys())).keys()
        )
    if fix_right:
        d = self._fix_nan(
            d, output_schema, df2.schema.exclude(list(df1.schema.keys())).keys()
        )
    return DaskDataFrame(d.reset_index(drop=True), output_schema, metadata)
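# A minimal usage sketch for `join` (illustrative only): assuming an engine
# instance `engine` and two DaskDataFrames `df1` ("a:int,b:int") and `df2`
# ("a:int,c:str"), an anti join keeps only the `df1` rows whose key `a` has
# no match in `df2`. All variable names here are hypothetical.
#
#     anti = engine.join(df1, df2, how="left_anti", on=["a"])
#     # anti.schema == "a:int,b:int" -- output columns come from df1 only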