def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    """Apply ``map_func`` to ``df`` partition by partition on this engine.

    :param df: input dataframe
    :param map_func: function applied to each partition, receiving the
        partition cursor and a local dataframe, returning a local dataframe
    :param output_schema: schema the mapped output must match
    :param partition_spec: how to partition/presort ``df`` before mapping
    :param metadata: metadata attached to the output dataframe
    :param on_init: optional callback invoked once before mapping
    :return: the mapped dataframe with ``output_schema``
    """
    # NOTE(review): num_partitions is compared against the string "0", so it
    # is presumably stored as a string expression — confirm in PartitionSpec.
    if partition_spec.num_partitions != "0":
        self.log.warning(
            "%s doesn't respect num_partitions %s",
            self,
            partition_spec.num_partitions,
        )
    cursor = partition_spec.get_cursor(df.schema, 0)
    if on_init is not None:
        on_init(0, df)
    if len(partition_spec.partition_by) == 0:
        # no partition: map the whole dataframe as a single partition
        df = to_local_df(df)
        cursor.set(df.peek_array(), 0, 0)
        output_df = map_func(cursor, df)
        # re-wrap with the requested schema when the mapped pandas frame's
        # schema does not already match
        if (isinstance(output_df, PandasDataFrame)
                and output_df.schema != output_schema):
            output_df = PandasDataFrame(output_df.native, output_schema)
        assert_or_throw(
            output_df.schema == output_schema,
            lambda: f"map output {output_df.schema} "
            f"mismatches given {output_schema}",
        )
        # attach a read-only copy of the metadata to the result
        output_df._metadata = ParamDict(metadata, deep=True)
        output_df._metadata.set_readonly()
        return self.to_df(output_df)
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)

    def _map(pdf: pd.DataFrame) -> pd.DataFrame:
        # per-group mapper: presort, wrap, advance the cursor, apply map_func
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), df.schema, pandas_df_wrapper=True)
        # NOTE(review): partition_no is incremented per group call; this
        # relies on groups being processed sequentially — confirm that
        # safe_groupby_apply does not parallelize.
        cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    result = self.pl_utils.safe_groupby_apply(
        df.as_pandas(), partition_spec.partition_by, _map)
    return PandasDataFrame(result, output_schema, metadata)
def union(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    """Union two dataframes that share the same schema.

    :param df1: first dataframe
    :param df2: second dataframe, must have the same schema as ``df1``
    :param distinct: when True, deduplicate the result
    :param metadata: metadata attached to the output dataframe
    :return: the unioned dataframe with ``df1``'s schema
    """
    assert_or_throw(
        df1.schema == df2.schema,
        ValueError(f"{df1.schema} != {df2.schema}"),
    )
    combined = self.pl_utils.union(
        df1.as_pandas(), df2.as_pandas(), unique=distinct
    )
    return PandasDataFrame(
        combined.reset_index(drop=True), df1.schema, metadata
    )
def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    """Join two dataframes, delegating to the engine's pandas-like utils.

    :param df1: left dataframe
    :param df2: right dataframe
    :param how: join type string understood by ``pl_utils.join``
    :param on: join keys; defaults to the engine's default join keys
    :param metadata: metadata attached to the output dataframe
    :return: the joined dataframe with the derived output schema
    """
    key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
    joined = self.pl_utils.join(
        df1.as_pandas(),
        df2.as_pandas(),
        join_type=how,
        on=key_schema.names,
    )
    return PandasDataFrame(
        joined.reset_index(drop=True), output_schema, metadata
    )
def fillna(
    self,
    df: DataFrame,
    value: Any,
    subset: Optional[List[str]] = None,
    metadata: Any = None,
) -> DataFrame:
    """Fill NA/NaN values of ``df``.

    :param df: input dataframe
    :param value: scalar fill value, or a dict mapping column name to its
        fill value; lists and ``None`` are rejected
    :param subset: columns to fill when ``value`` is a scalar; None means
        all columns. Ignored when ``value`` is a dict.
    :param metadata: metadata attached to the output dataframe
    :return: dataframe with NA values filled, schema unchanged
    :raises ValueError: if ``value`` is None, a list, or an invalid dict
    """
    assert_or_throw(
        (not isinstance(value, list)) and (value is not None),
        # fixed grammar: "can not None" -> "can not be None"
        ValueError("fillna value can not be None or a list"),
    )
    if isinstance(value, dict):
        # FIX: the original used any(value.values()), which wrongly rejected
        # legitimate falsy fill values such as 0, False, or "". The intent
        # (per the message) is only that the dict be non-empty and contain
        # no None values.
        assert_or_throw(
            (None not in value.values()) and (len(value) > 0),
            ValueError(
                "fillna dict can not contain None and needs at least one value"
            ),
        )
        mapping = value
    else:
        # If subset is none, apply to all columns
        subset = subset or df.schema.names
        mapping = {col: value for col in subset}
    d = df.as_pandas().fillna(mapping, inplace=False)
    return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
def intersect(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    """Intersect two dataframes with identical schemas (distinct only).

    :param df1: first dataframe
    :param df2: second dataframe, must have the same schema as ``df1``
    :param distinct: must be True; INTERSECT ALL is not implemented
    :param metadata: metadata attached to the output dataframe
    :return: rows present in both dataframes
    """
    assert_or_throw(
        distinct,
        NotImplementedError("INTERSECT ALL for NativeExecutionEngine"),
    )
    assert_or_throw(
        df1.schema == df2.schema,
        ValueError(f"{df1.schema} != {df2.schema}"),
    )
    common = self.pl_utils.intersect(
        df1.as_pandas(), df2.as_pandas(), unique=distinct
    )
    return PandasDataFrame(common.reset_index(drop=True), df1.schema, metadata)
def on_init(self, partition_no: int, df: DataFrame) -> None:
    """Prepare the wrapped transformer for a new partition.

    Builds a cursor for ``partition_no``, attaches this wrapper's metadata
    to ``df``, then forwards to the transformer's own ``on_init``.
    """
    spec = self.transformer.partition_spec
    self.transformer._cursor = spec.get_cursor(  # type: ignore
        self.schema, partition_no
    )
    df._metadata = self.metadata
    self.transformer.on_init(df)
def distinct(
    self,
    df: DataFrame,
    metadata: Any = None,
) -> DataFrame:
    """Drop duplicate rows from ``df``.

    :param df: input dataframe
    :param metadata: metadata attached to the output dataframe
    :return: deduplicated dataframe, schema unchanged
    """
    deduped = self.pl_utils.drop_duplicates(df.as_pandas())
    return PandasDataFrame(deduped.reset_index(drop=True), df.schema, metadata)
def subtract(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    """Return rows of ``df1`` not present in ``df2`` (distinct only).

    :param df1: dataframe to subtract from
    :param df2: dataframe whose rows are removed; same schema as ``df1``
    :param distinct: must be True; EXCEPT ALL is not implemented
    :param metadata: metadata attached to the output dataframe
    :return: the set difference with ``df1``'s schema
    """
    assert_or_throw(
        distinct,
        NotImplementedError("EXCEPT ALL for NativeExecutionEngine"),
    )
    assert_or_throw(
        df1.schema == df2.schema,
        lambda: ValueError(f"{df1.schema} != {df2.schema}"),
    )
    remaining = self.pl_utils.except_df(
        df1.as_pandas(), df2.as_pandas(), unique=distinct
    )
    return PandasDataFrame(
        remaining.reset_index(drop=True), df1.schema, metadata
    )
def dropna(
    self,
    df: DataFrame,
    how: str = "any",
    thresh: Optional[int] = None,
    subset: Optional[List[str]] = None,
    metadata: Any = None,
) -> DataFrame:
    """Drop rows of ``df`` containing NA values.

    :param df: input dataframe
    :param how: "any" drops rows with any NA; "all" only all-NA rows
    :param thresh: keep rows with at least this many non-NA values
    :param subset: columns to consider; None means all columns
    :param metadata: metadata attached to the output dataframe
    :return: dataframe with NA rows dropped, schema unchanged
    """
    # delegate to pandas row-wise dropna, then re-wrap with a fresh index
    d = df.as_pandas().dropna(axis=0, how=how, thresh=thresh, subset=subset, inplace=False)
    return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
def sample(
    self,
    df: DataFrame,
    n: Optional[int] = None,
    frac: Optional[float] = None,
    replace: bool = False,
    seed: Optional[int] = None,
    metadata: Any = None,
) -> DataFrame:
    """Randomly sample rows of ``df``.

    :param df: input dataframe
    :param n: exact number of rows to sample (mutually exclusive with frac)
    :param frac: fraction of rows to sample (mutually exclusive with n)
    :param replace: whether to sample with replacement
    :param seed: random seed for reproducibility
    :param metadata: metadata attached to the output dataframe
    :return: the sampled dataframe, schema unchanged
    """
    # exactly one of n / frac must be provided
    assert_or_throw(
        (n is None) != (frac is None),
        ValueError("one and only one of n and frac should be set"),
    )
    sampled = df.as_pandas().sample(
        n=n, frac=frac, replace=replace, random_state=seed
    )
    return PandasDataFrame(sampled.reset_index(drop=True), df.schema, metadata)
def repartition(self, df: DataFrame, partition_spec: PartitionSpec) -> DaskDataFrame:
    """Repartition a dask dataframe according to ``partition_spec``.

    Only a plain partition count is honored; an empty spec or a spec with
    partition-by columns returns the dataframe unchanged.
    """
    df = self.to_df(df)
    # nothing to do for an empty spec or column-based partitioning
    if partition_spec.empty or len(partition_spec.partition_by) > 0:
        return df
    num = partition_spec.get_num_partitions(
        **{
            KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
            KEYWORD_CORECOUNT: lambda: 2,  # TODO: remove this hard code
        }
    )
    if num <= 0:
        return df
    return DaskDataFrame(
        df.native.repartition(npartitions=num),
        schema=df.schema,
        metadata=df.metadata,
        type_safe=False,
    )
def take(
    self,
    df: DataFrame,
    n: int,
    presort: str,
    na_position: str = "last",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    metadata: Any = None,
) -> DataFrame:
    """Take the first ``n`` rows, per partition when partitioning is set.

    :param df: input dataframe
    :param n: number of rows to take (overall, or per partition)
    :param presort: presort expression; overrides the spec's presort
    :param na_position: where NaN sort keys go ("first" or "last")
    :param partition_spec: optional partitioning/presort specification
    :param metadata: metadata attached to the output dataframe
    :return: the selected rows with ``df``'s schema
    :raises ValueError: if ``n`` is not an integer
    """
    assert_or_throw(
        isinstance(n, int),
        ValueError("n needs to be an integer"),
    )
    pdf = df.as_pandas()
    # Use presort over partition_spec.presort if possible
    parsed = parse_presort_exp(presort) if presort else presort
    _presort: IndexedOrderedDict = parsed or partition_spec.presort
    if len(_presort.keys()) > 0:
        pdf = pdf.sort_values(
            list(_presort.keys()),
            ascending=list(_presort.values()),
            na_position=na_position,
        )
    if len(partition_spec.partition_by) == 0:
        pdf = pdf.head(n)
    else:
        pdf = pdf.groupby(by=partition_spec.partition_by, dropna=False).head(n)
    return PandasDataFrame(
        pdf.reset_index(drop=True), df.schema, metadata, pandas_df_wrapper=True
    )
def to_input_data(self, df: DataFrame) -> Iterable[Dict[str, Any]]:
    """Expose ``df`` to the user function as an iterable of dict rows."""
    rows = df.as_dict_iterable()
    return rows
def to_input_data(self, df: DataFrame) -> EmptyAwareIterable[List[Any]]:
    """Expose ``df`` as an empty-aware iterable of type-safe array rows."""
    rows = df.as_array_iterable(type_safe=True)
    return make_empty_aware(rows)
def to_input_data(self, df: DataFrame) -> Iterable[List[Any]]:
    """Expose ``df`` as an iterable of type-safe array rows."""
    rows = df.as_array_iterable(type_safe=True)
    return rows
def to_input_data(self, df: DataFrame) -> List[List[Any]]:
    """Expose ``df`` as a fully materialized, type-safe list of rows."""
    rows = df.as_array(type_safe=True)
    return rows
def count(self, df: DataFrame) -> int:
    """Return the number of rows in ``df``.

    Unbounded dataframes are counted by consuming their row iterator.
    """
    if not df.is_bounded:
        # no cheap count available; iterate and tally
        return sum(1 for _ in df.as_array_iterable())
    return df.count()
def t3(df1: DataFrame, df2: DataFrame, a) -> DataFrame:
    """Return a one-cell dataframe holding df1.count() + df2.count() + a."""
    total = df1.count() + df2.count() + a
    return ArrayDataFrame([[total]], "a:int")
def __init__(self, df=None, schema=None, metadata=None):
    """Initialize with an optional dataframe, schema, and metadata.

    NOTE(review): deliberate double initialization — the regular base chain
    is initialized via ``super()``, then ``DataFrame.__init__`` is invoked
    explicitly with a lazy schema factory. Presumably ``DataFrame`` is not
    reached by the ``super()`` call here (or needs the lazy-schema form) —
    confirm against the enclosing class's bases before restructuring.
    """
    super().__init__(df=df, schema=schema, metadata=metadata)
    DataFrame.__init__(self, lambda: Schema(schema))
def to_input_data(self, df: DataFrame) -> Iterable[pd.DataFrame]:
    """Yield ``df`` as one or more pandas dataframes.

    An iterable dataframe yields each member converted to pandas; any
    other dataframe is yielded as a single pandas conversion.
    """
    if isinstance(df, LocalDataFrameIterableDataFrame):
        for member in df.native:
            yield member.as_pandas()
    else:
        yield df.as_pandas()
def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    """Join two dataframes using pandas ``merge`` semantics.

    :param df1: left dataframe
    :param df2: right dataframe
    :param how: join type; normalized by lowercasing and stripping spaces
        and underscores, so e.g. ``left_outer`` equals ``leftouter``.
        Supports cross, semi/leftsemi, anti/leftanti, leftouter,
        rightouter, fullouter, plus whatever pandas ``merge`` accepts.
    :param on: join keys; defaults to the engine's default join keys
    :param metadata: metadata attached to the output dataframe
    :return: joined dataframe with the derived output schema
    """
    key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
    how = how.lower().replace("_", "").replace(" ", "")
    if how == "cross":
        # emulate a cross join by merging on a constant helper column
        d1 = df1.as_pandas()
        d2 = df2.as_pandas()
        # NOTE(review): assigning onto the frames from as_pandas(); if
        # as_pandas returns the underlying frame rather than a copy this
        # mutates the inputs — confirm as_pandas's copy semantics.
        d1["__cross_join_index__"] = 1
        d2["__cross_join_index__"] = 1
        d = d1.merge(d2, on=("__cross_join_index__")).drop(
            "__cross_join_index__", axis=1)
        return PandasDataFrame(d.reset_index(drop=True), output_schema, metadata)
    if how in ["semi", "leftsemi"]:
        # left semi: df1 rows having a key match in df2 (inner on keys only)
        d1 = df1.as_pandas()
        d2 = df2.as_pandas()[key_schema.names]
        d = d1.merge(d2, on=key_schema.names, how="inner")
        return PandasDataFrame(d.reset_index(drop=True), output_schema, metadata)
    if how in ["anti", "leftanti"]:
        # left anti: df1 rows with no key match in df2; unmatched rows are
        # detected via a dummy column that stays NaN after a left merge
        d1 = df1.as_pandas()
        d2 = df2.as_pandas()[key_schema.names]
        d2["__anti_join_dummy__"] = 1.0
        d = d1.merge(d2, on=key_schema.names, how="left")
        d = d[d.iloc[:, -1].isnull()]
        return PandasDataFrame(
            d.drop(["__anti_join_dummy__"], axis=1).reset_index(drop=True),
            output_schema,
            metadata,
        )
    # outer joins may introduce NaN into columns coming from the side that
    # can have unmatched rows; those columns are repaired afterwards
    fix_left, fix_right = False, False
    if how in ["leftouter"]:
        how = "left"
        self._validate_outer_joinable(df2.schema, key_schema)
        fix_right = True
    if how in ["rightouter"]:
        how = "right"
        self._validate_outer_joinable(df1.schema, key_schema)
        fix_left = True
    if how in ["fullouter"]:
        how = "outer"
        self._validate_outer_joinable(df1.schema, key_schema)
        self._validate_outer_joinable(df2.schema, key_schema)
        fix_left, fix_right = True, True
    d1 = df1.as_pandas()
    d2 = df2.as_pandas()
    d = d1.merge(d2, on=key_schema.names, how=how)
    if fix_left:
        # repair columns that exist only in df1
        d = self._fix_nan(
            d, output_schema, df1.schema.exclude(list(df2.schema.keys())).keys())
    if fix_right:
        # repair columns that exist only in df2
        d = self._fix_nan(
            d, output_schema, df2.schema.exclude(list(df1.schema.keys())).keys())
    return PandasDataFrame(d.reset_index(drop=True), output_schema, metadata)
def my_show(df: DataFrame) -> DataFrame:
    """Display ``df`` and pass it through unchanged."""
    df.show()
    return df
def to_input_data(self, df: DataFrame) -> EmptyAwareIterable[Dict[str, Any]]:
    """Expose ``df`` as an empty-aware iterable of dict rows."""
    rows = df.as_dict_iterable()
    return make_empty_aware(rows)
def to_input_data(self, df: DataFrame) -> pd.DataFrame:
    """Expose ``df`` to the user function as a pandas dataframe."""
    converted = df.as_pandas()
    return converted
def _persist_and_count(df: DataFrame) -> int:
    """Persist ``df`` on the enclosing engine, then return its row count."""
    persisted = self.persist(df)
    return persisted.count()
def f25(e: DataFrame, a: LocalDataFrame) -> List[Dict[str, Any]]:
    """Concatenate the rows of ``e`` and ``a`` and return them as dicts
    under schema ``a:int``."""
    rows = e.as_array()
    rows += list(a.as_array())
    combined = ArrayDataFrame(rows, "a:int")
    return list(combined.as_dict_iterable())
def t3(df1: DataFrame, df2: DataFrame, a, b) -> None:
    """Write df1.count() + df2.count() + a into ``b.value`` (output param)."""
    counts = df1.count() + df2.count()
    b.value = counts + a