def _is_monotonic_decreasing(self) -> Series: window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1) cond = SF.lit(True) has_not_null = SF.lit(True) for scol in self._internal.index_spark_columns[::-1]: data_type = self._internal.spark_type_for(scol) prev = F.lag(scol, 1).over(window) compare = MultiIndex._comparator_for_monotonic_increasing(data_type) # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex. # Therefore, we should check `has_not_null` over the all levels. has_not_null = has_not_null & scol.isNotNull() cond = F.when(scol.eqNullSafe(prev), cond).otherwise(compare(scol, prev, Column.__lt__)) cond = has_not_null & (prev.isNull() | cond) cond_name = verify_temp_column_name( self._internal.spark_frame.select(self._internal.index_spark_columns), "__is_monotonic_decreasing_cond__", ) sdf = self._internal.spark_frame.select( self._internal.index_spark_columns + [cond.alias(cond_name)] ) internal = InternalFrame( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, col) for col in self._internal.index_spark_column_names ], index_names=self._internal.index_names, index_fields=self._internal.index_fields, ) return first_series(DataFrame(internal))
def resolve(internal: InternalFrame, side: str) -> InternalFrame: rename = lambda col: "__{}_{}".format(side, col) internal = internal.resolved_copy sdf = internal.spark_frame sdf = internal.spark_frame.select( *[ scol_for(sdf, col).alias(rename(col)) for col in sdf.columns if col not in HIDDEN_COLUMNS ], *HIDDEN_COLUMNS ) return internal.copy( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, rename(col)) for col in internal.index_spark_column_names ], index_fields=[ field.copy(name=rename(field.name)) for field in internal.index_fields ], data_spark_columns=[ scol_for(sdf, rename(col)) for col in internal.data_spark_column_names ], data_fields=[field.copy(name=rename(field.name)) for field in internal.data_fields], )
def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIndex": """ Form the intersection of two Index objects. This returns a new Index with elements common to the index and `other`. Parameters ---------- other : Index or array-like Returns ------- intersection : MultiIndex Examples -------- >>> midx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) >>> midx2 = ps.MultiIndex.from_tuples([("c", "z"), ("d", "w")]) >>> midx1.intersection(midx2).sort_values() # doctest: +SKIP MultiIndex([('c', 'z')], ) """ if isinstance(other, Series) or not is_list_like(other): raise TypeError("other must be a MultiIndex or a list of tuples") elif isinstance(other, DataFrame): raise ValueError("Index data must be 1-dimensional") elif isinstance(other, MultiIndex): spark_frame_other = other.to_frame().to_spark() keep_name = self.names == other.names elif isinstance(other, Index): # Always returns an empty MultiIndex if `other` is Index. return cast(MultiIndex, self.to_frame().head(0).index) elif not all(isinstance(item, tuple) for item in other): raise TypeError("other must be a MultiIndex or a list of tuples") else: other = MultiIndex.from_tuples(list(other)) spark_frame_other = cast(MultiIndex, other).to_frame().to_spark() keep_name = True index_fields = self._index_fields_for_union_like(other, func_name="intersection") default_name: List[Name] = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)] spark_frame_self = self.to_frame(name=default_name).to_spark() spark_frame_intersected = spark_frame_self.intersect(spark_frame_other) if keep_name: index_names = self._internal.index_names else: index_names = None internal = InternalFrame( spark_frame=spark_frame_intersected, index_spark_columns=[ scol_for(spark_frame_intersected, cast(str, col)) for col in default_name ], index_names=index_names, index_fields=index_fields, ) return cast(MultiIndex, DataFrame(internal).index)
def execute(self, index_col: Optional[Union[str, List[str]]]) -> DataFrame: """ Returns a DataFrame for which the SQL statement has been executed by the underlying SQL engine. >>> from pyspark.pandas import sql_processor >>> # we will call 'sql_processor' directly in doctests so decrease one level. >>> sql_processor._CAPTURE_SCOPES = 2 >>> sql = sql_processor.sql >>> str0 = 'abc' >>> sql("select {str0}") abc 0 abc >>> str1 = 'abc"abc' >>> str2 = "abc'abc" >>> sql("select {str0}, {str1}, {str2}") abc abc"abc abc'abc 0 abc abc"abc abc'abc >>> strs = ['a', 'b'] >>> sql("select 'a' in {strs} as cond1, 'c' in {strs} as cond2") cond1 cond2 0 True False """ blocks = _string.formatter_parser(self._statement) # TODO: use a string builder res = "" try: for (pre, inner, _, _) in blocks: var_next = "" if inner is None else self._convert(inner) res = res + pre + var_next self._normalized_statement = res sdf = self._session.sql(self._normalized_statement) finally: for v in self._temp_views: self._session.catalog.dropTempView(v) index_spark_columns, index_names = _get_index_map(sdf, index_col) return DataFrame( InternalFrame(spark_frame=sdf, index_spark_columns=index_spark_columns, index_names=index_names))
def sql( query: str, index_col: Optional[Union[str, List[str]]] = None, **kwargs: Any, ) -> DataFrame: """ Execute a SQL query and return the result as a pandas-on-Spark DataFrame. This function acts as a standard Python string formatter with understanding the following variable types: * pandas-on-Spark DataFrame * pandas-on-Spark Series * pandas DataFrame * pandas Series * string Parameters ---------- query : str the SQL query index_col : str or list of str, optional Column names to be used in Spark to represent pandas-on-Spark's index. The index name in pandas-on-Spark is ignored. By default, the index is always lost. .. note:: If you want to preserve the index, explicitly use :func:`DataFrame.reset_index`, and pass it to the sql statement with `index_col` parameter. For example, >>> psdf = ps.DataFrame({"A": [1, 2, 3], "B":[4, 5, 6]}, index=['a', 'b', 'c']) >>> new_psdf = psdf.reset_index() >>> ps.sql("SELECT * FROM {new_psdf}", index_col="index", new_psdf=new_psdf) ... # doctest: +NORMALIZE_WHITESPACE A B index a 1 4 b 2 5 c 3 6 For MultiIndex, >>> psdf = ps.DataFrame( ... {"A": [1, 2, 3], "B": [4, 5, 6]}, ... index=pd.MultiIndex.from_tuples( ... [("a", "b"), ("c", "d"), ("e", "f")], names=["index1", "index2"] ... ), ... ) >>> new_psdf = psdf.reset_index() >>> ps.sql( ... "SELECT * FROM {new_psdf}", index_col=["index1", "index2"], new_psdf=new_psdf) ... # doctest: +NORMALIZE_WHITESPACE A B index1 index2 a b 1 4 c d 2 5 e f 3 6 Also note that the index name(s) should be matched to the existing name. kwargs other variables that the user want to set that can be referenced in the query Returns ------- pandas-on-Spark DataFrame Examples -------- Calling a built-in SQL function. >>> ps.sql("SELECT * FROM range(10) where id > 7") id 0 8 1 9 >>> ps.sql("SELECT * FROM range(10) WHERE id > {bound1} AND id < {bound2}", bound1=7, bound2=9) id 0 8 >>> mydf = ps.range(10) >>> x = tuple(range(4)) >>> ps.sql("SELECT {ser} FROM {mydf} WHERE id IN {x}", ser=mydf.id, mydf=mydf, x=x) id 0 0 1 1 2 2 3 3 Mixing pandas-on-Spark and pandas DataFrames in a join operation. Note that the index is dropped. >>> ps.sql(''' ... SELECT m1.a, m2.b ... FROM {table1} m1 INNER JOIN {table2} m2 ... ON m1.key = m2.key ... ORDER BY m1.a, m2.b''', ... table1=ps.DataFrame({"a": [1,2], "key": ["a", "b"]}), ... table2=pd.DataFrame({"b": [3,4,5], "key": ["a", "b", "b"]})) a b 0 1 3 1 2 4 2 2 5 Also, it is possible to query using Series. >>> psdf = ps.DataFrame({"A": [1, 2, 3], "B":[4, 5, 6]}, index=['a', 'b', 'c']) >>> ps.sql("SELECT {mydf.A} FROM {mydf}", mydf=psdf) A 0 1 1 2 2 3 """ if os.environ.get("PYSPARK_PANDAS_SQL_LEGACY") == "1": from pyspark.pandas import sql_processor warnings.warn( "Deprecated in 3.3.0, and the legacy behavior " "will be removed in the future releases.", FutureWarning, ) return sql_processor.sql(query, index_col=index_col, **kwargs) session = default_session() formatter = PandasSQLStringFormatter(session) try: sdf = session.sql(formatter.format(query, **kwargs)) finally: formatter.clear() index_spark_columns, index_names = _get_index_map(sdf, index_col) return DataFrame( InternalFrame(spark_frame=sdf, index_spark_columns=index_spark_columns, index_names=index_names))
def attach_id_column(self, id_type: str, column: Name) -> "DataFrame": """ Attach a column to be used as identifier of rows similar to the default index. See also `Default Index type <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_. Parameters ---------- id_type : string The id type. - 'sequence' : a sequence that increases one by one. .. note:: this uses Spark's Window without specifying partition specification. This leads to move all data into single partition in single machine and could cause serious performance degradation. Avoid this method against very large dataset. - 'distributed-sequence' : a sequence that increases one by one, by group-by and group-map approach in a distributed manner. - 'distributed' : a monotonically increasing sequence simply by using PySpark’s monotonically_increasing_id function in a fully distributed manner. column : string or tuple of string The column name. Returns ------- DataFrame The DataFrame attached the column. Examples -------- >>> df = ps.DataFrame({"x": ['a', 'b', 'c']}) >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column="id") x id 0 a 0 1 b 1 2 c 2 >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=0) x 0 0 a 0 1 b 1 2 c 2 >>> df.pandas_on_spark.attach_id_column(id_type="distributed", column=0.0) ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE x 0.0 0 a ... 1 b ... 2 c ... For multi-index columns: >>> df = ps.DataFrame({("x", "y"): ['a', 'b', 'c']}) >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column=("id-x", "id-y")) x id-x y id-y 0 a 0 1 b 1 2 c 2 >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=(0, 1.0)) x 0 y 1.0 0 a 0 1 b 1 2 c 2 """ from pyspark.pandas.frame import DataFrame if id_type == "sequence": attach_func = InternalFrame.attach_sequence_column elif id_type == "distributed-sequence": attach_func = InternalFrame.attach_distributed_sequence_column elif id_type == "distributed": attach_func = InternalFrame.attach_distributed_column else: raise ValueError( "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'" ) assert is_name_like_value(column, allow_none=False), column if not is_name_like_tuple(column): column = (column,) internal = self._psdf._internal if len(column) != internal.column_labels_level: raise ValueError( "The given column `{}` must be the same length as the existing columns.".format( column ) ) elif column in internal.column_labels: raise ValueError( "The given column `{}` already exists.".format(name_like_string(column)) ) # Make sure the underlying Spark column names are the form of # `name_like_string(column_label)`. sdf = internal.spark_frame.select( [ scol.alias(SPARK_INDEX_NAME_FORMAT(i)) for i, scol in enumerate(internal.index_spark_columns) ] + [ scol.alias(name_like_string(label)) for scol, label in zip(internal.data_spark_columns, internal.column_labels) ] ) sdf = attach_func(sdf, name_like_string(column)) return DataFrame( InternalFrame( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) for i in range(internal.index_level) ], index_names=internal.index_names, index_fields=internal.index_fields, column_labels=internal.column_labels + [column], data_spark_columns=( [scol_for(sdf, name_like_string(label)) for label in internal.column_labels] + [scol_for(sdf, name_like_string(column))] ), data_fields=internal.data_fields + [ InternalField.from_struct_field( StructField(name_like_string(column), LongType(), nullable=False) ) ], column_label_names=internal.column_label_names, ).resolved_copy )
def transform_batch( self, func: Callable[..., Union[pd.DataFrame, pd.Series]], *args: Any, **kwargs: Any ) -> DataFrameOrSeries: """ Transform chunks with a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas DataFrame given to the function is of a batch used internally. The length of each input and output should be the same. See also `Transform and apply a function <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_. .. note:: the `func` is unable to access to the whole input frame. pandas-on-Spark internally splits the input series into multiple batches and calls `func` with each batch multiple times. Therefore, operations such as global aggregations are impossible. See the example below. >>> # This case does not return the length of whole frame but of the batch internally ... # used. ... def length(pdf) -> ps.DataFrame[int]: ... return pd.DataFrame([len(pdf)] * len(pdf)) ... >>> df = ps.DataFrame({'A': range(1000)}) >>> df.pandas_on_spark.transform_batch(length) # doctest: +SKIP c0 0 83 1 83 2 83 ... .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify return type in ``func``, for instance, as below: >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]: ... return x + 1 If the return type is specified, the output column names become `c0, c1, c2 ... cn`. These names are positionally mapped to the returned DataFrame in ``func``. To specify the column names, you can assign them in a NumPy compound type style as below: >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]: ... return x + 1 >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) >>> def plus_one(x) -> ps.DataFrame[ ... (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]: ... return x + 1 Parameters ---------- func : function Function to transform each pandas frame. *args Positional arguments to pass to func. **kwargs Keyword arguments to pass to func. Returns ------- DataFrame or Series See Also -------- DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations. Series.pandas_on_spark.transform_batch: transform the search as each pandas chunks. Examples -------- >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B']) >>> df A B 0 1 2 1 3 4 2 5 6 >>> def plus_one_func(pdf) -> ps.DataFrame[int, [int, int]]: ... return pdf + 1 >>> df.pandas_on_spark.transform_batch(plus_one_func) c0 c1 0 2 3 1 4 5 2 6 7 >>> def plus_one_func(pdf) -> ps.DataFrame[("index", int), [('A', int), ('B', int)]]: ... return pdf + 1 >>> df.pandas_on_spark.transform_batch(plus_one_func) # doctest: +NORMALIZE_WHITESPACE A B index 0 2 3 1 4 5 2 6 7 >>> def plus_one_func(pdf) -> ps.Series[int]: ... return pdf.B + 1 >>> df.pandas_on_spark.transform_batch(plus_one_func) 0 3 1 5 2 7 dtype: int64 You can also omit the type hints so pandas-on-Spark infers the return schema as below: >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1) A B 0 2 3 1 4 5 2 6 7 >>> (df * -1).pandas_on_spark.transform_batch(abs) A B 0 1 2 1 3 4 2 5 6 Note that you should not transform the index. The index information will not change. >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1) 0 3 1 5 2 7 Name: B, dtype: int64 You can also specify extra arguments as below. >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3) 0 8 1 10 2 12 Name: B, dtype: int64 """ from pyspark.pandas.groupby import GroupBy from pyspark.pandas.frame import DataFrame from pyspark.pandas.series import first_series from pyspark import pandas as ps assert callable(func), "the first argument should be a callable function." spec = inspect.getfullargspec(func) return_sig = spec.annotations.get("return", None) should_infer_schema = return_sig is None should_retain_index = should_infer_schema original_func = func func = lambda o: original_func(o, *args, **kwargs) def apply_func(pdf: pd.DataFrame) -> pd.DataFrame: return func(pdf).to_frame() def pandas_series_func( f: Callable[[pd.DataFrame], pd.DataFrame], return_type: DataType ) -> "UserDefinedFunctionLike": ff = f @pandas_udf(returnType=return_type) # type: ignore[call-overload] def udf(pdf: pd.DataFrame) -> pd.Series: return first_series(ff(pdf)) return udf if should_infer_schema: # Here we execute with the first 1000 to get the return type. # If the records were less than 1000, it uses pandas API directly for a shortcut. log_advice( "If the type hints is not specified for `transform_batch`, " "it is expensive to infer the data type internally." ) limit = ps.get_option("compute.shortcut_limit") pdf = self._psdf.head(limit + 1)._to_internal_pandas() transformed = func(pdf) if not isinstance(transformed, (pd.DataFrame, pd.Series)): raise ValueError( "The given function should return a frame; however, " "the return type was %s." % type(transformed) ) if len(transformed) != len(pdf): raise ValueError("transform_batch cannot produce aggregated results") psdf_or_psser = ps.from_pandas(transformed) if isinstance(psdf_or_psser, ps.Series): psser = cast(ps.Series, psdf_or_psser) field = psser._internal.data_fields[0].normalize_spark_type() return_schema = StructType([field.struct_field]) output_func = GroupBy._make_pandas_df_builder_func( self._psdf, apply_func, return_schema, retain_index=False ) pudf = pandas_series_func(output_func, return_type=field.spark_type) columns = self._psdf._internal.spark_columns # TODO: Index will be lost in this case. internal = self._psdf._internal.copy( column_labels=psser._internal.column_labels, data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)], data_fields=[field], column_label_names=psser._internal.column_label_names, ) return first_series(DataFrame(internal)) else: psdf = cast(DataFrame, psdf_or_psser) if len(pdf) <= limit: # only do the short cut when it returns a frame to avoid # operations on different dataframes in case of series. return psdf index_fields = [ field.normalize_spark_type() for field in psdf._internal.index_fields ] data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields] return_schema = StructType( [field.struct_field for field in index_fields + data_fields] ) self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy) output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=True # type: ignore[arg-type] ) columns = self_applied._internal.spark_columns pudf = pandas_udf( # type: ignore[call-overload] output_func, returnType=return_schema ) temp_struct_column = verify_temp_column_name( self_applied._internal.spark_frame, "__temp_struct__" ) applied = pudf(F.struct(*columns)).alias(temp_struct_column) sdf = self_applied._internal.spark_frame.select(applied) sdf = sdf.selectExpr("%s.*" % temp_struct_column) return DataFrame( psdf._internal.with_new_sdf( spark_frame=sdf, index_fields=index_fields, data_fields=data_fields ) ) else: return_type = infer_return_type(original_func) is_return_series = isinstance(return_type, SeriesType) is_return_dataframe = isinstance(return_type, DataFrameType) if not is_return_dataframe and not is_return_series: raise TypeError( "The given function should specify a frame or series as its type " "hints; however, the return type was %s." % return_sig ) if is_return_series: field = InternalField( dtype=cast(SeriesType, return_type).dtype, struct_field=StructField( name=SPARK_DEFAULT_SERIES_NAME, dataType=cast(SeriesType, return_type).spark_type, ), ).normalize_spark_type() return_schema = StructType([field.struct_field]) output_func = GroupBy._make_pandas_df_builder_func( self._psdf, apply_func, return_schema, retain_index=False ) pudf = pandas_series_func(output_func, return_type=field.spark_type) columns = self._psdf._internal.spark_columns internal = self._psdf._internal.copy( column_labels=[None], data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)], data_fields=[field], column_label_names=None, ) return first_series(DataFrame(internal)) else: index_fields = cast(DataFrameType, return_type).index_fields index_fields = [index_field.normalize_spark_type() for index_field in index_fields] data_fields = [ field.normalize_spark_type() for field in cast(DataFrameType, return_type).data_fields ] normalized_fields = index_fields + data_fields return_schema = StructType([field.struct_field for field in normalized_fields]) should_retain_index = len(index_fields) > 0 self_applied = DataFrame(self._psdf._internal.resolved_copy) output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=should_retain_index # type: ignore[arg-type] ) columns = self_applied._internal.spark_columns pudf = pandas_udf( # type: ignore[call-overload] output_func, returnType=return_schema ) temp_struct_column = verify_temp_column_name( self_applied._internal.spark_frame, "__temp_struct__" ) applied = pudf(F.struct(*columns)).alias(temp_struct_column) sdf = self_applied._internal.spark_frame.select(applied) sdf = sdf.selectExpr("%s.*" % temp_struct_column) index_spark_columns = None index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None if should_retain_index: index_spark_columns = [ scol_for(sdf, index_field.struct_field.name) for index_field in index_fields ] if not any( [ SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name) for index_field in index_fields ] ): index_names = [ (index_field.struct_field.name,) for index_field in index_fields ] internal = InternalFrame( spark_frame=sdf, index_names=index_names, index_spark_columns=index_spark_columns, index_fields=index_fields, data_fields=data_fields, ) return DataFrame(internal)
def apply_batch( self, func: Callable[..., pd.DataFrame], args: Tuple = (), **kwds: Any ) -> "DataFrame": """ Apply a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas DataFrame given to the function is of a batch used internally. See also `Transform and apply a function <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_. .. note:: the `func` is unable to access to the whole input frame. pandas-on-Spark internally splits the input series into multiple batches and calls `func` with each batch multiple times. Therefore, operations such as global aggregations are impossible. See the example below. >>> # This case does not return the length of whole frame but of the batch internally ... # used. ... def length(pdf) -> ps.DataFrame[int, [int]]: ... return pd.DataFrame([len(pdf)]) ... >>> df = ps.DataFrame({'A': range(1000)}) >>> df.pandas_on_spark.apply_batch(length) # doctest: +SKIP c0 0 83 1 83 2 83 ... 10 83 11 83 .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify return type in ``func``, for instance, as below: >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]: ... return x + 1 If the return type is specified, the output column names become `c0, c1, c2 ... cn`. These names are positionally mapped to the returned DataFrame in ``func``. To specify the column names, you can assign them in a NumPy compound type style as below: >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]: ... return x + 1 >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) >>> def plus_one(x) -> ps.DataFrame[ ... (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]: ... return x + 1 Parameters ---------- func : function Function to apply to each pandas frame. args : tuple Positional arguments to pass to `func` in addition to the array/series. **kwds Additional keyword arguments to pass as keywords arguments to `func`. Returns ------- DataFrame See Also -------- DataFrame.apply: For row/columnwise operations. DataFrame.applymap: For elementwise operations. DataFrame.aggregate: Only perform aggregating type operations. DataFrame.transform: Only perform transforming type operations. Series.pandas_on_spark.transform_batch: transform the search as each pandas chunks. Examples -------- >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B']) >>> df A B 0 1 2 1 3 4 2 5 6 >>> def query_func(pdf) -> ps.DataFrame[int, [int, int]]: ... return pdf.query('A == 1') >>> df.pandas_on_spark.apply_batch(query_func) c0 c1 0 1 2 >>> def query_func(pdf) -> ps.DataFrame[("idx", int), [("A", int), ("B", int)]]: ... return pdf.query('A == 1') >>> df.pandas_on_spark.apply_batch(query_func) # doctest: +NORMALIZE_WHITESPACE A B idx 0 1 2 You can also omit the type hints so pandas-on-Spark infers the return schema as below: >>> df.pandas_on_spark.apply_batch(lambda pdf: pdf.query('A == 1')) A B 0 1 2 You can also specify extra arguments. >>> def calculation(pdf, y, z) -> ps.DataFrame[int, [int, int]]: ... return pdf ** y + z >>> df.pandas_on_spark.apply_batch(calculation, args=(10,), z=20) c0 c1 0 21 1044 1 59069 1048596 2 9765645 60466196 You can also use ``np.ufunc`` and built-in functions as input. >>> df.pandas_on_spark.apply_batch(np.add, args=(10,)) A B 0 11 12 1 13 14 2 15 16 >>> (df * -1).pandas_on_spark.apply_batch(abs) A B 0 1 2 1 3 4 2 5 6 """ # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate? from pyspark.pandas.groupby import GroupBy from pyspark.pandas.frame import DataFrame from pyspark import pandas as ps if not isinstance(func, FunctionType): assert callable(func), "the first argument should be a callable function." f = func func = lambda *args, **kwargs: f(*args, **kwargs) spec = inspect.getfullargspec(func) return_sig = spec.annotations.get("return", None) should_infer_schema = return_sig is None original_func = func func = lambda o: original_func(o, *args, **kwds) self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy) if should_infer_schema: # Here we execute with the first 1000 to get the return type. # If the records were less than 1000, it uses pandas API directly for a shortcut. log_advice( "If the type hints is not specified for `apply_batch`, " "it is expensive to infer the data type internally." ) limit = ps.get_option("compute.shortcut_limit") pdf = self_applied.head(limit + 1)._to_internal_pandas() applied = func(pdf) if not isinstance(applied, pd.DataFrame): raise ValueError( "The given function should return a frame; however, " "the return type was %s." % type(applied) ) psdf: DataFrame = DataFrame(applied) if len(pdf) <= limit: return psdf index_fields = [field.normalize_spark_type() for field in psdf._internal.index_fields] data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields] return_schema = StructType([field.struct_field for field in index_fields + data_fields]) output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=True ) sdf = self_applied._internal.spark_frame.mapInPandas( lambda iterator: map(output_func, iterator), schema=return_schema ) # If schema is inferred, we can restore indexes too. internal = psdf._internal.with_new_sdf( spark_frame=sdf, index_fields=index_fields, data_fields=data_fields ) else: return_type = infer_return_type(original_func) is_return_dataframe = isinstance(return_type, DataFrameType) if not is_return_dataframe: raise TypeError( "The given function should specify a frame as its type " "hints; however, the return type was %s." % return_sig ) index_fields = cast(DataFrameType, return_type).index_fields should_retain_index = len(index_fields) > 0 return_schema = cast(DataFrameType, return_type).spark_type output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=should_retain_index ) sdf = self_applied._internal.to_internal_spark_frame.mapInPandas( lambda iterator: map(output_func, iterator), schema=return_schema ) index_spark_columns = None index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None if should_retain_index: index_spark_columns = [ scol_for(sdf, index_field.struct_field.name) for index_field in index_fields ] if not any( [ SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name) for index_field in index_fields ] ): index_names = [(index_field.struct_field.name,) for index_field in index_fields] internal = InternalFrame( spark_frame=sdf, index_names=index_names, index_spark_columns=index_spark_columns, index_fields=index_fields, data_fields=cast(DataFrameType, return_type).data_fields, ) return DataFrame(internal)
def drop(self, codes: List[Any], level: Optional[Union[int, Name]] = None) -> "MultiIndex": """ Make new MultiIndex with passed list of labels deleted Parameters ---------- codes : array-like Must be a list of tuples level : int or level name, default None Returns ------- dropped : MultiIndex Examples -------- >>> index = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]) >>> index # doctest: +SKIP MultiIndex([('a', 'x'), ('b', 'y'), ('c', 'z')], ) >>> index.drop(['a']) # doctest: +SKIP MultiIndex([('b', 'y'), ('c', 'z')], ) >>> index.drop(['x', 'y'], level=1) # doctest: +SKIP MultiIndex([('c', 'z')], ) """ internal = self._internal.resolved_copy sdf = internal.spark_frame index_scols = internal.index_spark_columns if level is None: scol = index_scols[0] elif isinstance(level, int): scol = index_scols[level] else: scol = None for index_spark_column, index_name in zip( internal.index_spark_columns, internal.index_names): if not isinstance(level, tuple): level = (level, ) if level == index_name: if scol is not None: raise ValueError( "The name {} occurs multiple times, use a level number" .format(name_like_string(level))) scol = index_spark_column if scol is None: raise KeyError("Level {} not found".format( name_like_string(level))) sdf = sdf[~scol.isin(codes)] internal = InternalFrame( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, col) for col in internal.index_spark_column_names ], index_names=internal.index_names, index_fields=internal.index_fields, column_labels=[], data_spark_columns=[], data_fields=[], ) return cast(MultiIndex, DataFrame(internal).index)
def test_from_pandas(self): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) internal = InternalFrame.from_pandas(pdf) sdf = internal.spark_frame self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME]) self.assert_eq(internal.index_names, [None]) self.assert_eq(internal.column_labels, [("a", ), ("b", )]) self.assert_eq(internal.data_spark_column_names, ["a", "b"]) self.assertTrue( spark_column_equals(internal.spark_column_for(("a", )), sdf["a"])) self.assertTrue( spark_column_equals(internal.spark_column_for(("b", )), sdf["b"])) self.assert_eq(internal.to_pandas_frame, pdf) # non-string column name pdf1 = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]}) internal = InternalFrame.from_pandas(pdf1) sdf = internal.spark_frame self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME]) self.assert_eq(internal.index_names, [None]) self.assert_eq(internal.column_labels, [(0, ), (1, )]) self.assert_eq(internal.data_spark_column_names, ["0", "1"]) self.assertTrue( spark_column_equals(internal.spark_column_for((0, )), sdf["0"])) self.assertTrue( spark_column_equals(internal.spark_column_for((1, )), sdf["1"])) self.assert_eq(internal.to_pandas_frame, pdf1) # multi-index pdf.set_index("a", append=True, inplace=True) internal = InternalFrame.from_pandas(pdf) sdf = internal.spark_frame self.assert_eq( internal.index_spark_column_names, [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)], ) self.assert_eq(internal.index_names, [None, ("a", )]) self.assert_eq(internal.column_labels, [("b", )]) self.assert_eq(internal.data_spark_column_names, ["b"]) self.assertTrue( spark_column_equals(internal.spark_column_for(("b", )), sdf["b"])) self.assert_eq(internal.to_pandas_frame, pdf) # multi-index columns pdf.columns = pd.MultiIndex.from_tuples([("x", "b")]) internal = InternalFrame.from_pandas(pdf) sdf = internal.spark_frame self.assert_eq( internal.index_spark_column_names, [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)], ) self.assert_eq(internal.index_names, [None, ("a", )]) self.assert_eq(internal.column_labels, [("x", "b")]) self.assert_eq(internal.data_spark_column_names, ["(x, b)"]) self.assertTrue( spark_column_equals(internal.spark_column_for(("x", "b")), sdf["(x, b)"])) self.assert_eq(internal.to_pandas_frame, pdf)
def from_frame(df: DataFrame, names: Optional[List[Name]] = None) -> "MultiIndex": """ Make a MultiIndex from a DataFrame. Parameters ---------- df : DataFrame DataFrame to be converted to MultiIndex. names : list-like, optional If no names are provided, use the column names, or tuple of column names if the columns is a MultiIndex. If a sequence, overwrite names with the given sequence. Returns ------- MultiIndex The MultiIndex representation of the given DataFrame. See Also -------- MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_tuples : Convert list of tuples to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables. Examples -------- >>> df = ps.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], ... ['NJ', 'Temp'], ['NJ', 'Precip']], ... columns=['a', 'b']) >>> df # doctest: +SKIP a b 0 HI Temp 1 HI Precip 2 NJ Temp 3 NJ Precip >>> ps.MultiIndex.from_frame(df) # doctest: +SKIP MultiIndex([('HI', 'Temp'), ('HI', 'Precip'), ('NJ', 'Temp'), ('NJ', 'Precip')], names=['a', 'b']) Using explicit names, instead of the column names >>> ps.MultiIndex.from_frame(df, names=['state', 'observation']) # doctest: +SKIP MultiIndex([('HI', 'Temp'), ('HI', 'Precip'), ('NJ', 'Temp'), ('NJ', 'Precip')], names=['state', 'observation']) """ if not isinstance(df, DataFrame): raise TypeError("Input must be a DataFrame") sdf = df.to_spark() if names is None: names = df._internal.column_labels elif not is_list_like(names): raise TypeError("Names should be list-like for a MultiIndex") else: names = [ name if is_name_like_tuple(name) else (name, ) for name in names ] internal = InternalFrame( spark_frame=sdf, index_spark_columns=[scol_for(sdf, col) for col in sdf.columns], index_names=names, ) return cast(MultiIndex, DataFrame(internal).index)
def insert(self, loc: int, item: Any) -> Index: """ Make new MultiIndex inserting new item at location. Follows Python list.append semantics for negative values. Parameters ---------- loc : int item : object Returns ------- new_index : MultiIndex Examples -------- >>> psmidx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) >>> psmidx.insert(3, ("h", "j")) # doctest: +SKIP MultiIndex([('a', 'x'), ('b', 'y'), ('c', 'z'), ('h', 'j')], ) For negative values >>> psmidx.insert(-2, ("h", "j")) # doctest: +SKIP MultiIndex([('a', 'x'), ('h', 'j'), ('b', 'y'), ('c', 'z')], ) """ length = len(self) if loc < 0: loc = loc + length if loc < 0: raise IndexError( "index {} is out of bounds for axis 0 with size {}".format( (loc - length), length)) else: if loc > length: raise IndexError( "index {} is out of bounds for axis 0 with size {}".format( loc, length)) index_name = [ (name, ) for name in self._internal.index_spark_column_names ] # type: List[Label] sdf_before = self.to_frame(name=index_name)[:loc].to_spark() sdf_middle = Index([item]).to_frame(name=index_name).to_spark() sdf_after = self.to_frame(name=index_name)[loc:].to_spark() sdf = sdf_before.union(sdf_middle).union(sdf_after) internal = InternalFrame( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, col) for col in self._internal.index_spark_column_names ], index_names=self._internal.index_names, index_fields=[ InternalField(field.dtype) for field in self._internal.index_fields ], ) return DataFrame(internal).index
def transform_batch(self, func, *args, **kwargs) -> Union["DataFrame", "Series"]: """ Transform chunks with a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas DataFrame given to the function is of a batch used internally. The length of each input and output should be the same. See also `Transform and apply a function <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_. .. note:: the `func` is unable to access to the whole input frame. pandas-on-Spark internally splits the input series into multiple batches and calls `func` with each batch multiple times. Therefore, operations such as global aggregations are impossible. See the example below. >>> # This case does not return the length of whole frame but of the batch internally ... # used. ... def length(pdf) -> ps.DataFrame[int]: ... return pd.DataFrame([len(pdf)] * len(pdf)) ... >>> df = ps.DataFrame({'A': range(1000)}) >>> df.pandas_on_spark.transform_batch(length) # doctest: +SKIP c0 0 83 1 83 2 83 ... .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify return type in ``func``, for instance, as below: >>> def plus_one(x) -> ps.DataFrame[float, float]: ... return x + 1 If the return type is specified, the output column names become `c0, c1, c2 ... cn`. These names are positionally mapped to the returned DataFrame in ``func``. To specify the column names, you can assign them in a pandas friendly style as below: >>> def plus_one(x) -> ps.DataFrame['a': float, 'b': float]: ... return x + 1 >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) >>> def plus_one(x) -> ps.DataFrame[zip(pdf.dtypes, pdf.columns)]: ... return x + 1 When the given function returns DataFrame and has the return type annotated, the original index of the DataFrame will be lost and then a default index will be attached to the result. Please be careful about configuring the default index. See also `Default Index Type <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_. Parameters ---------- func : function Function to transform each pandas frame. *args Positional arguments to pass to func. **kwargs Keyword arguments to pass to func. Returns ------- DataFrame or Series See Also -------- DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations. Series.pandas_on_spark.transform_batch: transform the search as each pandas chunks. Examples -------- >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B']) >>> df A B 0 1 2 1 3 4 2 5 6 >>> def plus_one_func(pdf) -> ps.DataFrame[int, int]: ... return pdf + 1 >>> df.pandas_on_spark.transform_batch(plus_one_func) c0 c1 0 2 3 1 4 5 2 6 7 >>> def plus_one_func(pdf) -> ps.DataFrame['A': int, 'B': int]: ... return pdf + 1 >>> df.pandas_on_spark.transform_batch(plus_one_func) A B 0 2 3 1 4 5 2 6 7 >>> def plus_one_func(pdf) -> ps.Series[int]: ... return pdf.B + 1 >>> df.pandas_on_spark.transform_batch(plus_one_func) 0 3 1 5 2 7 dtype: int64 You can also omit the type hints so pandas-on-Spark infers the return schema as below: >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1) A B 0 2 3 1 4 5 2 6 7 >>> (df * -1).pandas_on_spark.transform_batch(abs) A B 0 1 2 1 3 4 2 5 6 Note that you should not transform the index. The index information will not change. >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1) 0 3 1 5 2 7 Name: B, dtype: int64 You can also specify extra arguments as below. >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3) 0 8 1 10 2 12 Name: B, dtype: int64 """ from pyspark.pandas.groupby import GroupBy from pyspark.pandas.frame import DataFrame from pyspark.pandas.series import first_series from pyspark import pandas as ps assert callable(func), "the first argument should be a callable function." spec = inspect.getfullargspec(func) return_sig = spec.annotations.get("return", None) should_infer_schema = return_sig is None original_func = func func = lambda o: original_func(o, *args, **kwargs) names = self._psdf._internal.to_internal_spark_frame.schema.names def pandas_concat(series): # The input can only be a DataFrame for struct from Spark 3.0. # This works around to make the input as a frame. See SPARK-27240 pdf = pd.concat(series, axis=1) pdf.columns = names return pdf def apply_func(pdf): return func(pdf).to_frame() def pandas_extract(pdf, name): # This is for output to work around a DataFrame for struct # from Spark 3.0. See SPARK-23836 return pdf[name] def pandas_series_func(f): ff = f return lambda *series: first_series(ff(*series)) def pandas_frame_func(f, field_name): ff = f return lambda *series: pandas_extract(ff(pandas_concat(series)), field_name) if should_infer_schema: # Here we execute with the first 1000 to get the return type. # If the records were less than 1000, it uses pandas API directly for a shortcut. limit = ps.get_option("compute.shortcut_limit") pdf = self._psdf.head(limit + 1)._to_internal_pandas() transformed = func(pdf) if not isinstance(transformed, (pd.DataFrame, pd.Series)): raise ValueError( "The given function should return a frame; however, " "the return type was %s." % type(transformed) ) if len(transformed) != len(pdf): raise ValueError("transform_batch cannot produce aggregated results") psdf_or_psser = ps.from_pandas(transformed) if isinstance(psdf_or_psser, ps.Series): psser = cast(ps.Series, psdf_or_psser) spark_return_type = force_decimal_precision_scale( as_nullable_spark_type(psser.spark.data_type) ) return_schema = StructType( [StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)] ) output_func = GroupBy._make_pandas_df_builder_func( self._psdf, apply_func, return_schema, retain_index=False ) pudf = pandas_udf(returnType=spark_return_type, functionType=PandasUDFType.SCALAR)( pandas_series_func(output_func) ) columns = self._psdf._internal.spark_columns # TODO: Index will be lost in this case. internal = self._psdf._internal.copy( column_labels=psser._internal.column_labels, data_spark_columns=[ pudf(F.struct(*columns)).alias(psser._internal.data_spark_column_names[0]) ], data_dtypes=psser._internal.data_dtypes, column_label_names=psser._internal.column_label_names, ) return first_series(DataFrame(internal)) else: psdf = cast(DataFrame, psdf_or_psser) if len(pdf) <= limit: # only do the short cut when it returns a frame to avoid # operations on different dataframes in case of series. return psdf # Force nullability. return_schema = force_decimal_precision_scale( as_nullable_spark_type(psdf._internal.to_internal_spark_frame.schema) ) self_applied = DataFrame(self._psdf._internal.resolved_copy) # type: DataFrame output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=True ) columns = self_applied._internal.spark_columns pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)( output_func ) temp_struct_column = verify_temp_column_name( self_applied._internal.spark_frame, "__temp_struct__" ) applied = pudf(F.struct(*columns)).alias(temp_struct_column) sdf = self_applied._internal.spark_frame.select(applied) sdf = sdf.selectExpr("%s.*" % temp_struct_column) return DataFrame(psdf._internal.with_new_sdf(sdf)) else: return_type = infer_return_type(original_func) is_return_series = isinstance(return_type, SeriesType) is_return_dataframe = isinstance(return_type, DataFrameType) if not is_return_dataframe and not is_return_series: raise TypeError( "The given function should specify a frame or series as its type " "hints; however, the return type was %s." % return_sig ) if is_return_series: spark_return_type = force_decimal_precision_scale( as_nullable_spark_type(cast(SeriesType, return_type).spark_type) ) return_schema = StructType( [StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)] ) output_func = GroupBy._make_pandas_df_builder_func( self._psdf, apply_func, return_schema, retain_index=False ) pudf = pandas_udf(returnType=spark_return_type, functionType=PandasUDFType.SCALAR)( pandas_series_func(output_func) ) columns = self._psdf._internal.spark_columns internal = self._psdf._internal.copy( column_labels=[None], data_spark_columns=[pudf(F.struct(*columns)).alias(SPARK_DEFAULT_SERIES_NAME)], data_dtypes=[cast(SeriesType, return_type).dtype], column_label_names=None, ) return first_series(DataFrame(internal)) else: return_schema = cast(DataFrameType, return_type).spark_type self_applied = DataFrame(self._psdf._internal.resolved_copy) output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=False ) columns = self_applied._internal.spark_columns pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)( output_func ) temp_struct_column = verify_temp_column_name( self_applied._internal.spark_frame, "__temp_struct__" ) applied = pudf(F.struct(*columns)).alias(temp_struct_column) sdf = self_applied._internal.spark_frame.select(applied) sdf = sdf.selectExpr("%s.*" % temp_struct_column) internal = InternalFrame( spark_frame=sdf, index_spark_columns=None, data_dtypes=cast(DataFrameType, return_type).dtypes, ) return DataFrame(internal)
def apply_batch(self, func, args=(), **kwds) -> "DataFrame": """ Apply a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas DataFrame given to the function is of a batch used internally. See also `Transform and apply a function <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_. .. note:: the `func` is unable to access to the whole input frame. Koalas internally splits the input series into multiple batches and calls `func` with each batch multiple times. Therefore, operations such as global aggregations are impossible. See the example below. >>> # This case does not return the length of whole frame but of the batch internally ... # used. ... def length(pdf) -> ps.DataFrame[int]: ... return pd.DataFrame([len(pdf)]) ... >>> df = ps.DataFrame({'A': range(1000)}) >>> df.koalas.apply_batch(length) # doctest: +SKIP c0 0 83 1 83 2 83 ... 10 83 11 83 .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify return type in ``func``, for instance, as below: >>> def plus_one(x) -> ps.DataFrame[float, float]: ... return x + 1 If the return type is specified, the output column names become `c0, c1, c2 ... cn`. These names are positionally mapped to the returned DataFrame in ``func``. To specify the column names, you can assign them in a pandas friendly style as below: >>> def plus_one(x) -> ps.DataFrame["a": float, "b": float]: ... return x + 1 >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) >>> def plus_one(x) -> ps.DataFrame[zip(pdf.dtypes, pdf.columns)]: ... return x + 1 When the given function has the return type annotated, the original index of the DataFrame will be lost and a default index will be attached to the result DataFrame. Please be careful about configuring the default index. See also `Default Index Type <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_. Parameters ---------- func : function Function to apply to each pandas frame. args : tuple Positional arguments to pass to `func` in addition to the array/series. **kwds Additional keyword arguments to pass as keywords arguments to `func`. Returns ------- DataFrame See Also -------- DataFrame.apply: For row/columnwise operations. DataFrame.applymap: For elementwise operations. DataFrame.aggregate: Only perform aggregating type operations. DataFrame.transform: Only perform transforming type operations. Series.koalas.transform_batch: transform the search as each pandas chunks. Examples -------- >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B']) >>> df A B 0 1 2 1 3 4 2 5 6 >>> def query_func(pdf) -> ps.DataFrame[int, int]: ... return pdf.query('A == 1') >>> df.koalas.apply_batch(query_func) c0 c1 0 1 2 >>> def query_func(pdf) -> ps.DataFrame["A": int, "B": int]: ... return pdf.query('A == 1') >>> df.koalas.apply_batch(query_func) A B 0 1 2 You can also omit the type hints so Koalas infers the return schema as below: >>> df.koalas.apply_batch(lambda pdf: pdf.query('A == 1')) A B 0 1 2 You can also specify extra arguments. >>> def calculation(pdf, y, z) -> ps.DataFrame[int, int]: ... return pdf ** y + z >>> df.koalas.apply_batch(calculation, args=(10,), z=20) c0 c1 0 21 1044 1 59069 1048596 2 9765645 60466196 You can also use ``np.ufunc`` and built-in functions as input. >>> df.koalas.apply_batch(np.add, args=(10,)) A B 0 11 12 1 13 14 2 15 16 >>> (df * -1).koalas.apply_batch(abs) A B 0 1 2 1 3 4 2 5 6 """ # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate? from pyspark.pandas.groupby import GroupBy from pyspark.pandas.frame import DataFrame from pyspark import pandas as ps if not isinstance(func, types.FunctionType): assert callable( func), "the first argument should be a callable function." f = func func = lambda *args, **kwargs: f(*args, **kwargs) spec = inspect.getfullargspec(func) return_sig = spec.annotations.get("return", None) should_infer_schema = return_sig is None should_use_map_in_pandas = LooseVersion(pyspark.__version__) >= "3.0" original_func = func func = lambda o: original_func(o, *args, **kwds) self_applied = DataFrame( self._kdf._internal.resolved_copy) # type: DataFrame if should_infer_schema: # Here we execute with the first 1000 to get the return type. # If the records were less than 1000, it uses pandas API directly for a shortcut. limit = ps.get_option("compute.shortcut_limit") pdf = self_applied.head(limit + 1)._to_internal_pandas() applied = func(pdf) if not isinstance(applied, pd.DataFrame): raise ValueError( "The given function should return a frame; however, " "the return type was %s." % type(applied)) kdf = ps.DataFrame(applied) # type: DataFrame if len(pdf) <= limit: return kdf return_schema = force_decimal_precision_scale( as_nullable_spark_type( kdf._internal.to_internal_spark_frame.schema)) if should_use_map_in_pandas: output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=True) sdf = self_applied._internal.to_internal_spark_frame.mapInPandas( lambda iterator: map(output_func, iterator), schema=return_schema) else: sdf = GroupBy._spark_group_map_apply( self_applied, func, (F.spark_partition_id(), ), return_schema, retain_index=True) # If schema is inferred, we can restore indexes too. internal = kdf._internal.with_new_sdf(sdf) else: return_type = infer_return_type(original_func) is_return_dataframe = isinstance(return_type, DataFrameType) if not is_return_dataframe: raise TypeError( "The given function should specify a frame as its type " "hints; however, the return type was %s." % return_sig) return_schema = cast(DataFrameType, return_type).spark_type if should_use_map_in_pandas: output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=False) sdf = self_applied._internal.to_internal_spark_frame.mapInPandas( lambda iterator: map(output_func, iterator), schema=return_schema) else: sdf = GroupBy._spark_group_map_apply( self_applied, func, (F.spark_partition_id(), ), return_schema, retain_index=False) # Otherwise, it loses index. internal = InternalFrame( spark_frame=sdf, index_spark_columns=None, data_dtypes=cast(DataFrameType, return_type).dtypes, ) return DataFrame(internal)
def insert(self, loc: int, item: Any) -> Index: """ Make new MultiIndex inserting new item at location. Follows Python list.append semantics for negative values. .. versionchanged:: 3.4.0 Raise IndexError when loc is out of bounds to follow Pandas 1.4+ behavior Parameters ---------- loc : int item : object Returns ------- new_index : MultiIndex Examples -------- >>> psmidx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) >>> psmidx.insert(3, ("h", "j")) # doctest: +SKIP MultiIndex([('a', 'x'), ('b', 'y'), ('c', 'z'), ('h', 'j')], ) For negative values >>> psmidx.insert(-2, ("h", "j")) # doctest: +SKIP MultiIndex([('a', 'x'), ('h', 'j'), ('b', 'y'), ('c', 'z')], ) """ validate_index_loc(self, loc) loc = loc + len(self) if loc < 0 else loc index_name: List[Label] = [ (name, ) for name in self._internal.index_spark_column_names ] sdf_before = self.to_frame(name=index_name)[:loc]._to_spark() sdf_middle = Index([item]).to_frame(name=index_name)._to_spark() sdf_after = self.to_frame(name=index_name)[loc:]._to_spark() sdf = sdf_before.union(sdf_middle).union(sdf_after) internal = InternalFrame( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, col) for col in self._internal.index_spark_column_names ], index_names=self._internal.index_names, index_fields=[ InternalField(field.dtype) for field in self._internal.index_fields ], ) return DataFrame(internal).index
def combine_frames( this: "DataFrame", *args: DataFrameOrSeries, how: str = "full", preserve_order_column: bool = False ) -> "DataFrame": """ This method combines `this` DataFrame with a different `that` DataFrame or Series from a different DataFrame. It returns a DataFrame that has prefix `this_` and `that_` to distinct the columns names from both DataFrames It internally performs a join operation which can be expensive in general. So, if `compute.ops_on_diff_frames` option is False, this method throws an exception. """ from pyspark.pandas.config import get_option from pyspark.pandas.frame import DataFrame from pyspark.pandas.internal import ( InternalField, InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME, SPARK_INDEX_NAME_FORMAT, ) from pyspark.pandas.series import Series if all(isinstance(arg, Series) for arg in args): assert all( same_anchor(arg, args[0]) for arg in args ), "Currently only one different DataFrame (from given Series) is supported" assert not same_anchor(this, args[0]), "We don't need to combine. All series is in this." that = args[0]._psdf[list(args)] elif len(args) == 1 and isinstance(args[0], DataFrame): assert isinstance(args[0], DataFrame) assert not same_anchor( this, args[0] ), "We don't need to combine. `this` and `that` are same." that = args[0] else: raise AssertionError("args should be single DataFrame or " "single/multiple Series") if get_option("compute.ops_on_diff_frames"): def resolve(internal: InternalFrame, side: str) -> InternalFrame: rename = lambda col: "__{}_{}".format(side, col) internal = internal.resolved_copy sdf = internal.spark_frame sdf = internal.spark_frame.select( *[ scol_for(sdf, col).alias(rename(col)) for col in sdf.columns if col not in HIDDEN_COLUMNS ], *HIDDEN_COLUMNS ) return internal.copy( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, rename(col)) for col in internal.index_spark_column_names ], index_fields=[ field.copy(name=rename(field.name)) for field in internal.index_fields ], data_spark_columns=[ scol_for(sdf, rename(col)) for col in internal.data_spark_column_names ], data_fields=[field.copy(name=rename(field.name)) for field in internal.data_fields], ) this_internal = resolve(this._internal, "this") that_internal = resolve(that._internal, "that") this_index_map = list( zip( this_internal.index_spark_column_names, this_internal.index_names, this_internal.index_fields, ) ) that_index_map = list( zip( that_internal.index_spark_column_names, that_internal.index_names, that_internal.index_fields, ) ) assert len(this_index_map) == len(that_index_map) join_scols = [] merged_index_scols = [] # Note that the order of each element in index_map is guaranteed according to the index # level. this_and_that_index_map = list(zip(this_index_map, that_index_map)) this_sdf = this_internal.spark_frame.alias("this") that_sdf = that_internal.spark_frame.alias("that") # If the same named index is found, that's used. index_column_names = [] index_use_extension_dtypes = [] for ( i, ((this_column, this_name, this_field), (that_column, that_name, that_field)), ) in enumerate(this_and_that_index_map): if this_name == that_name: # We should merge the Spark columns into one # to mimic pandas' behavior. this_scol = scol_for(this_sdf, this_column) that_scol = scol_for(that_sdf, that_column) join_scol = this_scol == that_scol join_scols.append(join_scol) column_name = SPARK_INDEX_NAME_FORMAT(i) index_column_names.append(column_name) index_use_extension_dtypes.append( any(field.is_extension_dtype for field in [this_field, that_field]) ) merged_index_scols.append( F.when(this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(column_name) ) else: raise ValueError("Index names must be exactly matched currently.") assert len(join_scols) > 0, "cannot join with no overlapping index names" joined_df = this_sdf.join(that_sdf, on=join_scols, how=how) if preserve_order_column: order_column = [scol_for(this_sdf, NATURAL_ORDER_COLUMN_NAME)] else: order_column = [] joined_df = joined_df.select( *merged_index_scols, *( scol_for(this_sdf, this_internal.spark_column_name_for(label)) for label in this_internal.column_labels ), *( scol_for(that_sdf, that_internal.spark_column_name_for(label)) for label in that_internal.column_labels ), *order_column ) index_spark_columns = [scol_for(joined_df, col) for col in index_column_names] index_columns = set(index_column_names) new_data_columns = [ col for col in joined_df.columns if col not in index_columns and col != NATURAL_ORDER_COLUMN_NAME ] schema = joined_df.select(*index_spark_columns, *new_data_columns).schema index_fields = [ InternalField.from_struct_field(struct_field, use_extension_dtypes=use_extension_dtypes) for struct_field, use_extension_dtypes in zip( schema.fields[: len(index_spark_columns)], index_use_extension_dtypes ) ] data_fields = [ InternalField.from_struct_field( struct_field, use_extension_dtypes=field.is_extension_dtype ) for struct_field, field in zip( schema.fields[len(index_spark_columns) :], this_internal.data_fields + that_internal.data_fields, ) ] level = max(this_internal.column_labels_level, that_internal.column_labels_level) def fill_label(label: Optional[Tuple]) -> List: if label is None: return ([""] * (level - 1)) + [None] else: return ([""] * (level - len(label))) + list(label) column_labels = [ tuple(["this"] + fill_label(label)) for label in this_internal.column_labels ] + [tuple(["that"] + fill_label(label)) for label in that_internal.column_labels] column_label_names = ( cast(List[Optional[Tuple]], [None]) * (1 + level - this_internal.column_labels_level) ) + this_internal.column_label_names return DataFrame( InternalFrame( spark_frame=joined_df, index_spark_columns=index_spark_columns, index_names=this_internal.index_names, index_fields=index_fields, column_labels=column_labels, data_spark_columns=[scol_for(joined_df, col) for col in new_data_columns], data_fields=data_fields, column_label_names=column_label_names, ) ) else: raise ValueError(ERROR_MESSAGE_CANNOT_COMBINE)
def symmetric_difference( # type: ignore[override] self, other: Index, result_name: Optional[List[Name]] = None, sort: Optional[bool] = None, ) -> "MultiIndex": """ Compute the symmetric difference of two MultiIndex objects. Parameters ---------- other : Index or array-like result_name : list sort : True or None, default None Whether to sort the resulting index. * True : Attempt to sort the result. * None : Do not sort the result. Returns ------- symmetric_difference : MiltiIndex Notes ----- ``symmetric_difference`` contains elements that appear in either ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates dropped. Examples -------- >>> midx1 = pd.MultiIndex([['lama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... [[0, 0, 0, 1, 1, 1, 2, 2, 2], ... [0, 0, 0, 0, 1, 2, 0, 1, 2]]) >>> midx2 = pd.MultiIndex([['pandas-on-Spark', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], ... [[0, 0, 0, 1, 1, 1, 2, 2, 2], ... [0, 0, 0, 0, 1, 2, 0, 1, 2]]) >>> s1 = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], ... index=midx1) >>> s2 = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], ... index=midx2) >>> s1.index.symmetric_difference(s2.index) # doctest: +SKIP MultiIndex([('pandas-on-Spark', 'speed'), ( 'lama', 'speed')], ) You can set names of result Index. >>> s1.index.symmetric_difference(s2.index, result_name=['a', 'b']) # doctest: +SKIP MultiIndex([('pandas-on-Spark', 'speed'), ( 'lama', 'speed')], names=['a', 'b']) You can set sort to `True`, if you want to sort the resulting index. >>> s1.index.symmetric_difference(s2.index, sort=True) # doctest: +SKIP MultiIndex([('pandas-on-Spark', 'speed'), ( 'lama', 'speed')], ) You can also use the ``^`` operator: >>> s1.index ^ s2.index # doctest: +SKIP MultiIndex([('pandas-on-Spark', 'speed'), ( 'lama', 'speed')], ) """ if type(self) != type(other): raise NotImplementedError( "Doesn't support symmetric_difference between Index & MultiIndex for now" ) sdf_self = self._psdf._internal.spark_frame.select( self._internal.index_spark_columns) sdf_other = other._psdf._internal.spark_frame.select( other._internal.index_spark_columns) sdf_symdiff = sdf_self.union(sdf_other).subtract( sdf_self.intersect(sdf_other)) if sort: sdf_symdiff = sdf_symdiff.sort(*self._internal.index_spark_columns) internal = InternalFrame( spark_frame=sdf_symdiff, index_spark_columns=[ scol_for(sdf_symdiff, col) for col in self._internal.index_spark_column_names ], index_names=self._internal.index_names, index_fields=self._internal.index_fields, ) result = cast(MultiIndex, DataFrame(internal).index) if result_name: result.names = result_name return result
def _downsample(self, f: str) -> DataFrame: """ Downsample the defined function. Parameters ---------- how : string / mapped function **kwargs : kw args passed to how function """ # a simple example to illustrate the computation: # dates = [ # datetime.datetime(2012, 1, 2), # datetime.datetime(2012, 5, 3), # datetime.datetime(2022, 5, 3), # ] # index = pd.DatetimeIndex(dates) # pdf = pd.DataFrame(np.array([1,2,3]), index=index, columns=['A']) # pdf.resample('3Y').max() # A # 2012-12-31 2.0 # 2015-12-31 NaN # 2018-12-31 NaN # 2021-12-31 NaN # 2024-12-31 3.0 # # in this case: # 1, obtain one origin point to bin all timestamps, we can get one (2009-12-31) # from the minimum timestamp (2012-01-02); # 2, the default intervals for 'Y' are right-closed, so intervals are: # (2009-12-31, 2012-12-31], (2012-12-31, 2015-12-31], (2015-12-31, 2018-12-31], ... # 3, bin all timestamps, for example, 2022-05-03 belongs to interval # (2021-12-31, 2024-12-31], since the default label is 'right', label it with the right # edge 2024-12-31; # 4, some intervals maybe too large for this down sampling, so we need to pad the dataframe # to avoid missing some results, like: 2015-12-31, 2018-12-31 and 2021-12-31; # 5, union the binned dataframe and padded dataframe, and apply aggregation 'max' to get # the final results; # one action to obtain the range, in the future we may cache it in the index. ts_min, ts_max = (self._psdf._internal.spark_frame.select( F.min(self._resamplekey_scol), F.max(self._resamplekey_scol)).toPandas().iloc[0]) # the logic to obtain an origin point to bin the timestamps is too complex to follow, # here just use Pandas' resample on a 1-length series to get it. ts_origin = (pd.Series([0], index=[ts_min ]).resample(rule=self._offset.freqstr, closed=self._closed, label="left").sum().index[0]) assert ts_origin <= ts_min bin_col_name = "__tmp_resample_bin_col__" bin_col_label = verify_temp_column_name(self._psdf, bin_col_name) bin_col_field = InternalField( dtype=np.dtype("datetime64[ns]"), struct_field=StructField(bin_col_name, TimestampType(), True), ) bin_scol = self._bin_time_stamp( ts_origin, self._resamplekey_scol, ) agg_columns = [ psser for psser in self._agg_columns if (isinstance(psser.spark.data_type, NumericType)) ] assert len(agg_columns) > 0 # in the binning side, label the timestamps according to the origin and the freq(rule) bin_sdf = self._psdf._internal.spark_frame.select( F.col(SPARK_DEFAULT_INDEX_NAME), bin_scol.alias(bin_col_name), *[psser.spark.column for psser in agg_columns], ) # in the padding side, insert necessary points # again, directly apply Pandas' resample on a 2-length series to obtain the indices pad_sdf = (ps.from_pandas( pd.Series([0, 0], index=[ts_min, ts_max]).resample( rule=self._offset.freqstr, closed=self._closed, label=self._label).sum().index)._internal.spark_frame.select( F.col(SPARK_DEFAULT_INDEX_NAME).alias(bin_col_name)).where( (ts_min <= F.col(bin_col_name)) & (F.col(bin_col_name) <= ts_max))) # union the above two spark dataframes. sdf = bin_sdf.unionByName( pad_sdf, allowMissingColumns=True).where(~F.isnull(F.col(bin_col_name))) internal = InternalFrame( spark_frame=sdf, index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)], data_spark_columns=[F.col(bin_col_name)] + [ scol_for(sdf, psser._internal.data_spark_column_names[0]) for psser in agg_columns ], column_labels=[bin_col_label] + [psser._column_label for psser in agg_columns], data_fields=[bin_col_field] + [ psser._internal.data_fields[0].copy(nullable=True) for psser in agg_columns ], column_label_names=self._psdf._internal.column_label_names, ) psdf: DataFrame = DataFrame(internal) groupby = psdf.groupby(psdf._psser_for(bin_col_label), dropna=False) downsampled = getattr(groupby, f)() downsampled.index.name = None return downsampled