def _is_monotonic_increasing(self) -> Series: window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1) cond = SF.lit(True) has_not_null = SF.lit(True) for scol in self._internal.index_spark_columns[::-1]: data_type = self._internal.spark_type_for(scol) prev = F.lag(scol, 1).over(window) compare = MultiIndex._comparator_for_monotonic_increasing( data_type) # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex. # Therefore, we should check `has_not_null` over the all levels. has_not_null = has_not_null & scol.isNotNull() cond = F.when(scol.eqNullSafe(prev), cond).otherwise(compare(scol, prev, Column.__gt__)) cond = has_not_null & (prev.isNull() | cond) cond_name = verify_temp_column_name( self._internal.spark_frame.select( self._internal.index_spark_columns), "__is_monotonic_increasing_cond__", ) sdf = self._internal.spark_frame.select( self._internal.index_spark_columns + [cond.alias(cond_name)]) internal = InternalFrame( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, col) for col in self._internal.index_spark_column_names ], index_names=self._internal.index_names, index_fields=self._internal.index_fields, ) return first_series(DataFrame(internal))
def _as_bool_type(index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike: """Cast `index_ops` to BooleanType Spark type, given `dtype`.""" spark_type = BooleanType() if isinstance(dtype, extension_dtypes): scol = index_ops.spark.column.cast(spark_type) else: scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise( index_ops.spark.column.cast(spark_type)) return index_ops._with_new_scol( scol, field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type))
def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) # Note that date subtraction casts arguments to integer. This is to mimic pandas's # behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction. msg = ("Note that there is a behavior difference of date subtraction. " "The date subtraction returns an integer in days, " "whereas pandas returns 'timedelta64[ns]'.") if isinstance(right, datetime.date) and not isinstance( right, datetime.datetime): warnings.warn(msg, UserWarning) return -column_op(F.datediff)(left, SF.lit(right)).astype("long") else: raise TypeError( "Date subtraction can only be applied to date series.")
def _as_bool_type(index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike: """Cast `index_ops` to BooleanType Spark type, given `dtype`.""" from pyspark.pandas.internal import InternalField if isinstance(dtype, extension_dtypes): scol = index_ops.spark.column.cast(BooleanType()) else: scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise( index_ops.spark.column.cast(BooleanType())) return index_ops._with_new_scol( scol.alias(index_ops._internal.data_spark_column_names[0]), field=InternalField(dtype=dtype), )
def floordiv(left: Column, right: Any) -> Column: return F.when(SF.lit(right is np.nan), np.nan).otherwise( F.when( SF.lit(right != 0) | SF.lit(right).isNull(), F.floor(left.__div__(right)) ).otherwise( F.when(SF.lit(left == np.inf) | SF.lit(left == -np.inf), left).otherwise( SF.lit(np.inf).__div__(left) ) ) )
def rsub(self, left: T_IndexOps, right: Any) -> IndexOpsLike: # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction. msg = ( "Note that there is a behavior difference of timestamp subtraction. " "The timestamp subtraction returns an integer in seconds, " "whereas pandas returns 'timedelta64[ns]'.") if isinstance(right, datetime.datetime): warnings.warn(msg, UserWarning) return cast( IndexOpsLike, left.spark.transform(lambda scol: SF.lit(right).cast( as_spark_type("long")) - scol.astype("long")), ) else: raise TypeError( "datetime subtraction can only be applied to datetime series.")
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike: dtype, spark_type = pandas_on_spark_type(dtype) if isinstance(dtype, CategoricalDtype): return _as_categorical_type(index_ops, dtype, spark_type) elif isinstance(spark_type, BooleanType): if isinstance(dtype, extension_dtypes): scol = index_ops.spark.column.cast(spark_type) else: scol = F.when( index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column), SF.lit(True), ).otherwise(index_ops.spark.column.cast(spark_type)) return index_ops._with_new_scol( scol.alias(index_ops._internal.data_spark_column_names[0]), field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type), ) elif isinstance(spark_type, StringType): return _as_string_type(index_ops, dtype, null_str=str(np.nan)) else: return _as_other_type(index_ops, dtype, spark_type)
def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction. msg = ( "Note that there is a behavior difference of timestamp subtraction. " "The timestamp subtraction returns an integer in seconds, " "whereas pandas returns 'timedelta64[ns]'.") if isinstance(right, datetime.datetime): warnings.warn(msg, UserWarning) return cast( SeriesOrIndex, left._with_new_scol( SF.lit(right).cast(LongType()) - left.spark.column.cast(LongType()), field=left._internal.data_fields[0].copy( dtype=np.dtype("int64"), spark_type=LongType()), ), ) else: raise TypeError( "Datetime subtraction can only be applied to datetime series.")
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike: dtype, spark_type = pandas_on_spark_type(dtype) if isinstance(dtype, CategoricalDtype): return _as_categorical_type(index_ops, dtype, spark_type) if isinstance(spark_type, BooleanType): if isinstance(dtype, extension_dtypes): scol = index_ops.spark.column.cast(spark_type) else: scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise( F.length(index_ops.spark.column) > 0 ) return index_ops._with_new_scol( scol, field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type), ) elif isinstance(spark_type, StringType): null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None) return _as_string_type(index_ops, dtype, null_str=null_str) else: return _as_other_type(index_ops, dtype, spark_type)
def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps: dtype, spark_type = pandas_on_spark_type(dtype) if isinstance(dtype, CategoricalDtype): return _as_categorical_type(index_ops, dtype, spark_type) if isinstance(spark_type, BooleanType): if isinstance(dtype, extension_dtypes): scol = index_ops.spark.column.cast(spark_type) else: scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise( F.length(index_ops.spark.column) > 0) return index_ops._with_new_scol( scol.alias(index_ops._internal.data_spark_column_names[0]), field=InternalField(dtype=dtype), ) elif isinstance(spark_type, StringType): return _as_string_type(index_ops, dtype) else: return _as_other_type(index_ops, dtype, spark_type)
def test_lit(self): self.assertTrue( spark_column_equals(SF.lit(np.int64(1)), F.lit(1).astype(LongType()))) self.assertTrue( spark_column_equals(SF.lit(np.int32(1)), F.lit(1).astype(IntegerType()))) self.assertTrue( spark_column_equals(SF.lit(np.int8(1)), F.lit(1).astype(ByteType()))) self.assertTrue( spark_column_equals(SF.lit(np.byte(1)), F.lit(1).astype(ByteType()))) self.assertTrue( spark_column_equals(SF.lit(np.float32(1)), F.lit(float(1)).astype(FloatType()))) self.assertTrue(spark_column_equals(SF.lit(1), F.lit(1)))
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike: dtype, spark_type = pandas_on_spark_type(dtype) if get_option("compute.eager_check"): if is_integer_dtype(dtype) and not isinstance( dtype, extension_dtypes): if index_ops.hasnans: raise ValueError( "Cannot convert %s with missing values to integer" % self.pretty_name) elif is_bool_dtype(dtype) and not isinstance( dtype, extension_dtypes): if index_ops.hasnans: raise ValueError( "Cannot convert %s with missing values to bool" % self.pretty_name) if isinstance(dtype, CategoricalDtype): return _as_categorical_type(index_ops, dtype, spark_type) elif isinstance(spark_type, BooleanType): if isinstance(dtype, extension_dtypes): scol = index_ops.spark.column.cast(spark_type) else: scol = F.when( index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column), SF.lit(True), ).otherwise(index_ops.spark.column.cast(spark_type)) return index_ops._with_new_scol( scol.alias(index_ops._internal.data_spark_column_names[0]), field=index_ops._internal.data_fields[0].copy( dtype=dtype, spark_type=spark_type), ) elif isinstance(spark_type, StringType): return _as_string_type(index_ops, dtype, null_str=str(np.nan)) else: return _as_other_type(index_ops, dtype, spark_type)
def convert_arguments(*args): args = [ SF.lit(inp) if not isinstance(inp, Column) else inp for inp in args ] return np_spark_map_func(*args)
def pow_func(left: Column, right: Any) -> Column: return ( F.when(left == 1, left) .when(SF.lit(right) == 0, 1) .otherwise(Column.__pow__(left, right)) )
def rfloordiv(left: Column, right: Any) -> Column: return F.when(SF.lit(left == 0), SF.lit(np.inf).__div__(right)).otherwise( F.when(SF.lit(left) == np.nan, np.nan).otherwise( F.floor(SF.lit(right).__div__(left)) ) )
def rtruediv(left: Column, right: Any) -> Column: return F.when(left == 0, SF.lit(np.inf).__div__(right)).otherwise( SF.lit(right).__truediv__(left) )
def truediv(left: Column, right: Any) -> Column: return F.when( SF.lit(right != 0) | SF.lit(right).isNull(), left.__div__(right) ).otherwise(SF.lit(np.inf).__div__(left))
def rpow_func(left: Column, right: Any) -> Column: return F.when(SF.lit(right == 1), right).otherwise(Column.__rpow__(left, right))
def align_diff_frames( resolve_func: Callable[ ["DataFrame", List[Tuple], List[Tuple]], Iterator[Tuple["Series", Tuple]] ], this: "DataFrame", that: "DataFrame", fillna: bool = True, how: str = "full", preserve_order_column: bool = False, ) -> "DataFrame": """ This method aligns two different DataFrames with a given `func`. Columns are resolved and handled within the given `func`. To use this, `compute.ops_on_diff_frames` should be True, for now. :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and the column of another DataFrame. It returns an iterable that produces Series. >>> from pyspark.pandas.config import set_option, reset_option >>> >>> set_option("compute.ops_on_diff_frames", True) >>> >>> psdf1 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]}) >>> psdf2 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]}) >>> >>> def func(psdf, this_column_labels, that_column_labels): ... psdf # conceptually this is A + B. ... ... # Within this function, Series from A or B can be performed against `psdf`. ... this_label = this_column_labels[0] # this is ('a',) from psdf1. ... that_label = that_column_labels[0] # this is ('a',) from psdf2. ... new_series = (psdf[this_label] - psdf[that_label]).rename(str(this_label)) ... ... # This new series will be placed in new DataFrame. ... yield (new_series, this_label) >>> >>> >>> align_diff_frames(func, psdf1, psdf2).sort_index() a 0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 >>> reset_option("compute.ops_on_diff_frames") :param this: a DataFrame to align :param that: another DataFrame to align :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`. Otherwise, it returns as are. :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict. - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames. For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and 'that_columns' in this function are B, C and B, C. - left: `resolve_func` should resolve columns including that columns. For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is B, C but `that_columns` are B, C, D. - inner: Same as 'full' mode; however, internally performs inner join instead. :return: Aligned DataFrame """ from pyspark.pandas.frame import DataFrame assert how == "full" or how == "left" or how == "inner" this_column_labels = this._internal.column_labels that_column_labels = that._internal.column_labels common_column_labels = set(this_column_labels).intersection(that_column_labels) # 1. Perform the join given two dataframes. combined = combine_frames(this, that, how=how, preserve_order_column=preserve_order_column) # 2. Apply the given function to transform the columns in a batch and keep the new columns. combined_column_labels = combined._internal.column_labels that_columns_to_apply = [] # type: List[Tuple] this_columns_to_apply = [] # type: List[Tuple] additional_that_columns = [] # type: List[Tuple] columns_to_keep = [] # type: List[Union[Series, Column]] column_labels_to_keep = [] # type: List[Tuple] for combined_label in combined_column_labels: for common_label in common_column_labels: if combined_label == tuple(["this", *common_label]): this_columns_to_apply.append(combined_label) break elif combined_label == tuple(["that", *common_label]): that_columns_to_apply.append(combined_label) break else: if how == "left" and combined_label in [ tuple(["that", *label]) for label in that_column_labels ]: # In this case, we will drop `that_columns` in `columns_to_keep` but passes # it later to `func`. `func` should resolve it. # Note that adding this into a separate list (`additional_that_columns`) # is intentional so that `this_columns` and `that_columns` can be paired. additional_that_columns.append(combined_label) elif fillna: columns_to_keep.append(SF.lit(None).cast(DoubleType()).alias(str(combined_label))) column_labels_to_keep.append(combined_label) else: columns_to_keep.append(combined._psser_for(combined_label)) column_labels_to_keep.append(combined_label) that_columns_to_apply += additional_that_columns # Should extract columns to apply and do it in a batch in case # it adds new columns for example. if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0: psser_set, column_labels_set = zip( *resolve_func(combined, this_columns_to_apply, that_columns_to_apply) ) columns_applied = list(psser_set) # type: List[Union[Series, Column]] column_labels_applied = list(column_labels_set) # type: List[Tuple] else: columns_applied = [] column_labels_applied = [] applied = DataFrame( combined._internal.with_new_columns( columns_applied + columns_to_keep, column_labels=column_labels_applied + column_labels_to_keep, ) ) # type: DataFrame # 3. Restore the names back and deduplicate columns. this_labels = OrderedDict() # Add columns in an order of its original frame. for this_label in this_column_labels: for new_label in applied._internal.column_labels: if new_label[1:] not in this_labels and this_label == new_label[1:]: this_labels[new_label[1:]] = new_label # After that, we will add the rest columns. other_labels = OrderedDict() for new_label in applied._internal.column_labels: if new_label[1:] not in this_labels: other_labels[new_label[1:]] = new_label psdf = applied[list(this_labels.values()) + list(other_labels.values())] psdf.columns = psdf.columns.droplevel() return psdf
def rpow_func(left: Column, right: Any) -> Column: return (F.when(left.isNull(), np.nan).when(SF.lit(right == 1), right).otherwise( Column.__rpow__(left, right)))
def or_func(left: Column, right: Any) -> Column: if not isinstance(right, Column) and pd.isna(right): return SF.lit(False) else: scol = left | SF.lit(right) return F.when(left.isNull() | scol.isNull(), False).otherwise(scol)
def compute_hist(psdf, bins): # 'data' is a Spark DataFrame that selects one column. assert isinstance(bins, (np.ndarray, np.generic)) sdf = psdf._internal.spark_frame scols = [] input_column_names = [] for label in psdf._internal.column_labels: input_column_name = name_like_string(label) input_column_names.append(input_column_name) scols.append(psdf._internal.spark_column_for(label).alias(input_column_name)) sdf = sdf.select(*scols) # 1. Make the bucket output flat to: # +----------+-------+ # |__group_id|buckets| # +----------+-------+ # |0 |0.0 | # |0 |0.0 | # |0 |1.0 | # |0 |2.0 | # |0 |3.0 | # |0 |3.0 | # |1 |0.0 | # |1 |1.0 | # |1 |1.0 | # |1 |2.0 | # |1 |1.0 | # |1 |0.0 | # +----------+-------+ colnames = sdf.columns bucket_names = ["__{}_bucket".format(colname) for colname in colnames] output_df = None for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)): # creates a Bucketizer to get corresponding bin of each value bucketizer = Bucketizer( splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip" ) bucket_df = bucketizer.transform(sdf) if output_df is None: output_df = bucket_df.select( SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket") ) else: output_df = output_df.union( bucket_df.select( SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket") ) ) # 2. Calculate the count based on each group and bucket. # +----------+-------+------+ # |__group_id|buckets| count| # +----------+-------+------+ # |0 |0.0 |2 | # |0 |1.0 |1 | # |0 |2.0 |1 | # |0 |3.0 |2 | # |1 |0.0 |2 | # |1 |1.0 |3 | # |1 |2.0 |1 | # +----------+-------+------+ result = ( output_df.groupby("__group_id", "__bucket") .agg(F.count("*").alias("count")) .toPandas() .sort_values(by=["__group_id", "__bucket"]) ) # 3. Fill empty bins and calculate based on each group id. From: # +----------+--------+------+ # |__group_id|__bucket| count| # +----------+--------+------+ # |0 |0.0 |2 | # |0 |1.0 |1 | # |0 |2.0 |1 | # |0 |3.0 |2 | # +----------+--------+------+ # +----------+--------+------+ # |__group_id|__bucket| count| # +----------+--------+------+ # |1 |0.0 |2 | # |1 |1.0 |3 | # |1 |2.0 |1 | # +----------+--------+------+ # # to: # +-----------------+ # |__values1__bucket| # +-----------------+ # |2 | # |1 | # |1 | # |2 | # |0 | # +-----------------+ # +-----------------+ # |__values2__bucket| # +-----------------+ # |2 | # |3 | # |1 | # |0 | # |0 | # +-----------------+ output_series = [] for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)): current_bucket_result = result[result["__group_id"] == i] # generates a pandas DF with one row for each bin # we need this as some of the bins may be empty indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)}) # merges the bins with counts on it and fills remaining ones with zeros pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[ ["count"] ] pdf.columns = [input_column_name] output_series.append(pdf[input_column_name]) return output_series
def set_categories( self, new_categories: Union[pd.Index, List], ordered: Optional[bool] = None, rename: bool = False, inplace: bool = False, ) -> Optional["ps.Series"]: """ Set the categories to the specified new_categories. `new_categories` can include new categories (which will result in unused categories) or remove old categories (which results in values set to NaN). If `rename==True`, the categories will simple be renamed (less or more items than in old categories will result in values set to NaN or in unused categories respectively). This method can be used to perform more than one action of adding, removing, and reordering simultaneously and is therefore faster than performing the individual steps via the more specialised methods. On the other hand this methods does not do checks (e.g., whether the old categories are included in the new categories on a reorder), which can result in surprising changes, for example when using special string dtypes, which does not considers a S1 string equal to a single char python string. Parameters ---------- new_categories : Index-like The categories in new order. ordered : bool, default False Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. rename : bool, default False Whether or not the new_categories should be considered as a rename of the old categories or as reordered categories. inplace : bool, default False Whether or not to reorder the categories in-place or return a copy of this categorical with reordered categories. .. deprecated:: 3.2.0 Returns ------- Series with reordered categories or None if inplace. Raises ------ ValueError If new_categories does not validate as categories See Also -------- rename_categories : Rename categories. reorder_categories : Reorder categories. add_categories : Add new categories. remove_categories : Remove the specified categories. remove_unused_categories : Remove categories which are not used. Examples -------- >>> s = ps.Series(list("abbccc"), dtype="category") >>> s # doctest: +SKIP 0 a 1 b 2 b 3 c 4 c 5 c dtype: category Categories (3, object): ['a', 'b', 'c'] >>> s.cat.set_categories(['b', 'c']) # doctest: +SKIP 0 NaN 1 b 2 b 3 c 4 c 5 c dtype: category Categories (2, object): ['b', 'c'] >>> s.cat.set_categories([1, 2, 3], rename=True) # doctest: +SKIP 0 1 1 2 2 2 3 3 4 3 5 3 dtype: category Categories (3, int64): [1, 2, 3] >>> s.cat.set_categories([1, 2, 3], rename=True, ordered=True) # doctest: +SKIP 0 1 1 2 2 2 3 3 4 3 5 3 dtype: category Categories (3, int64): [1 < 2 < 3] """ from pyspark.pandas.frame import DataFrame if inplace: warnings.warn( "The `inplace` parameter in set_categories is deprecated " "and will be removed in a future version.", FutureWarning, ) if not is_list_like(new_categories): raise TypeError( "Parameter 'new_categories' must be list-like, was '{}'".format(new_categories) ) if ordered is None: ordered = self.ordered new_dtype = CategoricalDtype(new_categories, ordered=ordered) scol = self._data.spark.column if rename: new_scol = ( F.when(scol >= len(new_categories), SF.lit(-1).cast(self._data.spark.data_type)) .otherwise(scol) .alias(self._data._internal.data_spark_column_names[0]) ) internal = self._data._psdf._internal.with_new_spark_column( self._data._column_label, new_scol, field=self._data._internal.data_fields[0].copy(dtype=new_dtype), ) if inplace: self._data._psdf._update_internal_frame(internal) return None else: return DataFrame(internal)._psser_for(self._data._column_label).copy() else: psser = self._data.astype(new_dtype) if inplace: internal = self._data._psdf._internal.with_new_spark_column( self._data._column_label, psser.spark.column, field=psser._internal.data_fields[0], ) self._data._psdf._update_internal_frame(internal) return None else: return psser