def _get_plot_backend(backend=None):
    backend = backend or get_option("plotting.backend")
    # Shortcut
    if backend in PandasOnSparkPlotAccessor._backends:
        return PandasOnSparkPlotAccessor._backends[backend]

    if backend == "matplotlib":
        # Because matplotlib is an optional dependency,
        # we need to attempt an import here to raise an ImportError if needed.
        try:
            # test if matplotlib can be imported
            import matplotlib  # noqa: F401

            from pyspark.pandas.plot import matplotlib as module
        except ImportError:
            raise ImportError(
                "matplotlib is required for plotting when the "
                "default backend 'matplotlib' is selected."
            ) from None

        PandasOnSparkPlotAccessor._backends["matplotlib"] = module
    elif backend == "plotly":
        try:
            # test if plotly can be imported
            import plotly  # noqa: F401

            from pyspark.pandas.plot import plotly as module
        except ImportError:
            raise ImportError(
                "plotly is required for plotting when the "
                "default backend 'plotly' is selected."
            ) from None

        PandasOnSparkPlotAccessor._backends["plotly"] = module
    else:
        module = PandasOnSparkPlotAccessor._find_backend(backend)
        PandasOnSparkPlotAccessor._backends[backend] = module

    return module
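def _example_switch_plot_backend():
    # Illustrative sketch only, not part of the accessor above; the helper name
    # is hypothetical. It assumes pyspark.pandas is importable and that the
    # optional matplotlib package is installed.
    import pyspark.pandas as ps

    psdf = ps.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})

    # Changing the option makes the next plot call resolve (and cache)
    # the matplotlib-based backend module instead of the current one.
    ps.set_option("plotting.backend", "matplotlib")
    return psdf.plot.line(x="x", y="y")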
def get_sampled(self, data):
    from pyspark.pandas import DataFrame, Series

    fraction = get_option("plotting.sample_ratio")
    if fraction is None:
        fraction = 1 / (len(data) / get_option("plotting.max_rows"))
        fraction = min(1.0, fraction)
    self.fraction = fraction

    if isinstance(data, (DataFrame, Series)):
        if isinstance(data, Series):
            data = data.to_frame()
        sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction)
        return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas()
    else:
        raise ValueError("Only DataFrame and Series are supported for plotting.")
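def _example_sampled_plot_fraction():
    # Illustrative sketch only; the helper name is hypothetical. It shows how
    # the sampling fraction above is derived when "plotting.sample_ratio" is
    # unset: with "plotting.max_rows" at its default of 1000, a 100_000-row
    # frame is sampled at roughly 1000 / 100_000 = 1%.
    import pyspark.pandas as ps

    ps.reset_option("plotting.sample_ratio")  # fall back to the max_rows-based fraction
    psdf = ps.DataFrame({"y": range(100_000)})
    psdf.plot.line()  # line/area plots are downsampled via get_sampled()

    # An explicit ratio overrides the computed fraction.
    ps.set_option("plotting.sample_ratio", 0.05)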
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
        if get_option("compute.eager_check") and index_ops.hasnans:
            raise ValueError(
                "Cannot convert %s with missing values to integer" % self.pretty_name
            )

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            scol = F.when(
                index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                SF.lit(True),
            ).otherwise(index_ops.spark.column.cast(spark_type))
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=index_ops._internal.data_fields[0].copy(
                dtype=dtype, spark_type=spark_type
            ),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(np.nan))
    else:
        return _as_other_type(index_ops, dtype, spark_type)
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
        if get_option("compute.eager_check") and index_ops.hasnans:
            raise ValueError(
                "Cannot convert %s with missing values to integer" % self.pretty_name
            )
    return _non_fractional_astype(index_ops, dtype, spark_type)
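def _example_astype_missing_values():
    # Illustrative sketch only; the helper name is hypothetical. With
    # "compute.eager_check" enabled (the default), the astype() implementations
    # above validate the data eagerly and refuse to cast missing values to an
    # integer dtype.
    import pyspark.pandas as ps

    psser = ps.Series([1.0, 2.0, None])
    try:
        psser.astype(int)
    except ValueError as e:
        print(e)  # Cannot convert ... with missing values to integer

    # Disabling the check skips the (potentially expensive) hasnans scan and
    # leaves the cast entirely to Spark's own casting rules.
    with ps.option_context("compute.eager_check", False):
        return psser.astype(int)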
def set_result_text(self, ax):
    max_rows = get_option("plotting.max_rows")
    assert hasattr(self, "partial")

    if self.partial:
        ax.text(
            1,
            1,
            "showing top {} elements only".format(max_rows),
            size=6,
            ha="right",
            va="bottom",
            transform=ax.transAxes,
        )
def get_top_n(self, data):
    from pyspark.pandas import DataFrame, Series

    max_rows = get_option("plotting.max_rows")
    # Simply use the first `plotting.max_rows` elements and make them into a pandas DataFrame.
    # For categorical variables, it is likely called from df.x.value_counts().plot.xxx().
    if isinstance(data, (Series, DataFrame)):
        data = data.head(max_rows + 1).to_pandas()
    else:
        raise ValueError("Only DataFrame and Series are supported for plotting.")

    self.partial = False
    if len(data) > max_rows:
        self.partial = True
        data = data.iloc[:max_rows]
    return data
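def _example_top_n_plot():
    # Illustrative sketch only; the helper name is hypothetical. Bar-style
    # plots go through get_top_n() above: only the first "plotting.max_rows"
    # rows are plotted, and with the matplotlib backend set_result_text()
    # annotates the axes when the data was truncated.
    import pyspark.pandas as ps

    ps.set_option("plotting.max_rows", 5)
    psser = ps.Series(range(100))
    psser.plot.bar()  # plots the first 5 elements and marks the result as partial
    ps.reset_option("plotting.max_rows")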
def combine_frames(
    this: "DataFrame",
    *args: DataFrameOrSeries,
    how: str = "full",
    preserve_order_column: bool = False
) -> "DataFrame":
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has prefix `this_` and `that_` to distinguish
    the column names from both DataFrames.

    It internally performs a join operation which can be expensive in general.
    So, if the `compute.ops_on_diff_frames` option is False,
    this method throws an exception.
    """
    from pyspark.pandas.config import get_option
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.internal import (
        InternalField,
        InternalFrame,
        HIDDEN_COLUMNS,
        NATURAL_ORDER_COLUMN_NAME,
        SPARK_INDEX_NAME_FORMAT,
    )
    from pyspark.pandas.series import Series

    if all(isinstance(arg, Series) for arg in args):
        assert all(
            same_anchor(arg, args[0]) for arg in args
        ), "Currently only one different DataFrame (from given Series) is supported"
        assert not same_anchor(this, args[0]), "We don't need to combine. All Series are in `this`."
        that = args[0]._psdf[list(args)]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        assert not same_anchor(
            this, args[0]
        ), "We don't need to combine. `this` and `that` are same."
        that = args[0]
    else:
        raise AssertionError("args should be a single DataFrame or single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):

        def resolve(internal: InternalFrame, side: str) -> InternalFrame:
            rename = lambda col: "__{}_{}".format(side, col)
            internal = internal.resolved_copy
            sdf = internal.spark_frame
            sdf = internal.spark_frame.select(
                *[
                    scol_for(sdf, col).alias(rename(col))
                    for col in sdf.columns
                    if col not in HIDDEN_COLUMNS
                ],
                *HIDDEN_COLUMNS
            )
            return internal.copy(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.index_spark_column_names
                ],
                index_fields=[
                    field.copy(name=rename(field.name)) for field in internal.index_fields
                ],
                data_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.data_spark_column_names
                ],
                data_fields=[field.copy(name=rename(field.name)) for field in internal.data_fields],
            )

        this_internal = resolve(this._internal, "this")
        that_internal = resolve(that._internal, "that")

        this_index_map = list(
            zip(
                this_internal.index_spark_column_names,
                this_internal.index_names,
                this_internal.index_fields,
            )
        )
        that_index_map = list(
            zip(
                that_internal.index_spark_column_names,
                that_internal.index_names,
                that_internal.index_fields,
            )
        )
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # Note that the order of each element in index_map is guaranteed according to the index
        # level.
        this_and_that_index_map = list(zip(this_index_map, that_index_map))

        this_sdf = this_internal.spark_frame.alias("this")
        that_sdf = that_internal.spark_frame.alias("that")

        # If the same named index is found, that's used.
        index_column_names = []
        index_use_extension_dtypes = []
        for (
            i,
            ((this_column, this_name, this_field), (that_column, that_name, that_field)),
        ) in enumerate(this_and_that_index_map):
            if this_name == that_name:
                # We should merge the Spark columns into one
                # to mimic pandas' behavior.
                this_scol = scol_for(this_sdf, this_column)
                that_scol = scol_for(that_sdf, that_column)
                join_scol = this_scol == that_scol
                join_scols.append(join_scol)

                column_name = SPARK_INDEX_NAME_FORMAT(i)
                index_column_names.append(column_name)
                index_use_extension_dtypes.append(
                    any(field.is_extension_dtype for field in [this_field, that_field])
                )
                merged_index_scols.append(
                    F.when(this_scol.isNotNull(), this_scol)
                    .otherwise(that_scol)
                    .alias(column_name)
                )
            else:
                raise ValueError("Index names must be exactly matched currently.")

        assert len(join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this_sdf.join(that_sdf, on=join_scols, how=how)

        if preserve_order_column:
            order_column = [scol_for(this_sdf, NATURAL_ORDER_COLUMN_NAME)]
        else:
            order_column = []

        joined_df = joined_df.select(
            *merged_index_scols,
            *(
                scol_for(this_sdf, this_internal.spark_column_name_for(label))
                for label in this_internal.column_labels
            ),
            *(
                scol_for(that_sdf, that_internal.spark_column_name_for(label))
                for label in that_internal.column_labels
            ),
            *order_column
        )

        index_spark_columns = [scol_for(joined_df, col) for col in index_column_names]

        index_columns = set(index_column_names)
        new_data_columns = [
            col
            for col in joined_df.columns
            if col not in index_columns and col != NATURAL_ORDER_COLUMN_NAME
        ]

        schema = joined_df.select(*index_spark_columns, *new_data_columns).schema

        index_fields = [
            InternalField.from_struct_field(struct_field, use_extension_dtypes=use_extension_dtypes)
            for struct_field, use_extension_dtypes in zip(
                schema.fields[: len(index_spark_columns)], index_use_extension_dtypes
            )
        ]
        data_fields = [
            InternalField.from_struct_field(
                struct_field, use_extension_dtypes=field.is_extension_dtype
            )
            for struct_field, field in zip(
                schema.fields[len(index_spark_columns) :],
                this_internal.data_fields + that_internal.data_fields,
            )
        ]

        level = max(this_internal.column_labels_level, that_internal.column_labels_level)

        def fill_label(label: Optional[Tuple]) -> List:
            if label is None:
                return ([""] * (level - 1)) + [None]
            else:
                return ([""] * (level - len(label))) + list(label)

        column_labels = [
            tuple(["this"] + fill_label(label)) for label in this_internal.column_labels
        ] + [tuple(["that"] + fill_label(label)) for label in that_internal.column_labels]
        column_label_names = (
            cast(List[Optional[Tuple]], [None]) * (1 + level - this_internal.column_labels_level)
        ) + this_internal.column_label_names
        return DataFrame(
            InternalFrame(
                spark_frame=joined_df,
                index_spark_columns=index_spark_columns,
                index_names=this_internal.index_names,
                index_fields=index_fields,
                column_labels=column_labels,
                data_spark_columns=[scol_for(joined_df, col) for col in new_data_columns],
                data_fields=data_fields,
                column_label_names=column_label_names,
            )
        )
    else:
        raise ValueError(ERROR_MESSAGE_CANNOT_COMBINE)
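def _example_ops_on_diff_frames():
    # Illustrative sketch only; the helper name is hypothetical. Binary
    # operations between Series/DataFrames anchored to different DataFrames are
    # built on combine_frames() above, so they require the
    # "compute.ops_on_diff_frames" option; otherwise the join is refused.
    import pyspark.pandas as ps

    psdf1 = ps.DataFrame({"a": [1, 2, 3]})
    psdf2 = ps.DataFrame({"a": [10, 20, 30]})

    try:
        psdf1["a"] + psdf2["a"]  # raises ValueError while the option is off
    except ValueError as e:
        print(e)

    # Opting in allows the (potentially expensive) join on the merged index.
    with ps.option_context("compute.ops_on_diff_frames", True):
        combined = psdf1["a"] + psdf2["a"]
        print(combined.sort_index())  # 11, 22, 33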