def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIndex":
    """
    Form the intersection of two Index objects.

    This returns a new Index with elements common to the index and `other`.

    Parameters
    ----------
    other : Index or array-like

    Returns
    -------
    intersection : MultiIndex

    Examples
    --------
    >>> midx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    >>> midx2 = ps.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
    >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
    MultiIndex([('c', 'z')],
               )
    """
    if isinstance(other, Series) or not is_list_like(other):
        raise TypeError("other must be a MultiIndex or a list of tuples")
    elif isinstance(other, DataFrame):
        raise ValueError("Index data must be 1-dimensional")
    elif isinstance(other, MultiIndex):
        spark_frame_other = other.to_frame().to_spark()
        keep_name = self.names == other.names
    elif isinstance(other, Index):
        # Always returns an empty MultiIndex if `other` is Index.
        return cast(MultiIndex, self.to_frame().head(0).index)
    elif not all(isinstance(item, tuple) for item in other):
        raise TypeError("other must be a MultiIndex or a list of tuples")
    else:
        other = MultiIndex.from_tuples(list(other))
        spark_frame_other = cast(MultiIndex, other).to_frame().to_spark()
        keep_name = True

    index_fields = self._index_fields_for_union_like(other, func_name="intersection")

    default_name: List[Name] = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]
    spark_frame_self = self.to_frame(name=default_name).to_spark()
    spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
    if keep_name:
        index_names = self._internal.index_names
    else:
        index_names = None

    internal = InternalFrame(
        spark_frame=spark_frame_intersected,
        index_spark_columns=[
            scol_for(spark_frame_intersected, cast(str, col)) for col in default_name
        ],
        index_names=index_names,
        index_fields=index_fields,
    )
    return cast(MultiIndex, DataFrame(internal).index)
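A minimal usage sketch of the dispatch above, assuming a running Spark session and the `ps` alias from the doctest (illustrative only, not part of the module):

import pyspark.pandas as ps

midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])

# MultiIndex and list-of-tuples inputs both reach the Spark `intersect` path.
midx.intersection(ps.MultiIndex.from_tuples([("c", "z"), ("d", "w")]))
midx.intersection([("c", "z"), ("d", "w")])

# A plain (single-level) Index short-circuits to an empty MultiIndex,
# while a Series, DataFrame, or list of non-tuples raises.
midx.intersection(ps.Index([1, 2]))  # -> empty MultiIndex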
def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.Series[int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> ps.DataFrame[np.float]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> 'int':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.Series[int]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category,struct_field=StructField(index,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...     (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    spec = getfullargspec(f)
    tpe = spec.annotations.get("return", None)
    if isinstance(tpe, str):
        # This type hint can happen when given hints are strings to avoid forward references.
        tpe = resolve_string_type_hint(tpe)

    if hasattr(tpe, "__origin__") and (
        tpe.__origin__ == ps.DataFrame or tpe.__origin__ == ps.Series
    ):
        # When the Python version is lower than 3.7, unwrap it to a Tuple/SeriesType type hint.
        tpe = tpe.__args__[0]

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`.
    # Check if the name is Tuple.
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    if name == "Tuple":
        tuple_type = tpe
        if hasattr(tuple_type, "__tuple_params__"):
            # Python 3.5.0 to 3.5.2 has '__tuple_params__' instead.
            # See https://github.com/python/cpython/blob/v3.5.2/Lib/typing.py
            parameters = getattr(tuple_type, "__tuple_params__")
        else:
            parameters = getattr(tuple_type, "__args__")

        index_parameters = [
            p for p in parameters if isclass(p) and issubclass(p, IndexNameTypeHolder)
        ]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(data_parameters) > 0, "Type hints for data must not be empty."

        index_fields = []
        if len(index_parameters) >= 1:
            for level, index_parameter in enumerate(index_parameters):
                index_name = index_parameter.name
                index_dtype, index_spark_type = pandas_on_spark_type(index_parameter.tpe)
                index_fields.append(
                    InternalField(
                        dtype=index_dtype,
                        struct_field=types.StructField(
                            name=index_name
                            if index_name is not None
                            else SPARK_INDEX_NAME_FORMAT(level),
                            dataType=index_spark_type,
                        ),
                    )
                )
        else:
            # No type hint for index.
            assert len(index_parameters) == 0

        data_dtypes, data_spark_types = zip(
            *(
                pandas_on_spark_type(p.tpe)
                if isclass(p) and issubclass(p, NameTypeHolder)
                else pandas_on_spark_type(p)
                for p in data_parameters
            )
        )
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
            zip(data_names, data_dtypes, data_spark_types)
        ):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name)
                        if data_name is not None
                        else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                )
            )
        return DataFrameType(index_fields=index_fields, data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)
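For orientation, a short sketch of which branch each kind of annotation hits, assuming the doctest setup above (`ps` imported as pyspark.pandas); the function names here are illustrative only:

def scalar_func() -> int:
    pass

def series_func() -> ps.Series[int]:
    pass

def frame_func() -> ps.DataFrame["a": float, "b": int]:
    pass

infer_return_type(scalar_func)  # ScalarType: dtype('int64') / LongType
infer_return_type(series_func)  # SeriesType: int is unwrapped from ps.Series[...]
# ps.DataFrame[...] resolves to a typing.Tuple of NameTypeHolder subclasses,
# so the `name == "Tuple"` branch builds a DataFrameType with one
# InternalField per column ('a' -> DoubleType, 'b' -> LongType).
infer_return_type(frame_func)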
def combine_frames(
    this: "DataFrame",
    *args: DataFrameOrSeries,
    how: str = "full",
    preserve_order_column: bool = False
) -> "DataFrame":
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has prefix `this_` and `that_` to distinguish
    the column names from both DataFrames.

    It internally performs a join operation which can be expensive in general.
    So, if the `compute.ops_on_diff_frames` option is False,
    this method throws an exception.
    """
    from pyspark.pandas.config import get_option
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.internal import (
        InternalField,
        InternalFrame,
        HIDDEN_COLUMNS,
        NATURAL_ORDER_COLUMN_NAME,
        SPARK_INDEX_NAME_FORMAT,
    )
    from pyspark.pandas.series import Series

    if all(isinstance(arg, Series) for arg in args):
        assert all(
            same_anchor(arg, args[0]) for arg in args
        ), "Currently only one different DataFrame (from given Series) is supported"
        assert not same_anchor(this, args[0]), "We don't need to combine. All Series are in this."
        that = args[0]._psdf[list(args)]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        assert not same_anchor(
            this, args[0]
        ), "We don't need to combine. `this` and `that` are same."
        that = args[0]
    else:
        raise AssertionError("args should be a single DataFrame or single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):

        def resolve(internal: InternalFrame, side: str) -> InternalFrame:
            rename = lambda col: "__{}_{}".format(side, col)
            internal = internal.resolved_copy
            sdf = internal.spark_frame
            sdf = internal.spark_frame.select(
                *[
                    scol_for(sdf, col).alias(rename(col))
                    for col in sdf.columns
                    if col not in HIDDEN_COLUMNS
                ],
                *HIDDEN_COLUMNS,
            )
            return internal.copy(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.index_spark_column_names
                ],
                index_fields=[
                    field.copy(name=rename(field.name)) for field in internal.index_fields
                ],
                data_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.data_spark_column_names
                ],
                data_fields=[field.copy(name=rename(field.name)) for field in internal.data_fields],
            )

        this_internal = resolve(this._internal, "this")
        that_internal = resolve(that._internal, "that")

        this_index_map = list(
            zip(
                this_internal.index_spark_column_names,
                this_internal.index_names,
                this_internal.index_fields,
            )
        )
        that_index_map = list(
            zip(
                that_internal.index_spark_column_names,
                that_internal.index_names,
                that_internal.index_fields,
            )
        )
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # Note that the order of each element in index_map is guaranteed according to the index
        # level.
        this_and_that_index_map = list(zip(this_index_map, that_index_map))

        this_sdf = this_internal.spark_frame.alias("this")
        that_sdf = that_internal.spark_frame.alias("that")

        # If the same named index is found, that's used.
        index_column_names = []
        index_use_extension_dtypes = []
        for (
            i,
            ((this_column, this_name, this_field), (that_column, that_name, that_field)),
        ) in enumerate(this_and_that_index_map):
            if this_name == that_name:
                # We should merge the Spark columns into one
                # to mimic pandas' behavior.
                this_scol = scol_for(this_sdf, this_column)
                that_scol = scol_for(that_sdf, that_column)
                join_scol = this_scol == that_scol
                join_scols.append(join_scol)

                column_name = SPARK_INDEX_NAME_FORMAT(i)
                index_column_names.append(column_name)
                index_use_extension_dtypes.append(
                    any(field.is_extension_dtype for field in [this_field, that_field])
                )
                merged_index_scols.append(
                    F.when(this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(column_name)
                )
            else:
                raise ValueError("Index names must be exactly matched currently.")

        assert len(join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this_sdf.join(that_sdf, on=join_scols, how=how)

        if preserve_order_column:
            order_column = [scol_for(this_sdf, NATURAL_ORDER_COLUMN_NAME)]
        else:
            order_column = []

        joined_df = joined_df.select(
            *merged_index_scols,
            *(
                scol_for(this_sdf, this_internal.spark_column_name_for(label))
                for label in this_internal.column_labels
            ),
            *(
                scol_for(that_sdf, that_internal.spark_column_name_for(label))
                for label in that_internal.column_labels
            ),
            *order_column,
        )

        index_spark_columns = [scol_for(joined_df, col) for col in index_column_names]

        index_columns = set(index_column_names)
        new_data_columns = [
            col
            for col in joined_df.columns
            if col not in index_columns and col != NATURAL_ORDER_COLUMN_NAME
        ]

        schema = joined_df.select(*index_spark_columns, *new_data_columns).schema

        index_fields = [
            InternalField.from_struct_field(struct_field, use_extension_dtypes=use_extension_dtypes)
            for struct_field, use_extension_dtypes in zip(
                schema.fields[: len(index_spark_columns)], index_use_extension_dtypes
            )
        ]
        data_fields = [
            InternalField.from_struct_field(
                struct_field, use_extension_dtypes=field.is_extension_dtype
            )
            for struct_field, field in zip(
                schema.fields[len(index_spark_columns) :],
                this_internal.data_fields + that_internal.data_fields,
            )
        ]

        level = max(this_internal.column_labels_level, that_internal.column_labels_level)

        def fill_label(label: Optional[Tuple]) -> List:
            if label is None:
                return ([""] * (level - 1)) + [None]
            else:
                return ([""] * (level - len(label))) + list(label)

        column_labels = [
            tuple(["this"] + fill_label(label)) for label in this_internal.column_labels
        ] + [tuple(["that"] + fill_label(label)) for label in that_internal.column_labels]
        column_label_names = (
            cast(List[Optional[Tuple]], [None]) * (1 + level - this_internal.column_labels_level)
        ) + this_internal.column_label_names
        return DataFrame(
            InternalFrame(
                spark_frame=joined_df,
                index_spark_columns=index_spark_columns,
                index_names=this_internal.index_names,
                index_fields=index_fields,
                column_labels=column_labels,
                data_spark_columns=[scol_for(joined_df, col) for col in new_data_columns],
                data_fields=data_fields,
                column_label_names=column_label_names,
            )
        )
    else:
        raise ValueError(ERROR_MESSAGE_CANNOT_COMBINE)
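A usage-level sketch of when this helper fires (an assumption based on the docstring, not a verbatim call site): binary operations between DataFrames anchored to different Spark frames go through combine_frames, guarded by the option it checks:

import pyspark.pandas as ps

ps.set_option("compute.ops_on_diff_frames", True)
psdf1 = ps.DataFrame({"a": [1, 2, 3]})
psdf2 = ps.DataFrame({"a": [10, 20, 30]})
psdf1 + psdf2  # internally joins the two frames on their same-named indexes
ps.reset_option("compute.ops_on_diff_frames")  # the default (False) raises instead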
def attach_id_column(self, id_type: str, column: Name) -> "DataFrame":
    """
    Attach a column to be used as an identifier of rows, similar to the default index.

    See also `Default Index type
    <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

    Parameters
    ----------
    id_type : string
        The id type.

        - 'sequence' : a sequence that increases one by one.

          .. note:: this uses Spark's Window without specifying a partition specification.
              This leads to moving all data into a single partition on a single machine
              and could cause serious performance degradation.
              Avoid this method with very large datasets.

        - 'distributed-sequence' : a sequence that increases one by one,
          by a group-by and group-map approach in a distributed manner.
        - 'distributed' : a monotonically increasing sequence simply by using PySpark's
          monotonically_increasing_id function in a fully distributed manner.

    column : string or tuple of string
        The column name.

    Returns
    -------
    DataFrame
        The DataFrame with the attached column.

    Examples
    --------
    >>> df = ps.DataFrame({"x": ['a', 'b', 'c']})
    >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column="id")
       x  id
    0  a   0
    1  b   1
    2  c   2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=0)
       x  0
    0  a  0
    1  b  1
    2  c  2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed", column=0.0)
    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
       x  0.0
    0  a  ...
    1  b  ...
    2  c  ...

    For multi-index columns:

    >>> df = ps.DataFrame({("x", "y"): ['a', 'b', 'c']})
    >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
       x id-x
       y id-y
    0  a    0
    1  b    1
    2  c    2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=(0, 1.0))
       x   0
       y 1.0
    0  a   0
    1  b   1
    2  c   2
    """
    from pyspark.pandas.frame import DataFrame

    if id_type == "sequence":
        attach_func = InternalFrame.attach_sequence_column
    elif id_type == "distributed-sequence":
        attach_func = InternalFrame.attach_distributed_sequence_column
    elif id_type == "distributed":
        attach_func = InternalFrame.attach_distributed_column
    else:
        raise ValueError(
            "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
        )

    assert is_name_like_value(column, allow_none=False), column
    if not is_name_like_tuple(column):
        column = (column,)

    internal = self._psdf._internal

    if len(column) != internal.column_labels_level:
        raise ValueError(
            "The given column `{}` must be the same length as the existing columns.".format(
                column
            )
        )
    elif column in internal.column_labels:
        raise ValueError(
            "The given column `{}` already exists.".format(name_like_string(column))
        )

    # Make sure the underlying Spark column names are the form of
    # `name_like_string(column_label)`.
    sdf = internal.spark_frame.select(
        [
            scol.alias(SPARK_INDEX_NAME_FORMAT(i))
            for i, scol in enumerate(internal.index_spark_columns)
        ]
        + [
            scol.alias(name_like_string(label))
            for scol, label in zip(internal.data_spark_columns, internal.column_labels)
        ]
    )
    sdf = attach_func(sdf, name_like_string(column))

    return DataFrame(
        InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) for i in range(internal.index_level)
            ],
            index_names=internal.index_names,
            index_fields=internal.index_fields,
            column_labels=internal.column_labels + [column],
            data_spark_columns=(
                [scol_for(sdf, name_like_string(label)) for label in internal.column_labels]
                + [scol_for(sdf, name_like_string(column))]
            ),
            data_fields=internal.data_fields
            + [
                InternalField.from_struct_field(
                    StructField(name_like_string(column), LongType(), nullable=False)
                )
            ],
            column_label_names=internal.column_label_names,
        ).resolved_copy
    )
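A quick side-by-side of the three id types, assuming `ps` as in the docstring; "distributed" ids are monotonically increasing but not necessarily consecutive:

df = ps.DataFrame({"x": ["a", "b", "c"]})
df.pandas_on_spark.attach_id_column("sequence", "id")              # 0, 1, 2 via a single-partition Window
df.pandas_on_spark.attach_id_column("distributed-sequence", "id")  # 0, 1, 2 without collecting to one partition
df.pandas_on_spark.attach_id_column("distributed", "id")           # increasing, with gaps allowed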
def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.Series[int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> ps.DataFrame[np.float]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> 'int':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.Series[int]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category, struct_field=StructField('index', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...     (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    tpe = get_type_hints(f).get("return", None)

    if tpe is None:
        raise ValueError("A return value is required for the input function")

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Tuple has `_name` but other types have `__name__`.
    # Check if the name is Tuple.
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    if name == "Tuple":
        tuple_type = tpe
        parameters = getattr(tuple_type, "__args__")

        index_parameters = [
            p for p in parameters if isclass(p) and issubclass(p, IndexNameTypeHolder)
        ]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(data_parameters) > 0, "Type hints for data must not be empty."

        index_fields = []
        if len(index_parameters) >= 1:
            for level, index_parameter in enumerate(index_parameters):
                index_name = index_parameter.name
                index_dtype, index_spark_type = pandas_on_spark_type(index_parameter.tpe)
                index_fields.append(
                    InternalField(
                        dtype=index_dtype,
                        struct_field=types.StructField(
                            name=index_name
                            if index_name is not None
                            else SPARK_INDEX_NAME_FORMAT(level),
                            dataType=index_spark_type,
                        ),
                    )
                )
        else:
            # No type hint for index.
            assert len(index_parameters) == 0

        data_dtypes, data_spark_types = zip(
            *(
                pandas_on_spark_type(p.tpe)
                if isclass(p) and issubclass(p, NameTypeHolder)
                else pandas_on_spark_type(p)
                for p in data_parameters
            )
        )
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
            zip(data_names, data_dtypes, data_spark_types)
        ):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name)
                        if data_name is not None
                        else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                )
            )
        return DataFrameType(index_fields=index_fields, data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)
def test_from_pandas(self):
    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    internal = InternalFrame.from_pandas(pdf)
    sdf = internal.spark_frame

    self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
    self.assert_eq(internal.index_names, [None])
    self.assert_eq(internal.column_labels, [("a",), ("b",)])
    self.assert_eq(internal.data_spark_column_names, ["a", "b"])
    self.assertTrue(spark_column_equals(internal.spark_column_for(("a",)), sdf["a"]))
    self.assertTrue(spark_column_equals(internal.spark_column_for(("b",)), sdf["b"]))

    self.assert_eq(internal.to_pandas_frame, pdf)

    # non-string column name
    pdf1 = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})
    internal = InternalFrame.from_pandas(pdf1)
    sdf = internal.spark_frame

    self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
    self.assert_eq(internal.index_names, [None])
    self.assert_eq(internal.column_labels, [(0,), (1,)])
    self.assert_eq(internal.data_spark_column_names, ["0", "1"])
    self.assertTrue(spark_column_equals(internal.spark_column_for((0,)), sdf["0"]))
    self.assertTrue(spark_column_equals(internal.spark_column_for((1,)), sdf["1"]))

    self.assert_eq(internal.to_pandas_frame, pdf1)

    # multi-index
    pdf.set_index("a", append=True, inplace=True)

    internal = InternalFrame.from_pandas(pdf)
    sdf = internal.spark_frame

    self.assert_eq(
        internal.index_spark_column_names,
        [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
    )
    self.assert_eq(internal.index_names, [None, ("a",)])
    self.assert_eq(internal.column_labels, [("b",)])
    self.assert_eq(internal.data_spark_column_names, ["b"])
    self.assertTrue(spark_column_equals(internal.spark_column_for(("b",)), sdf["b"]))

    self.assert_eq(internal.to_pandas_frame, pdf)

    # multi-index columns
    pdf.columns = pd.MultiIndex.from_tuples([("x", "b")])

    internal = InternalFrame.from_pandas(pdf)
    sdf = internal.spark_frame

    self.assert_eq(
        internal.index_spark_column_names,
        [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
    )
    self.assert_eq(internal.index_names, [None, ("a",)])
    self.assert_eq(internal.column_labels, [("x", "b")])
    self.assert_eq(internal.data_spark_column_names, ["(x, b)"])
    self.assertTrue(spark_column_equals(internal.spark_column_for(("x", "b")), sdf["(x, b)"]))

    self.assert_eq(internal.to_pandas_frame, pdf)