def _verify_for_rename(self, name):
    """
    Validate and normalize the new level names passed to a rename.

    Parameters
    ----------
    name : list-like
        One candidate name per index level.

    Returns
    -------
    list of tuple
        The given names, where every entry that is not already a name-like
        tuple is wrapped into a 1-tuple.

    Raises
    ------
    TypeError
        If ``name`` is not list-like, or if any entry is not hashable.
    ValueError
        If the number of entries differs from the number of index levels.
    """
    if not is_list_like(name):
        raise TypeError("Must pass list-like as `names`.")

    if self._internal.index_level != len(name):
        message = "Length of new names must be {}, got {}".format(
            self._internal.index_level, len(name)
        )
        raise ValueError(message)

    if not all(is_hashable(candidate) for candidate in name):
        raise TypeError("MultiIndex.name must be a hashable type")

    normalized = []
    for candidate in name:
        if is_name_like_tuple(candidate):
            normalized.append(candidate)
        else:
            normalized.append((candidate,))
    return normalized
def attach_id_column(self, id_type: str, column: Union[Any, Tuple]) -> "DataFrame":
    """
    Attach a column that identifies rows, generated like the default index.

    See also `Default Index type
    <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

    Parameters
    ----------
    id_type : string
        The id type.

        - 'sequence' : a sequence that increases one by one.

          .. note:: this uses Spark's Window without specifying partition specification.
              This leads to move all data into single partition in single machine and
              could cause serious performance degradation.
              Avoid this method against very large dataset.

        - 'distributed-sequence' : a sequence that increases one by one,
          by group-by and group-map approach in a distributed manner.
        - 'distributed' : a monotonically increasing sequence simply by using PySpark's
          monotonically_increasing_id function in a fully distributed manner.

    column : string or tuple of string
        The label of the column to attach. It must have as many levels as the
        existing column labels and must not already exist.

    Returns
    -------
    DataFrame
        A new DataFrame with the id column appended after the existing columns.

    Raises
    ------
    ValueError
        If ``id_type`` is not one of the supported values, if ``column`` has a
        different number of levels than the existing column labels, or if
        ``column`` already exists.

    Examples
    --------
    >>> df = ks.DataFrame({"x": ['a', 'b', 'c']})
    >>> df.koalas.attach_id_column(id_type="sequence", column="id")
       x  id
    0  a   0
    1  b   1
    2  c   2

    >>> df.koalas.attach_id_column(id_type="distributed-sequence", column=0)
       x  0
    0  a  0
    1  b  1
    2  c  2

    >>> df.koalas.attach_id_column(id_type="distributed", column=0.0)
    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
       x  0.0
    0  a  ...
    1  b  ...
    2  c  ...

    For multi-index columns:

    >>> df = ks.DataFrame({("x", "y"): ['a', 'b', 'c']})
    >>> df.koalas.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
       x id-x
       y id-y
    0  a    0
    1  b    1
    2  c    2

    >>> df.koalas.attach_id_column(id_type="distributed-sequence", column=(0, 1.0))
       x   0
       y 1.0
    0  a   0
    1  b   1
    2  c   2
    """
    from databricks.koalas.frame import DataFrame

    # Resolve the attach function lazily by name, so only the selected
    # InternalFrame attribute is ever looked up.
    attach_method_names = (
        ("sequence", "attach_sequence_column"),
        ("distributed-sequence", "attach_distributed_sequence_column"),
        ("distributed", "attach_distributed_column"),
    )
    method_name = next(
        (method for known, method in attach_method_names if id_type == known), None
    )
    if method_name is None:
        raise ValueError(
            "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
        )
    attach_func = getattr(InternalFrame, method_name)

    assert is_name_like_value(column, allow_none=False), column
    label = column if is_name_like_tuple(column) else (column,)

    internal = self._kdf._internal

    if len(label) != internal.column_labels_level:
        raise ValueError(
            "The given column `{}` must be the same length as the existing columns.".format(
                label
            )
        )
    if label in internal.column_labels:
        raise ValueError(
            "The given column `{}` already exists.".format(name_like_string(label))
        )

    # Make sure the underlying Spark column names are the form of
    # `name_like_string(column_label)`.
    index_aliases = [
        scol.alias(SPARK_INDEX_NAME_FORMAT(i))
        for i, scol in enumerate(internal.index_spark_columns)
    ]
    data_aliases = [
        scol.alias(name_like_string(existing))
        for scol, existing in zip(internal.data_spark_columns, internal.column_labels)
    ]
    sdf = internal.spark_frame.select(index_aliases + data_aliases)

    id_column_name = name_like_string(label)
    sdf = attach_func(sdf, id_column_name)

    index_scols = [
        scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) for i in range(internal.index_level)
    ]
    data_scols = [
        scol_for(sdf, name_like_string(existing)) for existing in internal.column_labels
    ]
    data_scols.append(scol_for(sdf, id_column_name))

    new_internal = InternalFrame(
        spark_frame=sdf,
        index_spark_columns=index_scols,
        index_names=internal.index_names,
        index_dtypes=internal.index_dtypes,
        column_labels=internal.column_labels + [label],
        data_spark_columns=data_scols,
        # The dtype of the attached id column is passed as None.
        data_dtypes=(internal.data_dtypes + [None]),
        column_label_names=internal.column_label_names,
    )
    return DataFrame(new_internal.resolved_copy)
def to_frame(self, index=True, name=None) -> DataFrame:
    """
    Create a DataFrame with the levels of the MultiIndex as columns.

    Column ordering is determined by the DataFrame constructor with data as
    a dict.

    Parameters
    ----------
    index : boolean, default True
        Set the index of the returned DataFrame as the original MultiIndex.
    name : list / sequence of strings, optional
        Column names to use instead of the index level names. When omitted,
        each level name is used, and a level without a name falls back to
        its position.

    Returns
    -------
    DataFrame : a DataFrame containing the original MultiIndex data.

    Raises
    ------
    TypeError
        If ``name`` is given but is not list-like.
    ValueError
        If ``name`` does not have one entry per index level.

    See Also
    --------
    DataFrame

    Examples
    --------
    >>> tuples = [(1, 'red'), (1, 'blue'),
    ...           (2, 'red'), (2, 'blue')]
    >>> idx = ks.MultiIndex.from_tuples(tuples, names=('number', 'color'))
    >>> idx  # doctest: +SKIP
    MultiIndex([(1,  'red'),
                (1, 'blue'),
                (2,  'red'),
                (2, 'blue')],
               names=['number', 'color'])
    >>> idx.to_frame()  # doctest: +NORMALIZE_WHITESPACE
                  number color
    number color
    1      red         1   red
           blue        1  blue
    2      red         2   red
           blue        2  blue

    By default, the original Index is reused. To enforce a new Index:

    >>> idx.to_frame(index=False)
       number color
    0       1   red
    1       1  blue
    2       2   red
    3       2  blue

    To override the name of the resulting column, specify `name`:

    >>> idx.to_frame(name=['n', 'c'])  # doctest: +NORMALIZE_WHITESPACE
                  n     c
    number color
    1      red    1   red
           blue   1  blue
    2      red    2   red
           blue   2  blue
    """
    if name is None:
        # Unnamed levels are labelled by their position.
        column_names = [
            level_name if level_name is not None else (position,)
            for position, level_name in enumerate(self._internal.index_names)
        ]
    elif not is_list_like(name):
        raise TypeError("'name' must be a list / sequence of column names.")
    elif len(name) != self._internal.index_level:
        raise ValueError("'name' should have same length as number of levels on index.")
    else:
        column_names = [
            given if is_name_like_tuple(given) else (given,) for given in name
        ]

    return self._to_frame(index=index, names=column_names)
def from_frame(df, names=None) -> "MultiIndex":
    """
    Make a MultiIndex from a DataFrame.

    Parameters
    ----------
    df : DataFrame
        DataFrame to be converted to MultiIndex.
    names : list-like, optional
        If no names are provided, use the column names, or tuple of column
        names if the columns is a MultiIndex. If a sequence, overwrite
        names with the given sequence.

    Returns
    -------
    MultiIndex
        The MultiIndex representation of the given DataFrame.

    Raises
    ------
    TypeError
        If ``df`` is not a DataFrame.
    ValueError
        If ``names`` is given but is not list-like.

    See Also
    --------
    MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
    MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
    MultiIndex.from_product : Make a MultiIndex from cartesian product
                              of iterables.

    Examples
    --------
    >>> df = ks.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
    ...                    ['NJ', 'Temp'], ['NJ', 'Precip']],
    ...                   columns=['a', 'b'])
    >>> df  # doctest: +SKIP
          a       b
    0    HI    Temp
    1    HI  Precip
    2    NJ    Temp
    3    NJ  Precip

    >>> ks.MultiIndex.from_frame(df)  # doctest: +SKIP
    MultiIndex([('HI',   'Temp'),
                ('HI', 'Precip'),
                ('NJ',   'Temp'),
                ('NJ', 'Precip')],
               names=['a', 'b'])

    Using explicit names, instead of the column names

    >>> ks.MultiIndex.from_frame(df, names=['state', 'observation'])  # doctest: +SKIP
    MultiIndex([('HI',   'Temp'),
                ('HI', 'Precip'),
                ('NJ',   'Temp'),
                ('NJ', 'Precip')],
               names=['state', 'observation'])
    """
    if not isinstance(df, DataFrame):
        raise TypeError("Input must be a DataFrame")

    # The Spark frame is materialized before `names` is inspected.
    sdf = df.to_spark()

    if names is None:
        index_names = df._internal.column_labels
    elif is_list_like(names):
        index_names = [
            given if is_name_like_tuple(given) else (given,) for given in names
        ]
    else:
        raise ValueError("Names should be list-like for a MultiIndex")

    # Every column of the Spark frame becomes one index level.
    index_scols = [scol_for(sdf, col) for col in sdf.columns]
    internal = InternalFrame(
        spark_frame=sdf,
        index_spark_columns=index_scols,
        index_names=index_names,
    )
    return cast(MultiIndex, DataFrame(internal).index)
def __init__(
    self,
    spark_frame: spark.DataFrame,
    index_map: Optional[Dict[str, Optional[Tuple]]],
    column_labels: Optional[List[Tuple]] = None,
    data_spark_columns: Optional[List[spark.Column]] = None,
    column_label_names: Optional[List[Optional[Tuple]]] = None,
) -> None:
    """
    Create a new internal immutable DataFrame to manage Spark DataFrame, column fields and
    index fields and names.

    :param spark_frame: Spark DataFrame to be managed. Must not be a streaming DataFrame.
    :param index_map: ordered dictionary of string pairs
                       Each pair holds the index field name which exists in Spark fields,
                       and the index name. If this is None, a default index is attached
                       to the Spark DataFrame and used as the only index field.
    :param column_labels: list of tuples with the same length
                          The multi-level values in the tuples. If this is None, each data
                          column name wrapped in a 1-tuple is used.
    :param data_spark_columns: list of Spark Column
                               Spark Columns to appear as columns. If this is None,
                               calculated from spark_frame: every column that is neither
                               an index field nor a hidden column.
    :param column_label_names: Names for each of the column index levels. If this is None,
                               every level name is None.

    See the examples below to refer what each parameter means.

    >>> column_labels = pd.MultiIndex.from_tuples(
    ...     [('a', 'x'), ('a', 'y'), ('b', 'z')], names=["column_labels_a", "column_labels_b"])
    >>> row_index = pd.MultiIndex.from_tuples(
    ...     [('foo', 'bar'), ('foo', 'bar'), ('zoo', 'bar')],
    ...     names=["row_index_a", "row_index_b"])
    >>> kdf = ks.DataFrame(
    ...     [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=row_index, columns=column_labels)
    >>> kdf.set_index(('a', 'x'), append=True, inplace=True)
    >>> kdf  # doctest: +NORMALIZE_WHITESPACE
    column_labels_a                  a  b
    column_labels_b                  y  z
    row_index_a row_index_b (a, x)
    foo         bar         1       2  3
                            4       5  6
    zoo         bar         7       8  9

    >>> internal = kdf._internal

    >>> internal._sdf.show()  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    +-----------------+-----------------+------+------+------+...
    |__index_level_0__|__index_level_1__|(a, x)|(a, y)|(b, z)|...
    +-----------------+-----------------+------+------+------+...
    |              foo|              bar|     1|     2|     3|...
    |              foo|              bar|     4|     5|     6|...
    |              zoo|              bar|     7|     8|     9|...
    +-----------------+-----------------+------+------+------+...

    >>> internal._index_map  # doctest: +NORMALIZE_WHITESPACE
    OrderedDict([('__index_level_0__', ('row_index_a',)),
     ('__index_level_1__', ('row_index_b',)),
     ('(a, x)', ('a', 'x'))])

    >>> internal._column_labels
    [('a', 'y'), ('b', 'z')]

    >>> internal._data_spark_columns
    [Column<b'(a, y)'>, Column<b'(b, z)'>]

    >>> internal._column_label_names
    [('column_labels_a',), ('column_labels_b',)]
    """
    assert isinstance(spark_frame, spark.DataFrame)
    assert not spark_frame.isStreaming, "Koalas does not support Structured Streaming."

    if index_map is None:
        # No index was given: the frame must not already contain columns whose
        # names match the reserved index-name pattern.
        assert not any(
            SPARK_INDEX_NAME_PATTERN.match(name) for name in spark_frame.columns
        ), (
            "Index columns should not appear in columns of the Spark DataFrame. Avoid "
            "index column names [%s]." % SPARK_INDEX_NAME_PATTERN
        )

        # Narrow the frame to the requested data columns before the index is attached.
        if data_spark_columns is not None:
            spark_frame = spark_frame.select(data_spark_columns)

        # Create default index.
        # NOTE(review): attach_default_index is defined elsewhere; presumably it adds a
        # column named SPARK_DEFAULT_INDEX_NAME, as the filter below implies - confirm.
        spark_frame = InternalFrame.attach_default_index(spark_frame)
        index_map = OrderedDict({SPARK_DEFAULT_INDEX_NAME: None})

        # Re-resolve the data columns against the new frame, skipping the index column.
        if data_spark_columns is not None:
            data_spark_columns = [
                scol_for(spark_frame, col)
                for col in spark_frame.columns
                if col != SPARK_DEFAULT_INDEX_NAME
            ]

    # Add the natural-order column when the frame does not have it yet.
    if NATURAL_ORDER_COLUMN_NAME not in spark_frame.columns:
        spark_frame = spark_frame.withColumn(
            NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id())

    # index_map must be an OrderedDict mapping str field names to name-like tuples.
    assert isinstance(index_map, OrderedDict), index_map
    assert all(
        isinstance(index_field, str) and is_name_like_tuple(index_name, check_type=True)
        for index_field, index_name in index_map.items()), index_map
    assert data_spark_columns is None or all(
        isinstance(scol, spark.Column) for scol in data_spark_columns)

    self._sdf = spark_frame  # type: spark.DataFrame
    self._index_map = index_map  # type: Dict[str, Optional[Tuple]]

    if data_spark_columns is None:
        # Default data columns: everything that is neither an index field nor hidden.
        index_columns = set(index_column for index_column in self._index_map)
        self._data_spark_columns = [
            scol_for(spark_frame, col)
            for col in spark_frame.columns
            if col not in index_columns and col not in HIDDEN_COLUMNS
        ]
    else:
        self._data_spark_columns = data_spark_columns

    if column_labels is None:
        # Default labels: each selected column name wrapped in a 1-tuple.
        self._column_labels = [
            (col, ) for col in spark_frame.select(self._data_spark_columns).columns
        ]  # type: List[Tuple]
    else:
        # There must be exactly one label per data column.
        assert len(column_labels) == len(self._data_spark_columns), (
            len(column_labels),
            len(self._data_spark_columns),
        )
        if len(column_labels) == 1:
            # With a single label, the assertion payload is the label itself.
            column_label = column_labels[0]
            assert is_name_like_tuple(column_label, check_type=True), column_label
        else:
            assert all(
                is_name_like_tuple(column_label, check_type=True)
                for column_label in column_labels), column_labels
            # All labels must have the same number of levels.
            assert len(set(len(label) for label in column_labels)) <= 1, column_labels
        self._column_labels = column_labels

    if column_label_names is None:
        # One None entry per level reported by column_labels_level.
        self._column_label_names = [None] * column_labels_level(
            self._column_labels)  # type: List[Optional[Tuple]]
    else:
        if len(self._column_labels) > 0:
            # One name per column-label level.
            assert len(column_label_names) == column_labels_level(
                self._column_labels), (
                len(column_label_names),
                column_labels_level(self._column_labels),
            )
        else:
            # Without any column labels, only a non-empty list of names is required.
            assert len(column_label_names) > 0, len(column_label_names)
        assert all(
            is_name_like_tuple(column_label_name, check_type=True)
            for column_label_name in column_label_names), column_label_names
        self._column_label_names = column_label_names