def corr(kdf: "ks.DataFrame", method: str = "pearson") -> pd.DataFrame:
    """
    Compute the pairwise correlation matrix of a Koalas dataframe.

    The frame is first converted with ``to_numeric_df``; as the example below
    shows, a string column does not appear in the result. Only scalar
    numerical values are accepted for now.

    :param kdf: the Koalas dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame` labelled on both axes by the column labels
             returned by ``to_numeric_df``.

    >>> ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ("pearson", "spearman")

    numeric_sdf, labels = to_numeric_df(kdf)
    result_pdf = Correlation.corr(numeric_sdf, CORRELATION_OUTPUT_COLUMN, method).toPandas()
    # The whole matrix comes back as a single cell; unpack it into an array.
    matrix = result_pdf.iloc[0, 0].toArray()

    # Multi-level labels become a MultiIndex; otherwise use the single level.
    if column_labels_level(labels) > 1:
        axis = pd.MultiIndex.from_tuples(labels)
    else:
        axis = pd.Index([label[0] for label in labels])
    return pd.DataFrame(matrix, columns=axis, index=axis)
def __getitem__(self, key):
    """
    Select rows (and, for a DataFrame, columns) from the wrapped Koalas object.

    When the indexer wraps a Series, ``key`` is a row selector. When it wraps
    a DataFrame, ``key`` is either a row selector or a ``(rows, columns)``
    pair. Row selection is delegated to ``self._select_rows`` and column
    selection to ``self._select_cols``.

    A Series used as the row selector that is anchored to a different frame
    is first attached as a temporary ``"__temp_col__"`` column, and the
    lookup is retried against that combined frame.

    :param key: row selector, or a ``(rows, columns)`` tuple for a DataFrame.
    :return: the wrapped object itself when a Series lookup has neither a row
             condition nor a limit; otherwise a Koalas Series or DataFrame.
             When ``_select_rows`` reports ``remaining_index == 0`` and exactly
             one row matches, the first element of the pandas conversion is
             returned instead.
    :raises SparkPandasIndexingError: if a tuple key does not have exactly two
             elements.
    :raises KeyError: if Spark raises ``AnalysisException`` while building the
             selection, or if ``remaining_index == 0`` and no row matches.
    """
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    if self._is_series:
        if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
            # The key lives in another frame: attach it as a temporary column
            # and redo the lookup on the combined frame.
            kdf = self._kdf_or_kser.to_frame()
            kdf["__temp_col__"] = key
            return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

        cond, limit, remaining_index = self._select_rows(key)
        if cond is None and limit is None:
            # Nothing to filter: hand back the wrapped Series unchanged.
            return self._kdf_or_kser

        column_labels = self._internal.column_labels
        column_scols = self._internal.column_scols
        returns_series = True
    else:
        assert self._is_df

        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
            # Same trick as above; the trailing column selection removes the
            # temporary column from the result.
            kdf = self._kdf_or_kser.copy()
            kdf["__temp_col__"] = rows_sel
            return type(self)(kdf)[kdf["__temp_col__"], cols_sel][
                list(self._kdf_or_kser.columns)
            ]

        cond, limit, remaining_index = self._select_rows(rows_sel)
        column_labels, column_scols, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            # Pure column pick: return a Series anchored to the wrapped frame.
            return Series(
                self._internal.copy(
                    scol=column_scols[0], column_labels=[column_labels[0]]
                ),
                anchor=self._kdf_or_kser,
            )

    if remaining_index is not None:
        # Keep only the trailing index levels. A value of 0 slices as [-0:],
        # i.e. keeps every level; that case is resolved by the row-count
        # check at the end of this method.
        index_scols = self._internal.index_scols[-remaining_index:]
        index_map = self._internal.index_map[-remaining_index:]
    else:
        index_scols = self._internal.index_scols
        index_map = self._internal.index_map

    if self._internal.column_label_names is None:
        column_label_names = None
    else:
        # Manage column index names
        level = column_labels_level(column_labels)
        column_label_names = self._internal.column_label_names[-level:]

    try:
        sdf = self._internal._sdf
        if cond is not None:
            sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
        if limit is not None:
            if limit >= 0:
                sdf = sdf.limit(limit)
            else:
                # A negative limit means "all but the last -limit rows".
                # Clamp at zero: when -limit exceeds the row count the result
                # is empty, and a negative value must not reach Spark, where
                # it would be rejected and reported below as a missing column.
                sdf = sdf.limit(max(sdf.count() + limit, 0))

        sdf = sdf.select(index_scols + column_scols)
    except AnalysisException:
        raise KeyError(
            "[{}] don't exist in columns".format(
                [col._jc.toString() for col in column_scols]
            )
        )

    internal = _InternalFrame(
        sdf=sdf,
        index_map=index_map,
        column_labels=column_labels,
        column_label_names=column_label_names,
    )
    kdf = DataFrame(internal)

    if returns_series:
        kdf_or_kser = Series(
            kdf._internal.copy(scol=kdf._internal.column_scols[0]), anchor=kdf
        )
    else:
        kdf_or_kser = kdf

    if remaining_index is not None and remaining_index == 0:
        # Fetch at most two rows to tell "no match", "exactly one match" and
        # "several matches" apart.
        pdf_or_pser = kdf_or_kser.head(2).to_pandas()
        length = len(pdf_or_pser)
        if length == 0:
            raise KeyError(name_like_string(key))
        elif length == 1:
            return pdf_or_pser.iloc[0]
        else:
            return kdf_or_kser
    else:
        return kdf_or_kser
def column_labels_level(self) -> int:
    """
    Return the number of levels of the column index.

    Delegates to the module-level ``column_labels_level`` helper, applied to
    this object's ``_column_labels``.
    """
    labels = self._column_labels
    return column_labels_level(labels)
def __init__(
    self,
    spark_frame: spark.DataFrame,
    index_map: Optional[Dict[str, Optional[Tuple]]],
    column_labels: Optional[List[Tuple]] = None,
    data_spark_columns: Optional[List[spark.Column]] = None,
    column_label_names: Optional[List[Optional[Tuple[str, ...]]]] = None,
) -> None:
    """
    Build an internal immutable DataFrame that manages a Spark DataFrame
    together with its column fields and its index fields and names.

    :param spark_frame: Spark DataFrame to be managed. Must not be streaming.
    :param index_map: ordered dictionary of string pairs.
        Each pair holds the index field name which exists in Spark fields,
        and the index name. If None, a default index is attached to
        ``spark_frame`` and used.
    :param column_labels: list of tuples with the same length.
        The multi-level values in the tuples. If None, one single-element
        tuple is created per data column name.
    :param data_spark_columns: list of Spark Column.
        Spark Columns to appear as columns. If None, they are calculated
        from ``spark_frame``: every column that is neither an index column
        nor listed in ``HIDDEN_COLUMNS``.
    :param column_label_names: Names for each of the column index levels.
        If None, one ``None`` per column label level is used.

    See the examples below to refer what each parameter means.

    >>> column_labels = pd.MultiIndex.from_tuples(
    ...     [('a', 'x'), ('a', 'y'), ('b', 'z')], names=["column_labels_a", "column_labels_b"])
    >>> row_index = pd.MultiIndex.from_tuples(
    ...     [('foo', 'bar'), ('foo', 'bar'), ('zoo', 'bar')],
    ...     names=["row_index_a", "row_index_b"])
    >>> kdf = ks.DataFrame(
    ...     [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=row_index, columns=column_labels)
    >>> kdf.set_index(('a', 'x'), append=True, inplace=True)
    >>> kdf  # doctest: +NORMALIZE_WHITESPACE
    column_labels_a                  a  b
    column_labels_b                  y  z
    row_index_a row_index_b (a, x)
    foo         bar         1       2  3
                            4       5  6
    zoo         bar         7       8  9

    >>> internal = kdf._internal

    >>> internal._sdf.show()  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    +-----------------+-----------------+------+------+------+...
    |__index_level_0__|__index_level_1__|(a, x)|(a, y)|(b, z)|...
    +-----------------+-----------------+------+------+------+...
    |              foo|              bar|     1|     2|     3|...
    |              foo|              bar|     4|     5|     6|...
    |              zoo|              bar|     7|     8|     9|...
    +-----------------+-----------------+------+------+------+...

    >>> internal._index_map  # doctest: +NORMALIZE_WHITESPACE
    OrderedDict([('__index_level_0__', ('row_index_a',)),
     ('__index_level_1__', ('row_index_b',)),
     ('(a, x)', ('a', 'x'))])

    >>> internal._column_labels
    [('a', 'y'), ('b', 'z')]

    >>> internal._data_spark_columns
    [Column<b'(a, y)'>, Column<b'(b, z)'>]

    >>> internal._column_label_names
    [('column_labels_a',), ('column_labels_b',)]
    """

    def _all_spark_typed(values) -> bool:
        # True when every entry is None or as_spark_type gives a non-None
        # result for its type.
        return all(
            value is None or as_spark_type(type(value)) is not None for value in values
        )

    def _is_name_tuple(candidate) -> bool:
        # Shape required of index names and column label names.
        return isinstance(candidate, tuple) and _all_spark_typed(candidate)

    def _is_label_tuple(candidate) -> bool:
        # Shape required of column labels: like a name tuple, but non-empty.
        return (
            isinstance(candidate, tuple)
            and len(candidate) > 0
            and _all_spark_typed(candidate)
        )

    assert isinstance(spark_frame, spark.DataFrame)
    assert not spark_frame.isStreaming, "Koalas does not support Structured Streaming."

    if index_map is None:
        assert not any(
            SPARK_INDEX_NAME_PATTERN.match(name) for name in spark_frame.columns
        ), (
            "Index columns should not appear in columns of the Spark DataFrame. Avoid "
            "index column names [%s]." % SPARK_INDEX_NAME_PATTERN
        )

        columns_given = data_spark_columns is not None
        if columns_given:
            spark_frame = spark_frame.select(data_spark_columns)

        # Create default index.
        spark_frame = InternalFrame.attach_default_index(spark_frame)
        index_map = OrderedDict({SPARK_DEFAULT_INDEX_NAME: None})

        if columns_given:
            # Re-derive the columns from the new frame, leaving out the
            # default index column that was just attached.
            data_spark_columns = [
                scol_for(spark_frame, name)
                for name in spark_frame.columns
                if name != SPARK_DEFAULT_INDEX_NAME
            ]

    if NATURAL_ORDER_COLUMN_NAME not in spark_frame.columns:
        spark_frame = spark_frame.withColumn(
            NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id()
        )

    assert isinstance(index_map, OrderedDict), index_map
    assert all(
        isinstance(field, str) and (name is None or _is_name_tuple(name))
        for field, name in index_map.items()
    ), index_map
    assert data_spark_columns is None or all(
        isinstance(column, spark.Column) for column in data_spark_columns
    )

    self._sdf = spark_frame  # type: spark.DataFrame
    self._index_map = index_map  # type: Dict[str, Optional[Tuple]]

    if data_spark_columns is not None:
        self._data_spark_columns = data_spark_columns
    else:
        index_fields = set(self._index_map)
        self._data_spark_columns = [
            scol_for(spark_frame, name)
            for name in spark_frame.columns
            if not (name in index_fields or name in HIDDEN_COLUMNS)
        ]

    if column_labels is None:
        data_names = spark_frame.select(self._data_spark_columns).columns
        self._column_labels = [(name,) for name in data_names]  # type: List[Tuple]
    else:
        assert len(column_labels) == len(self._data_spark_columns), (
            len(column_labels),
            len(self._data_spark_columns),
        )
        if len(column_labels) == 1:
            # A lone label is additionally allowed to be None.
            only_label = column_labels[0]
            assert only_label is None or _is_label_tuple(only_label), only_label
        else:
            assert all(_is_label_tuple(label) for label in column_labels), column_labels
            # All labels must have the same number of levels.
            assert len({len(label) for label in column_labels}) <= 1, column_labels
        self._column_labels = column_labels

    if column_label_names is None:
        self._column_label_names = [None] * column_labels_level(
            self._column_labels
        )  # type: List[Optional[Tuple[str, ...]]]
    else:
        if len(self._column_labels) > 0:
            assert len(column_label_names) == column_labels_level(self._column_labels), (
                len(column_label_names),
                column_labels_level(self._column_labels),
            )
        else:
            assert len(column_label_names) > 0, len(column_label_names)
        assert all(
            level_name is None or _is_name_tuple(level_name)
            for level_name in column_label_names
        ), column_label_names
        self._column_label_names = column_label_names