Example #1
0
def corr(kdf: 'ks.DataFrame', method: str = 'pearson') -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param kdf: the koalas dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`
    :raise ValueError: if `method` is not 'pearson' or 'spearman'.

    >>> ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    # Validate with a real exception: `assert` is silently stripped under -O.
    if method not in ('pearson', 'spearman'):
        raise ValueError("method should be 'pearson' or 'spearman'; got {!r}".format(method))
    ndf, column_index = to_numeric_df(kdf)
    corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method)
    pcorr = corr.toPandas()
    # MLlib returns a single-cell DataFrame whose cell is a Spark matrix.
    arr = pcorr.iloc[0, 0].toArray()
    if column_index_level(column_index) > 1:
        idx = pd.MultiIndex.from_tuples(column_index)
    else:
        # Single-level index: unwrap each 1-tuple. Loop variable renamed so it
        # no longer shadows the `idx` being assigned.
        idx = pd.Index([i[0] for i in column_index])
    return pd.DataFrame(arr, columns=idx, index=idx)
Example #2
0
    def __getitem__(self, key):
        """Resolve ``key`` into a row/column selection and return the result.

        Returns a Series when a single column is selected, otherwise a
        DataFrame.  Raises ``KeyError`` if Spark cannot resolve the requested
        columns.
        """
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        # Split the key into a row selector and a column selector.
        row_sel, col_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

        cond, limit = self._select_rows(row_sel)
        column_index, columns, returns_series = self._select_cols(col_sel)

        # Fast path: no row filtering or limiting and a single column wanted —
        # anchor a Series on the existing frame instead of building a new one.
        if returns_series and cond is None and limit is None:
            if self._is_series:
                return self._kdf_or_kser._with_new_scol(columns[0])
            return Series(self._internal.copy(scol=columns[0], column_index=[column_index[0]]),
                          anchor=self._kdf_or_kser)

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.where(cond)
            if limit is not None:
                # A negative limit keeps all but the last |limit| rows.
                sdf = sdf.limit(limit if limit >= 0 else sdf.count() + limit)

            sdf = sdf.select(self._internal.index_scols + columns)

            if self._internal.column_index_names is None:
                column_index_names = None
            else:
                # Keep only the trailing column-index names still present.
                depth = column_index_level(column_index)
                column_index_names = self._internal.column_index_names[-depth:]

            kdf = DataFrame(_InternalFrame(sdf=sdf,
                                           index_map=self._internal.index_map,
                                           column_index=column_index,
                                           column_index_names=column_index_names))
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'
                           .format([col._jc.toString() for col in columns]))

        if not returns_series:
            return kdf
        return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                      anchor=kdf)
Example #3
0
 def column_index_level(self) -> int:
     """Return how many levels this frame's column index has."""
     current_index = self._column_index
     return column_index_level(current_index)
Example #4
0
    def __getitem__(self, key):
        """Select rows (and, for a DataFrame, optionally columns) by ``key``.

        For a Series accessor ``key`` is a row selector; for a DataFrame
        accessor it may be a 2-tuple ``(rows, cols)``.  Returns a scalar /
        pandas row when every index level is consumed and exactly one row
        matches, otherwise a Series or DataFrame.

        :raise SparkPandasIndexingError: DataFrame key tuple whose length != 2.
        :raise KeyError: no row matches a fully-specified index key, or the
            requested columns cannot be resolved by Spark.
        """
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key,
                          Series) and key._kdf is not self._kdf_or_kser._kdf:
                # Row-selector Series from a different frame: join it in as a
                # temporary column so both sides share one frame, then recurse.
                kdf = self._kdf_or_kser.to_frame()
                kdf['__temp_col__'] = key
                return type(self)(
                    kdf[self._kdf_or_kser.name])[kdf['__temp_col__']]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
                # Selecting everything: no new Spark plan needed.
                return self._kdf_or_kser

            column_index = self._internal.column_index
            column_scols = self._internal.column_scols
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel,
                          Series) and rows_sel._kdf is not self._kdf_or_kser:
                # Same alignment trick as above, for DataFrame row selectors.
                kdf = self._kdf_or_kser.copy()
                kdf['__temp_col__'] = rows_sel
                return type(self)(kdf)[kdf['__temp_col__'], cols_sel][list(
                    self._kdf_or_kser.columns)]

            cond, limit, remaining_index = self._select_rows(rows_sel)
            column_index, column_scols, returns_series = self._select_cols(
                cols_sel)

            if cond is None and limit is None and returns_series:
                # Fast path: single column, no row filtering — anchor on the
                # existing frame.
                return Series(self._internal.copy(
                    scol=column_scols[0], column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)

        if remaining_index is not None:
            # Keep only the innermost `remaining_index` index levels.
            index_scols = self._internal.index_scols[-remaining_index:]
            index_map = self._internal.index_map[-remaining_index:]
        else:
            index_scols = self._internal.index_scols
            index_map = self._internal.index_map

        if self._internal.column_index_names is None:
            column_index_names = None
        else:
            # Keep only the trailing column-index names still present.
            level = column_index_level(column_index)
            column_index_names = self._internal.column_index_names[-level:]

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    # Negative limit keeps all but the last |limit| rows.
                    sdf = sdf.limit(sdf.count() + limit)

            sdf = sdf.select(index_scols + column_scols)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in column_scols]))

        internal = _InternalFrame(sdf=sdf,
                                  index_map=index_map,
                                  column_index=column_index,
                                  column_index_names=column_index_names)
        kdf = DataFrame(internal)

        if returns_series:
            kdf_or_kser = Series(
                kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                anchor=kdf)
        else:
            kdf_or_kser = kdf

        # `remaining_index is not None and remaining_index == 0` was redundant:
        # `None == 0` is already False, so the equality test alone suffices.
        if remaining_index == 0:
            # Every index level was consumed: at most one row can match, so
            # unwrap to a scalar / pandas row when exactly one does.
            pdf_or_pser = kdf_or_kser.head(2).to_pandas()
            length = len(pdf_or_pser)
            if length == 0:
                raise KeyError(name_like_string(key))
            elif length == 1:
                return pdf_or_pser.iloc[0]
            else:
                return kdf_or_kser
        else:
            return kdf_or_kser
Example #5
0
    def __getitem__(self, key):
        """Select rows and/or columns by ``key``.

        Returns a Series when a single column is selected with no row
        condition, otherwise a DataFrame (or Series) built from a new Spark
        plan.

        :raise TypeError: selector mixes plain string labels and multi-index
            tuples.
        :raise ValueError: tuple keys whose length differs from the column
            index level.
        :raise KeyError: a label is not found, or Spark cannot resolve the
            requested columns.
        """
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

        cond = self._select_rows(rows_sel)

        # Normalize the column selector: a Series becomes its column, a full
        # slice means "all columns", any other slice is unsupported.
        if isinstance(cols_sel, Series):
            cols_sel = _make_col(cols_sel)
        elif isinstance(cols_sel, slice) and cols_sel != slice(None):
            raise LocIndexer._raiseNotImplemented(
                "Can only select columns either by name or reference or all")
        elif isinstance(cols_sel, slice) and cols_sel == slice(None):
            cols_sel = None

        # `cols_sel is not None and isinstance(...)` was redundant:
        # isinstance already returns False for None.
        returns_series = isinstance(cols_sel, spark.Column)
        if cols_sel is None:
            column_index = self._internal.column_index
            columns = self._internal.column_scols
        elif isinstance(cols_sel, (str, tuple)):
            if isinstance(cols_sel, str):
                cols_sel = (cols_sel,)
            column_index, columns, returns_series = \
                self._get_from_multiindex_column(cols_sel)
        elif isinstance(cols_sel, spark.Column):
            columns = [cols_sel]
            column_index = None
        # Loop variables below are named `col_key` (not `key`) so they do not
        # shadow the method parameter.
        elif all(isinstance(col_key, Series) for col_key in cols_sel):
            columns = [_make_col(col_key) for col_key in cols_sel]
            column_index = [col_key._internal.column_index[0] for col_key in cols_sel]
        elif all(isinstance(col_key, spark.Column) for col_key in cols_sel):
            columns = cols_sel
            column_index = None
        elif (any(isinstance(col_key, str) for col_key in cols_sel)
              and any(isinstance(col_key, tuple) for col_key in cols_sel)):
            # Mixing plain labels and multi-index tuples is ambiguous.
            raise TypeError('Expected tuple, got str')
        else:
            if all(isinstance(col_key, tuple) for col_key in cols_sel):
                level = self._internal.column_index_level
                if any(len(col_key) != level for col_key in cols_sel):
                    raise ValueError('All the key level should be the same as column index level.')

            column_to_index = list(zip(self._internal.data_columns,
                                       self._internal.column_index))
            columns = []
            column_index = []
            for col_key in cols_sel:
                found = False
                for column, idx in column_to_index:
                    # Match against the full index tuple or its first level.
                    if idx == col_key or idx[0] == col_key:
                        columns.append(_make_col(column))
                        column_index.append(idx)
                        found = True
                if not found:
                    raise KeyError("['{}'] not in index".format(col_key))

        if cond is None and returns_series:
            # Fast path: single column, no row condition — anchor a Series on
            # the existing frame without a new Spark plan.
            if self._is_series:
                return self._kdf_or_kser._with_new_scol(columns[0])
            else:
                return Series(self._internal.copy(scol=columns[0], column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)
        else:
            try:
                sdf = self._internal._sdf
                if cond is not None:
                    sdf = sdf.where(cond)

                sdf = sdf.select(self._internal.index_scols + columns)

                if self._internal.column_index_names is None:
                    column_index_names = None
                else:
                    # Keep only the trailing column-index names still present.
                    level = column_index_level(column_index)
                    column_index_names = self._internal.column_index_names[-level:]

                internal = _InternalFrame(sdf=sdf,
                                          index_map=self._internal.index_map,
                                          column_index=column_index,
                                          column_index_names=column_index_names)
                kdf = DataFrame(internal)
            except AnalysisException:
                raise KeyError('[{}] don\'t exist in columns'
                               .format([col._jc.toString() for col in columns]))

            if returns_series:
                return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                              anchor=kdf)
            else:
                return kdf