Exemple #1
0
    def pandas_df(self):
        """Collect the internal Spark DataFrame into a pandas DataFrame.

        The collected frame gets the index columns installed as its index,
        its data columns selected and relabelled from ``self._column_index``,
        and its column/index names applied.

        Returns
        -------
        pandas.DataFrame
        """
        spark_frame = self.spark_internal_df
        frame = spark_frame.toPandas()
        if not len(frame) and len(spark_frame.schema):
            # No rows were collected but the schema has fields: cast every
            # column to the dtype mapped from its Spark field type.
            dtypes = {}
            for field in spark_frame.schema:
                dtypes[field.name] = spark_type_to_pandas_dtype(field.dataType)
            frame = frame.astype(dtypes)

        index_columns = self.index_columns
        if len(index_columns):
            for position, index_field in enumerate(index_columns):
                # Keep the column in the frame when it is also a data column;
                # every index level after the first is appended.
                frame = frame.set_index(
                    index_field,
                    drop=index_field not in self.data_columns,
                    append=position > 0,
                )

            selected = []
            for position, (column, label) in enumerate(
                    zip(self.data_columns, self.column_index)):
                if column in index_columns:
                    selected.append(column)
                elif label is None:
                    selected.append(str(position))
                else:
                    selected.append(name_like_string(label))
            frame = frame[selected]

        if self.column_index_level > 1:
            new_columns = pd.MultiIndex.from_tuples(self._column_index)
        else:
            new_columns = []
            for label in self._column_index:
                new_columns.append(None if label is None else label[0])
        frame.columns = new_columns

        column_level_names = self._column_index_names
        if column_level_names is not None:
            frame.columns.names = column_level_names

        index_names = self.index_names
        if len(index_names):
            flattened = []
            for name in index_names:
                # One-element names collapse to their only element.
                if name is not None and len(name) <= 1:
                    name = name[0]
                flattened.append(name)
            frame.index.names = flattened
        return frame
Exemple #2
0
    def to_pandas_frame(self) -> pd.DataFrame:
        """Collect the internal Spark frame and shape it into a pandas DataFrame.

        Index Spark columns become the pandas index, data columns are
        selected in label order and relabelled from ``self._column_labels``,
        and column/index level names are applied.

        Returns
        -------
        pandas.DataFrame
        """
        def unwrap(name):
            # One-element names collapse to their only element; ``None`` and
            # longer names are passed through untouched.
            if name is None or len(name) > 1:
                return name
            return name[0]

        spark_frame = self.to_internal_spark_frame
        frame = spark_frame.toPandas()
        if not len(frame) and len(spark_frame.schema):
            # No rows were collected but the schema has fields: cast every
            # column to the dtype mapped from its Spark field type.
            dtypes = {}
            for field in spark_frame.schema:
                dtypes[field.name] = spark_type_to_pandas_dtype(field.dataType)
            frame = frame.astype(dtypes)

        no_match = object()
        selected = []
        data_columns = zip(self.column_labels, self.data_spark_columns,
                           self.data_spark_column_names)
        for position, (label, spark_column, _) in enumerate(data_columns):
            # A data column backed by the same Spark column as an index
            # column is addressed by that index column's name.
            index_name = next(
                (candidate
                 for candidate, index_column in zip(self.index_spark_column_names,
                                                    self.index_spark_columns)
                 if spark_column._jc.equals(index_column._jc)),
                no_match,
            )
            if index_name is not no_match:
                selected.append(index_name)
            elif label is None:
                selected.append(str(position))
            else:
                selected.append(name_like_string(label))

        for position, index_field in enumerate(self.index_spark_column_names):
            # Keep the column in the frame when it is also selected as data;
            # every index level after the first is appended.
            frame = frame.set_index(
                index_field,
                drop=index_field not in selected,
                append=position > 0,
            )
        frame = frame[selected]

        level_names = [unwrap(name) for name in self._column_label_names]
        if self.column_labels_level > 1:
            frame.columns = pd.MultiIndex.from_tuples(self._column_labels,
                                                      names=level_names)
        else:
            flat_labels = []
            for label in self._column_labels:
                flat_labels.append(None if label is None else label[0])
            frame.columns = pd.Index(flat_labels, name=level_names[0])

        index_names = self.index_names
        if len(index_names):
            frame.index.names = [unwrap(name) for name in index_names]
        return frame
Exemple #3
0
    def dtype(self):
        """Return the dtype mapped from the underlying Spark data type.

        The value is produced by passing ``self.spark.data_type`` through
        ``spark_type_to_pandas_dtype``.

        Examples
        --------
        >>> s = ks.Series([1, 2, 3])
        >>> s.dtype
        dtype('int64')

        >>> s = ks.Series(list('abc'))
        >>> s.dtype
        dtype('O')

        >>> s = ks.Series(pd.date_range('20130101', periods=3))
        >>> s.dtype
        dtype('<M8[ns]')

        >>> s.rename("a").to_frame().set_index("a").index.dtype
        dtype('<M8[ns]')
        """
        spark_data_type = self.spark.data_type
        return spark_type_to_pandas_dtype(spark_data_type)
Exemple #4
0
    def to_pandas_frame(self) -> pd.DataFrame:
        """Collect the internal Spark frame and shape it into a pandas DataFrame.

        Steps, in order:

        1. Collect the Spark frame with ``toPandas()`` and normalise dtypes
           (see the inline comments for the two cases handled).
        2. Work out, for every data column, which collected column backs it.
        3. Install the index Spark columns as the pandas index.
        4. Select the data columns and relabel them from
           ``self._column_labels`` (a ``MultiIndex`` when there is more than
           one label level).
        5. Apply the index level names.

        Returns
        -------
        pandas.DataFrame
        """
        sdf = self.to_internal_spark_frame
        pdf = sdf.toPandas()
        if len(pdf) == 0 and len(sdf.schema) > 0:
            # No rows were collected but the schema has fields: cast every
            # column to the dtype mapped from its Spark field type.
            pdf = pdf.astype({
                field.name: spark_type_to_pandas_dtype(field.dataType)
                for field in sdf.schema
            })
        elif LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
            # On PySpark older than 3.0, re-cast nullable columns that came
            # back entirely null: booleans to object, integral types to
            # float64, anything else to the dtype mapped from its Spark type.
            for field in sdf.schema:
                if field.nullable and pdf[field.name].isnull().all():
                    if isinstance(field.dataType, BooleanType):
                        # The builtin ``object`` replaces the ``np.object``
                        # alias, which NumPy deprecated in 1.20 and removed
                        # in 1.24.
                        pdf[field.name] = pdf[field.name].astype(object)
                    elif isinstance(field.dataType, IntegralType):
                        pdf[field.name] = pdf[field.name].astype(np.float64)
                    else:
                        pdf[field.name] = pdf[field.name].astype(
                            spark_type_to_pandas_dtype(field.dataType))

        column_names = []
        for i, (label, spark_column, _) in enumerate(
                zip(self.column_labels, self.data_spark_columns,
                    self.data_spark_column_names)):
            for index_spark_column_name, index_spark_column in zip(
                    self.index_spark_column_names, self.index_spark_columns):
                # A data column backed by the same Spark column as an index
                # column is addressed by that index column's name.
                if spark_column._jc.equals(index_spark_column._jc):
                    column_names.append(index_spark_column_name)
                    break
            else:
                # Otherwise the column is addressed by its label rendered as
                # a string, or by its position when it has no label.
                column_names.append(
                    str(i) if label is None else name_like_string(label))

        append = False
        for index_field in self.index_spark_column_names:
            # Keep the column in the frame when it is also selected as data.
            drop = index_field not in column_names
            pdf = pdf.set_index(index_field, drop=drop, append=append)
            # Every index level after the first is appended to the existing one.
            append = True
        pdf = pdf[column_names]

        # One-element names collapse to their only element.
        names = [
            name if name is None or len(name) > 1 else name[0]
            for name in self._column_label_names
        ]
        if self.column_labels_level > 1:
            pdf.columns = pd.MultiIndex.from_tuples(self._column_labels,
                                                    names=names)
        else:
            pdf.columns = pd.Index(
                [
                    None if label is None else label[0]
                    for label in self._column_labels
                ],
                name=names[0],
            )

        pdf.index.names = [
            name if name is None or len(name) > 1 else name[0]
            for name in self.index_names
        ]

        return pdf