def pandas_df(self):
    """ Return as pandas DataFrame. """
    sdf = self.spark_internal_df
    pdf = sdf.toPandas()
    if len(pdf) == 0 and len(sdf.schema) > 0:
        # toPandas() on an empty result loses the dtypes, so restore them from the Spark schema.
        pdf = pdf.astype({field.name: spark_type_to_pandas_dtype(field.dataType)
                          for field in sdf.schema})

    index_columns = self.index_columns
    if len(index_columns) > 0:
        # Rebuild the (Multi)Index from the index columns; drop=False keeps a column in
        # the data part when it also serves as a data column.
        append = False
        for index_field in index_columns:
            drop = index_field not in self.data_columns
            pdf = pdf.set_index(index_field, drop=drop, append=append)
            append = True
        pdf = pdf[[col if col in index_columns
                   else str(i) if idx is None else name_like_string(idx)
                   for i, (col, idx) in enumerate(zip(self.data_columns, self.column_index))]]

    # Restore the column index (labels) and its names.
    if self.column_index_level > 1:
        pdf.columns = pd.MultiIndex.from_tuples(self._column_index)
    else:
        pdf.columns = [None if idx is None else idx[0] for idx in self._column_index]
    if self._column_index_names is not None:
        pdf.columns.names = self._column_index_names

    # Restore the index names, unwrapping single-level name tuples.
    index_names = self.index_names
    if len(index_names) > 0:
        pdf.index.names = [name if name is None or len(name) > 1 else name[0]
                           for name in index_names]
    return pdf
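
# Illustrative sketch (not part of the original code; column names are made up): the
# set_index(drop=..., append=...) pattern used above can be seen in plain pandas. The first
# index column replaces the default RangeIndex, later ones are appended to form a MultiIndex,
# and drop=False keeps a column in the data part when it also serves as a data column.
import pandas as pd

pdf = pd.DataFrame({"year": [2019, 2020], "id": [1, 2], "value": [0.5, 0.7]})
append = False
for index_field in ["year", "id"]:
    drop = index_field not in ["id", "value"]  # pretend "id" is both an index and a data column
    pdf = pdf.set_index(index_field, drop=drop, append=append)
    append = True
# pdf now has a MultiIndex ('year', 'id') and data columns ['id', 'value'].
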
def to_pandas_frame(self) -> pd.DataFrame:
    """ Return as pandas DataFrame. """
    sdf = self.to_internal_spark_frame
    pdf = sdf.toPandas()
    if len(pdf) == 0 and len(sdf.schema) > 0:
        # toPandas() on an empty result loses the dtypes, so restore them from the Spark schema.
        pdf = pdf.astype({
            field.name: spark_type_to_pandas_dtype(field.dataType) for field in sdf.schema
        })

    # Resolve the pandas column name for each data column. A data column backed by one of
    # the index columns keeps the index column's name; otherwise it is named after its
    # label, or its position when the label is missing.
    column_names = []
    for i, (label, spark_column, column_name) in enumerate(
            zip(self.column_labels, self.data_spark_columns, self.data_spark_column_names)):
        for index_spark_column_name, index_spark_column in zip(
                self.index_spark_column_names, self.index_spark_columns):
            if spark_column._jc.equals(index_spark_column._jc):
                column_names.append(index_spark_column_name)
                break
        else:
            name = str(i) if label is None else name_like_string(label)
            if column_name != name:
                column_name = name
            column_names.append(column_name)

    # Rebuild the (Multi)Index from the index columns; drop=False keeps a column in the
    # data part when it also serves as a data column.
    append = False
    for index_field in self.index_spark_column_names:
        drop = index_field not in column_names
        pdf = pdf.set_index(index_field, drop=drop, append=append)
        append = True
    pdf = pdf[column_names]

    # Restore the column labels and their names, unwrapping single-level name tuples.
    names = [
        name if name is None or len(name) > 1 else name[0]
        for name in self._column_label_names
    ]
    if self.column_labels_level > 1:
        pdf.columns = pd.MultiIndex.from_tuples(self._column_labels, names=names)
    else:
        pdf.columns = pd.Index(
            [None if label is None else label[0] for label in self._column_labels],
            name=names[0],
        )

    # Restore the index names, unwrapping single-level name tuples.
    index_names = self.index_names
    if len(index_names) > 0:
        pdf.index.names = [
            name if name is None or len(name) > 1 else name[0] for name in index_names
        ]
    return pdf
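
# Illustrative sketch (assumptions: a running SparkSession and a simplified, hypothetical
# stand-in for spark_type_to_pandas_dtype). It shows why the empty-result branch above is
# needed: toPandas() on an empty Spark DataFrame can come back with object-dtype columns,
# and casting with the Spark schema restores the expected dtypes.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, DoubleType

def _to_pandas_dtype(spark_type):
    # Hypothetical, minimal mapping; the real helper covers many more types.
    return {LongType: np.dtype("int64"), DoubleType: np.dtype("float64")}[type(spark_type)]

spark = SparkSession.builder.getOrCreate()
schema = StructType([StructField("a", LongType()), StructField("b", DoubleType())])
sdf = spark.createDataFrame([], schema)

pdf = sdf.toPandas()  # empty frame; dtypes may not be preserved
pdf = pdf.astype({field.name: _to_pandas_dtype(field.dataType) for field in sdf.schema})
print(pdf.dtypes)     # a: int64, b: float64
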
def dtype(self):
    """Return the dtype object of the underlying data.

    Examples
    --------
    >>> s = ks.Series([1, 2, 3])
    >>> s.dtype
    dtype('int64')

    >>> s = ks.Series(list('abc'))
    >>> s.dtype
    dtype('O')

    >>> s = ks.Series(pd.date_range('20130101', periods=3))
    >>> s.dtype
    dtype('<M8[ns]')

    >>> s.rename("a").to_frame().set_index("a").index.dtype
    dtype('<M8[ns]')
    """
    return spark_type_to_pandas_dtype(self.spark.data_type)
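
# Illustrative sketch (plain pandas/numpy, no Spark needed): the dtype objects shown in the
# doctests above are ordinary numpy dtypes: int64 for integer data, object ('O') for strings,
# and '<M8[ns]' (datetime64[ns]) for timestamps.
import numpy as np
import pandas as pd

assert pd.Series([1, 2, 3]).dtype == np.dtype("int64")
assert pd.Series(list("abc")).dtype == np.dtype("O")
assert pd.Series(pd.date_range("20130101", periods=3)).dtype == np.dtype("<M8[ns]")
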
def to_pandas_frame(self) -> pd.DataFrame:
    """ Return as pandas DataFrame. """
    sdf = self.to_internal_spark_frame
    pdf = sdf.toPandas()
    if len(pdf) == 0 and len(sdf.schema) > 0:
        # toPandas() on an empty result loses the dtypes, so restore them from the Spark schema.
        pdf = pdf.astype({
            field.name: spark_type_to_pandas_dtype(field.dataType) for field in sdf.schema
        })
    elif LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
        # Older PySpark (< 3.0) may not give all-null nullable columns the expected dtypes,
        # so coerce them explicitly: booleans to object (to hold None), integral types to
        # float64 (to hold NaN), and everything else to the mapped pandas dtype.
        for field in sdf.schema:
            if field.nullable and pdf[field.name].isnull().all():
                if isinstance(field.dataType, BooleanType):
                    pdf[field.name] = pdf[field.name].astype(np.object)
                elif isinstance(field.dataType, IntegralType):
                    pdf[field.name] = pdf[field.name].astype(np.float64)
                else:
                    pdf[field.name] = pdf[field.name].astype(
                        spark_type_to_pandas_dtype(field.dataType))

    # Resolve the pandas column name for each data column (index-backed columns keep the
    # index column's name).
    column_names = []
    for i, (label, spark_column, column_name) in enumerate(
            zip(self.column_labels, self.data_spark_columns, self.data_spark_column_names)):
        for index_spark_column_name, index_spark_column in zip(
                self.index_spark_column_names, self.index_spark_columns):
            if spark_column._jc.equals(index_spark_column._jc):
                column_names.append(index_spark_column_name)
                break
        else:
            name = str(i) if label is None else name_like_string(label)
            if column_name != name:
                column_name = name
            column_names.append(column_name)

    # Rebuild the (Multi)Index from the index columns.
    append = False
    for index_field in self.index_spark_column_names:
        drop = index_field not in column_names
        pdf = pdf.set_index(index_field, drop=drop, append=append)
        append = True
    pdf = pdf[column_names]

    # Restore the column labels and their names, unwrapping single-level name tuples.
    names = [
        name if name is None or len(name) > 1 else name[0]
        for name in self._column_label_names
    ]
    if self.column_labels_level > 1:
        pdf.columns = pd.MultiIndex.from_tuples(self._column_labels, names=names)
    else:
        pdf.columns = pd.Index(
            [None if label is None else label[0] for label in self._column_labels],
            name=names[0],
        )

    # Restore the index names, unwrapping single-level name tuples.
    pdf.index.names = [
        name if name is None or len(name) > 1 else name[0] for name in self.index_names
    ]
    return pdf
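
# Illustrative sketch (plain pandas; column names are made up): what the PySpark < 3.0 branch
# above amounts to for an all-null column, depending on its Spark type.
import numpy as np
import pandas as pd

pdf = pd.DataFrame({"flag": [None, None], "count": [None, None]})
pdf["flag"] = pdf["flag"].astype(object)        # BooleanType: keep None in an object column
pdf["count"] = pdf["count"].astype(np.float64)  # IntegralType: NaN requires a float dtype
print(pdf.dtypes)                               # flag: object, count: float64
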