def dropna(self, axis=0, inplace=False, **kwargs):
    # Delegate to DataFrame.dropna on a single-column frame, then unwrap the column.
    col = _col(self.to_dataframe().dropna(axis=axis, inplace=False))
    if inplace:
        # Mutate this Column in place: re-anchor and swap the underlying Java column.
        anchor_wrap(col, self)
        self._jc = col._jc
        self._pandas_schema = None
        self._pandas_metadata = None
    else:
        return col
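# Reference sketch (not part of the library): the pandas Series.dropna()
# semantics the method above mirrors, including the inplace contract.
# `_example_dropna` is a hypothetical helper added only for illustration.
def _example_dropna():
    import numpy as np
    import pandas as pd
    s = pd.Series([1.0, np.nan, 3.0])
    assert s.dropna().tolist() == [1.0, 3.0]  # default: return a new Series
    s.dropna(inplace=True)                    # inplace: mutate, return None
    assert s.tolist() == [1.0, 3.0]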
def _pd_getitem(self, key):
    if key is None:
        raise KeyError("none key")
    if isinstance(key, string_types):
        try:
            return self._spark_getitem(key)
        except AnalysisException:
            raise KeyError(key)
    if np.isscalar(key) or isinstance(key, (tuple, string_types)):
        raise NotImplementedError(key)
    elif isinstance(key, slice):
        return self.loc[key]
    if isinstance(key, (pd.Series, np.ndarray, pd.Index)):
        raise NotImplementedError(key)
    if isinstance(key, list):
        return self.loc[:, key]
    if isinstance(key, DataFrame):
        # TODO Should not implement alignment, too dangerous?
        return self._spark_getitem(key)
    if isinstance(key, Column):
        # TODO Should not implement alignment, too dangerous?
        # It is assumed to be only a filter, otherwise .loc should be used.
        bcol = key.cast("boolean")
        df = self._spark_filter(bcol)
        df._metadata = self._metadata
        return anchor_wrap(self, df)
    raise NotImplementedError(key)
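# Reference sketch (not part of the library): the pandas __getitem__ dispatch
# that _pd_getitem emulates -- string -> column, list -> projection,
# slice -> rows, boolean mask -> filter. `_example_getitem_dispatch` is a
# hypothetical helper added only for illustration.
def _example_getitem_dispatch():
    import pandas as pd
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    assert df["a"].tolist() == [1, 2, 3]            # string key -> one column
    assert list(df[["b"]].columns) == ["b"]         # list key -> column subset
    assert len(df[1:3]) == 2                        # slice key -> row selection
    assert df[df["a"] > 1]["a"].tolist() == [2, 3]  # boolean key -> filter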
def getField(self, name):
    if not isinstance(self.schema, StructType):
        raise AttributeError("Not a struct: {}".format(self.schema))
    else:
        fnames = self.schema.fieldNames()
        if name not in fnames:
            raise AttributeError(
                "Field {} not found, possible values are {}".format(name, ", ".join(fnames)))
        return anchor_wrap(self, self._spark_getField(name))
def rename(self, index=None, **kwargs):
    if index is None:
        return self
    col = self._spark_alias(index)
    if kwargs.get('inplace', False):
        self._jc = col._jc
        self._pandas_schema = None
        self._pandas_metadata = None
        return self
    else:
        return anchor_wrap(self, col)
def reset_index(self, level=None, drop=False, name=None, inplace=False):
    if inplace and not drop:
        raise TypeError('Cannot reset_index inplace on a Series to create a DataFrame')
    if name is not None:
        df = self.rename(name).to_dataframe()
    else:
        df = self.to_dataframe()
    df = df.reset_index(level=level, drop=drop)
    if drop:
        col = _col(df)
        if inplace:
            anchor_wrap(col, self)
            self._jc = col._jc
            self._pandas_schema = None
            self._pandas_metadata = None
        else:
            return col
    else:
        return df
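# Reference sketch (not part of the library): pandas Series.reset_index()
# behavior, including the TypeError guard reproduced above.
# `_example_reset_index` is a hypothetical helper added only for illustration.
def _example_reset_index():
    import pandas as pd
    s = pd.Series([10, 20], index=[5, 6], name="x")
    assert list(s.reset_index().columns) == ["index", "x"]  # -> DataFrame
    assert s.reset_index(drop=True).tolist() == [10, 20]    # -> Series
    try:
        s.reset_index(inplace=True)  # inplace without drop cannot stay a Series
        assert False, "expected TypeError"
    except TypeError:
        pass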
def astype(self, dtype):
    from databricks.koalas.typing import as_spark_type
    spark_type = as_spark_type(dtype)
    if not spark_type:
        raise ValueError("Type {} not understood".format(dtype))
    return anchor_wrap(self, self._spark_cast(spark_type))
def __invert__(self):
    # Logical NOT: cast to boolean, then compare against False.
    return anchor_wrap(self, self.astype(bool) == F._spark_lit(False))
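# Reference sketch (not part of the library): for boolean data, ~s is
# equivalent to (s == False), which is exactly what __invert__ computes after
# casting to boolean. `_example_invert` is a hypothetical helper added only
# for illustration.
def _example_invert():
    import pandas as pd
    s = pd.Series([True, False, True])
    assert (~s).tolist() == [False, True, False]
    assert (s == False).tolist() == [False, True, False]  # noqa: E712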
def __getattr__(self, item):
    if item.startswith("__") or item.startswith("_pandas_") or item.startswith("_spark_"):
        raise AttributeError(item)
    return anchor_wrap(self, self.getField(item))
def __getitem__(self, key):
    return anchor_wrap(self, self._spark_getitem(key))
def isnull(self):
    # Spark keeps SQL NULL and NaN distinct for float/double columns, while
    # pandas treats both as missing, so both must be checked here.
    if isinstance(self.schema[self.name].dataType, (FloatType, DoubleType)):
        return anchor_wrap(self, self._spark_isNull() | F._spark_isnan(self))
    else:
        return anchor_wrap(self, self._spark_isNull())
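# Reference sketch (not part of the library): pandas treats None and NaN
# uniformly as missing, which is why the float/double branch above must OR
# isNull() with isnan(). `_example_isnull` is a hypothetical helper added only
# for illustration.
def _example_isnull():
    import numpy as np
    import pandas as pd
    s = pd.Series([1.0, None, np.nan])  # None is coerced to NaN in a float Series
    assert s.isnull().tolist() == [False, True, True]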
def _index_columns(self):
    return [anchor_wrap(self, self._spark_getitem(field))
            for field in self._metadata.index_fields]
def __getattr__(self, key):
    if key.startswith("__") or key.startswith("_pandas_") or key.startswith("_spark_"):
        raise AttributeError(key)
    return anchor_wrap(self, self._spark_getattr(key))
def get(self, key, default=None):
    try:
        return anchor_wrap(self, self._pd_getitem(key))
    except (KeyError, ValueError, IndexError):
        return default
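# Reference sketch (not part of the library): the pandas .get() contract the
# method above reproduces -- a bad key falls back to the default instead of
# raising. `_example_get` is a hypothetical helper added only for illustration.
def _example_get():
    import pandas as pd
    df = pd.DataFrame({"a": [1, 2]})
    assert df.get("a").tolist() == [1, 2]
    assert df.get("missing", default=0) == 0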