def cache(self) -> "CachedDataFrame":
    """
    Yields and caches the current DataFrame.

    The pandas-on-Spark DataFrame is yielded as a protected resource and its
    corresponding data is cached, which gets uncached after execution goes off
    the context.

    If you want to specify the StorageLevel manually, use
    :meth:`DataFrame.spark.persist`.

    See Also
    --------
    DataFrame.spark.persist

    Examples
    --------
    >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
    ...                   columns=['dogs', 'cats'])
    >>> df
       dogs  cats
    0   0.2   0.3
    1   0.0   0.6
    2   0.6   0.0
    3   0.2   0.1

    >>> with df.spark.cache() as cached_df:
    ...     print(cached_df.count())
    ...
    dogs    4
    cats    4
    dtype: int64

    >>> df = df.spark.cache()
    >>> df.to_pandas().mean(axis=1)
    0    0.25
    1    0.30
    2    0.30
    3    0.15
    dtype: float64

    To uncache the dataframe, use the `unpersist` function.

    >>> df.spark.unpersist()
    """
    from pyspark.pandas.frame import CachedDataFrame

    self._kdf._update_internal_frame(
        self._kdf._internal.resolved_copy, requires_same_anchor=False
    )
    return CachedDataFrame(self._kdf._internal)
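
# A minimal usage sketch (illustrative only, not part of this module; the sample
# DataFrame is made up): `cache()` returns a CachedDataFrame that also works as a
# context manager, so the cached data is uncached automatically when the `with`
# block exits.
#
#     import pyspark.pandas as ps
#
#     df = ps.DataFrame({"dogs": [0.2, 0.0, 0.6, 0.2],
#                        "cats": [0.3, 0.6, 0.0, 0.1]})
#     with df.spark.cache() as cached_df:
#         # operations here run against the cached data
#         print(cached_df.count())
#     # the data has been uncached once the context is exited
#
# Outside a `with` block, call `df.spark.unpersist()` explicitly when done.
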
def persist(
    self, storage_level: StorageLevel = StorageLevel.MEMORY_AND_DISK
) -> "CachedDataFrame":
    """
    Yields and caches the current DataFrame with a specific StorageLevel.
    If a StorageLevel is not given, the `MEMORY_AND_DISK` level is used by
    default like PySpark.

    The pandas-on-Spark DataFrame is yielded as a protected resource and its
    corresponding data is cached, which gets uncached after execution goes off
    the context.

    See Also
    --------
    DataFrame.spark.cache

    Examples
    --------
    >>> import pyspark
    >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
    ...                   columns=['dogs', 'cats'])
    >>> df
       dogs  cats
    0   0.2   0.3
    1   0.0   0.6
    2   0.6   0.0
    3   0.2   0.1

    Set the StorageLevel to `MEMORY_ONLY`.

    >>> with df.spark.persist(pyspark.StorageLevel.MEMORY_ONLY) as cached_df:
    ...     print(cached_df.spark.storage_level)
    ...     print(cached_df.count())
    ...
    Memory Serialized 1x Replicated
    dogs    4
    cats    4
    dtype: int64

    Set the StorageLevel to `DISK_ONLY`.

    >>> with df.spark.persist(pyspark.StorageLevel.DISK_ONLY) as cached_df:
    ...     print(cached_df.spark.storage_level)
    ...     print(cached_df.count())
    ...
    Disk Serialized 1x Replicated
    dogs    4
    cats    4
    dtype: int64

    If a StorageLevel is not given, it uses `MEMORY_AND_DISK` by default.

    >>> with df.spark.persist() as cached_df:
    ...     print(cached_df.spark.storage_level)
    ...     print(cached_df.count())
    ...
    Disk Memory Serialized 1x Replicated
    dogs    4
    cats    4
    dtype: int64

    >>> df = df.spark.persist()
    >>> df.to_pandas().mean(axis=1)
    0    0.25
    1    0.30
    2    0.30
    3    0.15
    dtype: float64

    To uncache the dataframe, use the `unpersist` function.

    >>> df.spark.unpersist()
    """
    from pyspark.pandas.frame import CachedDataFrame

    self._kdf._update_internal_frame(
        self._kdf._internal.resolved_copy, requires_same_anchor=False
    )
    return CachedDataFrame(self._kdf._internal, storage_level=storage_level)
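
# A minimal usage sketch (illustrative only, not part of this module): `persist()`
# accepts any `pyspark.StorageLevel`, and the resulting CachedDataFrame exposes the
# effective level via `.spark.storage_level`.
#
#     import pyspark
#     import pyspark.pandas as ps
#
#     df = ps.DataFrame({"dogs": [0.2, 0.0], "cats": [0.3, 0.6]})
#     cached_df = df.spark.persist(pyspark.StorageLevel.MEMORY_ONLY)
#     print(cached_df.spark.storage_level)  # Memory Serialized 1x Replicated
#     cached_df.spark.unpersist()           # release the cached data when done
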