def test_set_rdd(self):
    """Assigning a new RDD to a CardoDataFrame must update both its rdd and dataframe views."""
    first_dataset = self.context.spark.createDataFrame([['a']], 'column1: string')
    second_dataset = self.context.spark.sparkContext.parallelize([Row(column1='aa')])
    cardo_dataframe = CardoDataFrame(first_dataset, '6')

    cardo_dataframe.rdd = second_dataset

    # Both accessors must reflect the newly assigned RDD's contents.
    # NOTE(review): assertItemsEqual is Python 2 unittest; on Python 3 the
    # equivalent is assertCountEqual — confirm the target interpreter.
    self.assertItemsEqual(second_dataset.collect(), cardo_dataframe.dataframe.collect())
    self.assertItemsEqual(second_dataset.collect(), cardo_dataframe.rdd.collect())
def test_unpersist_rdd(self):
    """unpersist() must release the originally persisted RDD even after rdd was replaced."""
    # Arrange
    rdd = self.context.spark.sparkContext.parallelize([Row(column1='aa')])
    second_rdd = self.context.spark.sparkContext.parallelize([Row(column1='bb')])
    cardo_dataframe = CardoDataFrame(rdd, '')
    cardo_dataframe.persist()
    # Replace the wrapped RDD so the persisted one is no longer the current one.
    cardo_dataframe.rdd = second_rdd

    # Act
    cardo_dataframe.unpersist()

    # Assert — the first (persisted) RDD must be uncached, not just the current one.
    self.assertFalse(rdd.is_cached)