def test_transform_data(self):
    """Test data transformer.

    Loads a small input fixture and the matching expected-output
    fixture, runs ``transform_data``, and verifies column count,
    row count, average steps-to-desk and column names against the
    expected output.
    """
    # assemble
    input_data = (
        self.spark
        .read
        .parquet(self.test_data_path + 'employees'))

    expected_data = (
        self.spark
        .read
        .parquet(self.test_data_path + 'employee_report'))

    expected_cols = len(expected_data.columns)
    expected_rows = expected_data.count()
    expected_avg_steps = (
        expected_data
        .agg(mean('steps_to_desk').alias('avg_steps_to_desk'))
        .collect()[0]
        ['avg_steps_to_desk'])

    # act
    data_transformed = transform_data(input_data, 21)

    # BUG FIX: the original computed these three values from
    # expected_data, so the assertions below compared the expected
    # output against itself and always passed regardless of what
    # transform_data produced. Measure the transformed data instead.
    cols = len(data_transformed.columns)
    rows = data_transformed.count()
    avg_steps = (
        data_transformed
        .agg(mean('steps_to_desk').alias('avg_steps_to_desk'))
        .collect()[0]
        ['avg_steps_to_desk'])

    # asserts
    self.assertEqual(expected_cols, cols)
    self.assertEqual(expected_rows, rows)
    self.assertEqual(expected_avg_steps, avg_steps)
    # BUG FIX: assertTrue on a list comprehension is truthy for any
    # non-empty list of booleans; wrap in all(...) so every column
    # membership check is actually enforced.
    self.assertTrue(
        all(col in expected_data.columns
            for col in data_transformed.columns))
def test_transform_data(self):
    """Test data transformer.

    Using small chunks of input data and expected output data, we
    test the transformation step to make sure it's working as
    expected.
    """
    # assemble
    input_data = (
        self.spark
        .read
        .parquet(self.test_data_path + 'humuson_logs'))

    expected_data = (
        self.spark
        .read
        .parquet(self.test_data_path + 'employees_report'))

    expected_cols = len(expected_data.columns)
    expected_rows = expected_data.count()
    expected_avg_steps = (
        expected_data
        .agg(mean('steps_to_desk').alias('avg_steps_to_desk'))
        .collect()[0]
        ['avg_steps_to_desk'])

    # act
    data_transformed = transform_data(input_data, 21)

    # BUG FIX: the original computed these three values from
    # expected_data, so the assertions below compared the expected
    # output against itself and always passed regardless of what
    # transform_data produced. Measure the transformed data instead.
    cols = len(data_transformed.columns)
    rows = data_transformed.count()
    avg_steps = (
        data_transformed
        .agg(mean('steps_to_desk').alias('avg_steps_to_desk'))
        .collect()[0]
        ['avg_steps_to_desk'])

    # assert
    self.assertEqual(expected_cols, cols)
    self.assertEqual(expected_rows, rows)
    self.assertEqual(expected_avg_steps, avg_steps)
    # BUG FIX: assertTrue on a list comprehension is truthy for any
    # non-empty list of booleans; wrap in all(...) so every column
    # membership check is actually enforced.
    self.assertTrue(
        all(col in expected_data.columns
            for col in data_transformed.columns))
def test_transform_data(self):
    """Test data transformer.

    Using small chunks of input data and expected output data, we
    test the transformation step to make sure it's working as
    expected.
    """
    # assemble
    # NOTE(review): input and expected currently read the SAME csv file;
    # presumably the expected fixture should be a pre-transformed file —
    # confirm and point expected_data at the correct fixture.
    # Raw strings avoid the invalid '\p' escape in the Windows path
    # (the string value is unchanged).
    input_data = (self.spark.read.csv(
        r'E:\pyspark-etl-example-project/tests/test_data/employees1000.csv',
        header=True))

    expected_data = (self.spark.read.csv(
        r'E:\pyspark-etl-example-project/tests/test_data/employees1000.csv',
        header=True))

    expected_cols = len(expected_data.columns)
    expected_rows = expected_data.count()

    # act
    data_transformed = transform_data(input_data, 21)

    # BUG FIX: the original measured expected_data here, not the
    # transformed output; measure data_transformed so the assertions
    # actually exercise transform_data.
    cols = len(data_transformed.columns)
    rows = data_transformed.count()

    # assert
    # BUG FIX: the real assertion was commented out and replaced with
    # assertEqual(5, 5), which can never fail. Restore meaningful checks.
    self.assertEqual(expected_cols, cols)
    self.assertEqual(expected_rows, rows)