    def test_transform_data(self):
        """Test data transformer.

        Using small chunks of input data and expected output data, we
        test the transformation step to make sure it's working as
        expected.
        """
        # assemble
        input_data = (self.spark.read.parquet(self.test_data_path +
                                              'employees'))
        expected_data = (self.spark.read.parquet(self.test_data_path +
                                                 'employee_report'))

        expected_cols = len(expected_data.columns)
        expected_rows = expected_data.count()
        expected_avg_steps = (expected_data.agg(
            mean('steps_to_desk').alias('avg_steps_to_desk')).collect()[0]
                              ['avg_steps_to_desk'])

        # act
        data_transformed = transform_data(input_data, 21)

        cols = len(data_transformed.columns)
        rows = data_transformed.count()
        avg_steps = (data_transformed.agg(
            mean('steps_to_desk').alias('avg_steps_to_desk')).collect()[0]
                     ['avg_steps_to_desk'])

        # assert
        self.assertEqual(expected_cols, cols)
        self.assertEqual(expected_rows, rows)
        self.assertEqual(expected_avg_steps, avg_steps)
        self.assertTrue(all(
            col in expected_data.columns for col in data_transformed.columns))
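
All three examples assume a unittest fixture that supplies self.spark and self.test_data_path, plus imports of mean from pyspark.sql.functions and of transform_data from the ETL job module. A minimal sketch of such a fixture, assuming local-mode Spark and a relative fixture directory (the class name, master setting, and path below are assumptions, not taken from the examples):

import unittest

from pyspark.sql import SparkSession
# the test methods also assume:
#   from pyspark.sql.functions import mean
#   from <your ETL job module> import transform_data


class SparkETLTests(unittest.TestCase):
    """Hypothetical base fixture assumed by the test methods on this page."""

    def setUp(self):
        # a local SparkSession keeps the tests fast and self-contained
        self.spark = (SparkSession.builder
                      .master('local[2]')
                      .appName('test_etl_job')
                      .getOrCreate())
        # directory holding the parquet/CSV fixtures read by the tests
        self.test_data_path = 'tests/test_data/'

    def tearDown(self):
        self.spark.stop()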
Example #2

    def test_transform_data(self):
        """Test data transformer.

        Using small chunks of input data and expected output data, we
        test the transformation step to make sure it's working as
        expected.
        """
        # assemble
        input_data = (self.spark.read.parquet(self.test_data_path +
                                              'humuson_logs'))

        expected_data = (self.spark.read.parquet(self.test_data_path +
                                                 'employees_report'))

        expected_cols = len(expected_data.columns)
        expected_rows = expected_data.count()
        expected_avg_steps = (expected_data.agg(
            mean('steps_to_desk').alias('avg_steps_to_desk')).collect()[0]
                              ['avg_steps_to_desk'])

        # act
        data_transformed = transform_data(input_data, 21)

        cols = len(data_transformed.columns)
        rows = data_transformed.count()
        avg_steps = (data_transformed.agg(
            mean('steps_to_desk').alias('avg_steps_to_desk')).collect()[0]
                     ['avg_steps_to_desk'])

        # assert
        self.assertEqual(expected_cols, cols)
        self.assertEqual(expected_rows, rows)
        self.assertEqual(expected_avg_steps, avg_steps)
        self.assertTrue(all(
            col in expected_data.columns for col in data_transformed.columns))
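
The assertions above compare summary statistics (column count, row count, a mean) rather than full contents, which keeps the test cheap but can miss row-level differences. A stricter check collects both DataFrames in a deterministic order and compares the rows. A sketch of such a helper, written as a method of the assumed test class; the 'id' sort key is an assumption about the fixture schema:

    def assert_dataframes_equal(self, expected, actual, key='id'):
        """Row-level DataFrame comparison (sort key is an assumption)."""
        cols = sorted(expected.columns)
        self.assertEqual(cols, sorted(actual.columns))
        # align column order and row order before comparing collected Rows
        self.assertEqual(
            expected.select(cols).orderBy(key).collect(),
            actual.select(cols).orderBy(key).collect())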
Example #3
    def test_transform_data(self):
        """Test data transformer.

        Using small chunks of input data and expected output data, we
        test the transformation step to make sure it's working as
        expected.
        """
        # assemble
        input_data = (self.spark.read.csv(
            'E:/pyspark-etl-example-project/tests/test_data/employees1000.csv',
            header=True))

        # note: the expected output is read from the same file as the input
        expected_data = (self.spark.read.csv(
            'E:/pyspark-etl-example-project/tests/test_data/employees1000.csv',
            header=True))
        expected_cols = len(expected_data.columns)
        expected_rows = expected_data.count()

        # act
        data_transformed = transform_data(input_data, 21)

        cols = len(data_transformed.columns)
        rows = data_transformed.count()

        # assert
        self.assertEqual(expected_cols, cols)
        self.assertEqual(expected_rows, rows)
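
For context, the calls above pin down the shape of transform_data: it takes an input DataFrame plus a steps-per-floor number (21 here) and returns a DataFrame containing a steps_to_desk column. A plausible sketch of such a function; the input columns (id, first_name, second_name, floor) are assumptions, since the tests only require that the output has steps_to_desk:

from pyspark.sql.functions import col, concat_ws, lit


def transform_data(df, steps_per_floor_):
    """Derive a steps_to_desk column from each employee's floor number."""
    return df.select(
        col('id'),
        concat_ws(' ', col('first_name'), col('second_name')).alias('name'),
        (col('floor') * lit(steps_per_floor_)).alias('steps_to_desk'))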