def test_should_transform_many_columns_in_lowercase(self):
    """Lowercasing several columns at once should lowercase each listed column.

    Fix: test name previously misspelled 'colums'.
    """
    transformation = Transformation(self.test_data)
    transformation.lowercase(["place", "status"])

    transformed_df = transformation.dataframe.collect()

    # Expected rows mirror the fixture with "place" (and "status") lowercased;
    # schema matches the fixture: date, place, mag, status, coordinates, alert.
    expected_result = self.spark.createDataFrame(
        [(1704567252, "california", 0.82, "automatic",
          [-116.8, 33.3333333, 12.04], None),
         (1391707828, "alaska", 1.1, None,
          [-148.942, 64.9081, 10.6], "green"),
         (1435498694, "chile", 4.9, "reviewed",
          [-70.6202, -21.4265, 52.24], None),
         (1609879110, "hawaii", 2.0099, "automatic",
          [-155.429000854492, 19.2180004119873, 33.2999992370605], "yellow"),
         (1224994646, "indonesia", 4.8, "reviewed",
          [126.419, 0.2661, 10], "green"),
         (1801059964, "nevada", 0.5, "automatic",
          [-116.242, 36.7564, 0.8], None),
         (1262739669, "arkansas", 1.9, "reviewed",
          [-91.4295, 35.863, 16.41], "green"),
         (1890118874, "montana", 1.33, "reviewed",
          [-110.434, 44.4718333, 2.21], None),
         (1025727100, "oklahoma", 1.58, "reviewed",
          [-98.53233333, 36.57083333, 6.31], None),
         (1834567116, "idaho", 2.6, "reviewed",
          [-115.186, 44.2666, 10], "green")],
        ["date", "place", "mag", "status", "coordinates", "alert"]).collect()

    self.assertEqual(transformed_df, expected_result)
def test_should_have_same_df_when_column_param_is_empty(self):
    """An empty column list must leave the dataframe unchanged."""
    under_test = Transformation(self.test_data)
    under_test.lowercase([])

    result_rows = under_test.dataframe.collect()

    # No columns were targeted, so the output equals the input fixture.
    self.assertEqual(result_rows, self.test_data.collect())
def test_should_have_same_df_when_column_not_exist_in_dataframe(self):
    """Lowercasing an unknown column must be a no-op on the dataframe."""
    under_test = Transformation(self.test_data)
    under_test.lowercase(["xpto"])

    result_rows = under_test.dataframe.collect()

    # "xpto" is not a column of the fixture, so nothing should change.
    self.assertEqual(result_rows, self.test_data.collect())
def test_should_return_transformed_data_using_all_pipeline_components(
        self, mock_get_data):
    """End-to-end check: extract → transform → load, then compare CSV output.

    The API call is mocked (mock_get_data) so extraction reads the fake
    payload; the transformed dataframe is written to CSV and read back
    for comparison against the expected fixture rows.
    """
    self.create_tmp_folder()

    # --- Extraction (API mocked) ---
    api_input = ApiInput(self.FAKE_URL)
    mock_get_data.return_value = self.FAKE_INPUT_DATA
    extraction = Extraction(api_input)
    extraction.extract()

    source_df = self.spark.createDataFrame(
        extraction.data,
        ["date", "place", "mag", "status", "coordinates", "alert"])

    # --- Transformation: every pipeline component exercised once ---
    transformation = Transformation(source_df)
    transformation.drop(["alert"])
    transformation.rename({"mag": "magnitude", "place": "city"})
    transformation.replace_null_values({"status": "Automatic"})
    transformation.lowercase(["status"])
    transformation.convert_data_type({"date": IntegerType()})
    transformation.split_content(
        "coordinates", ["longitude", "latitude", "depth"])

    # --- Loading to CSV ---
    loading = Loading(CsvStorage(self.OUTPUT_FILEPATH))
    loading.load(transformation.dataframe)

    # Read the written file back and compare against the expected rows.
    current_result = self.spark \
        .read \
        .csv(self.OUTPUT_FILEPATH, header=True, inferSchema=True) \
        .collect()
    expected_result = self.spark \
        .createDataFrame(self.FAKE_EXPECTED_DATA,
                         ["date", "city", "magnitude", "status",
                          "longitude", "latitude", "depth"]) \
        .collect()

    self.assertEqual(current_result, expected_result)
    self.delete_test_file()