Beispiel #1
0
    def test_should_return_same_df_when_columns_param_is_empty(self):
        """Check ``split_content`` with an empty target list.

        Calls ``split_content("", [])`` and asserts that the column list of
        the resulting dataframe equals the fixture's column list.
        """
        subject = Transformation(self.test_data)
        subject.split_content("", [])

        # Only column names are compared here, not row contents.
        self.assertEqual(subject.dataframe.columns, self.test_data.columns)
Beispiel #2
0
    def test_should_return_same_df_when_columns_param_is_empty(self):
        """Check ``replace_content`` with an empty replacement mapping.

        Calls ``replace_content("mag", {})`` and asserts that the collected
        rows equal the rows collected from the fixture.
        """
        subject = Transformation(self.test_data)
        subject.replace_content("mag", {})

        actual_rows = subject.dataframe.collect()
        fixture_rows = self.test_data.collect()
        self.assertEqual(actual_rows, fixture_rows)
Beispiel #3
0
    def test_should_return_same_df_when_column_not_exists(self):
        """Check ``split_content`` on the source column name ``"time"``.

        Asserts that the column list after the call equals the fixture's
        column list.
        """
        target_columns = ["day", "month", "year"]
        subject = Transformation(self.test_data)
        subject.split_content("time", target_columns)

        self.assertEqual(subject.dataframe.columns, self.test_data.columns)
Beispiel #4
0
    def test_should_return_same_df_when_columns_params_is_empty(self):
        """Check ``convert_data_type`` with an empty mapping.

        Asserts that the dataframe's ``dtypes`` equal the fixture's
        ``dtypes`` after the call.
        """
        subject = Transformation(self.test_data)
        subject.convert_data_type({})

        self.assertEqual(subject.dataframe.dtypes, self.test_data.dtypes)
Beispiel #5
0
    def test_should_return_same_df_when_columns_not_exists(self):
        """Check ``convert_data_type`` for the column name ``"dt"``.

        Requests a ``DateType`` conversion for ``"dt"`` and asserts that the
        dataframe's ``dtypes`` equal the fixture's ``dtypes`` afterwards.
        """
        requested_types = {"dt": DateType()}
        subject = Transformation(self.test_data)
        subject.convert_data_type(requested_types)

        self.assertEqual(subject.dataframe.dtypes, self.test_data.dtypes)
Beispiel #6
0
    def test_should_return_same_df_when_columns_not_exists_in_df(self):
        """Check ``replace_null_values`` for the column name ``"magnitude"``.

        Asserts that the collected rows after the call equal the rows
        collected from the fixture.
        """
        subject = Transformation(self.test_data)
        subject.replace_null_values({"magnitude": 0})

        actual_rows = subject.dataframe.collect()
        fixture_rows = self.test_data.collect()
        self.assertEqual(actual_rows, fixture_rows)
Beispiel #7
0
    def test_should_return_same_columns_when_column_not_exist_in_df(self):
        """Check ``rename`` for the source column name ``"dt"``.

        Asserts that the column list after the call equals the fixture's
        column list.
        """
        subject = Transformation(self.test_data)
        subject.rename({"dt": "date"})

        self.assertEqual(subject.dataframe.columns, self.test_data.columns)
Beispiel #8
0
    def test_should_return_same_columns_when_column_param_is_empty(self):
        """Check ``rename`` with an empty mapping.

        Asserts that the column list after the call equals the fixture's
        column list.
        """
        subject = Transformation(self.test_data)
        subject.rename({})

        self.assertEqual(subject.dataframe.columns, self.test_data.columns)
Beispiel #9
0
    def test_should_replace_two_columns_name(self):
        """Rename ``mag`` and ``status`` and check the full column list.

        The comparison uses ``assertEqual`` on lists, so column order is
        part of the expectation.
        """
        renames = {"mag": "magnitude", "status": "new_status"}
        subject = Transformation(self.test_data)
        subject.rename(renames)

        actual = subject.dataframe.columns
        expected = [
            "date",
            "place",
            "magnitude",
            "new_status",
            "coordinates",
            "alert",
        ]
        self.assertEqual(actual, expected)
Beispiel #10
0
    def test_should_convert_data_type_one_column(self):
        """Convert ``mag`` to ``IntegerType`` and check every column's dtype.

        The expectation lists ``(column, dtype)`` pairs for all six columns,
        with ``mag`` reported as ``"int"``.
        """
        subject = Transformation(self.test_data)
        subject.convert_data_type({"mag": IntegerType()})

        actual = subject.dataframe.dtypes
        expected = [
            ("date", "bigint"),
            ("place", "string"),
            ("mag", "int"),
            ("status", "string"),
            ("coordinates", "array<double>"),
            ("alert", "string"),
        ]
        self.assertEqual(actual, expected)
Beispiel #11
0
    def test_should_split_column_content_into_three_new_columns(self):
        """Split ``coordinates`` into ``longitude``, ``latitude``, ``depth``.

        Uses ``assertCountEqual``, so the expected column names are checked
        without regard to their order.
        """
        new_columns = ["longitude", "latitude", "depth"]
        subject = Transformation(self.test_data)
        subject.split_content("coordinates", new_columns)

        actual = subject.dataframe.columns
        expected = [
            "date",
            "place",
            "mag",
            "status",
            "longitude",
            "latitude",
            "depth",
            "alert",
        ]
        self.assertCountEqual(actual, expected)
Beispiel #12
0
    def test_should_convert_data_type_two_columns_when_one_column_name_not_exists(
            self):
        """Convert ``mag``, ``coordinates`` and ``date`` and check all dtypes.

        NOTE(review): the test name mentions two columns and one missing
        column name, but the mapping below holds three names and the
        expectation shows all three converted - confirm the intended
        scenario.
        """
        requested_types = {
            "mag": IntegerType(),
            "coordinates": ArrayType(StringType()),
            "date": TimestampType(),
        }
        subject = Transformation(self.test_data)
        subject.convert_data_type(requested_types)

        actual = subject.dataframe.dtypes
        expected = [
            ("date", "timestamp"),
            ("place", "string"),
            ("mag", "int"),
            ("status", "string"),
            ("coordinates", "array<string>"),
            ("alert", "string"),
        ]
        self.assertEqual(actual, expected)
Beispiel #13
0
    def test_should_remove_two_columns_from_dataframe(self):
        """Drop ``coordinates`` and ``alert`` and check the remaining columns.

        Only column names are asserted, so the expectation is a plain list
        rather than a throwaway Spark DataFrame built solely to read its
        ``columns`` attribute.
        """
        transformation = Transformation(self.test_data)
        transformation.drop(["coordinates", "alert"])

        current_result = transformation.dataframe.columns
        expected_result = ["date", "place", "mag", "status"]

        self.assertEqual(current_result, expected_result)
    def test_should_return_transformed_data_using_all_pipeline_components(
            self, mock_get_data):
        """Run extraction, transformation and CSV loading end to end.

        ``mock_get_data`` is an injected mock (presumably supplied by a
        ``patch`` decorator outside this block - confirm); its return value
        is set to ``FAKE_INPUT_DATA`` before extraction runs. The CSV
        written by the loading step is read back and compared with a
        dataframe built from ``FAKE_EXPECTED_DATA``.
        """
        self.create_tmp_folder()

        # Clean up in ``finally`` so that a failing step or assertion does
        # not leave the test output behind.
        try:
            fake_api_input = ApiInput(self.FAKE_URL)
            mock_get_data.return_value = self.FAKE_INPUT_DATA

            extraction_process = Extraction(fake_api_input)
            extraction_process.extract()

            raw_data = extraction_process.data
            raw_df = self.spark.createDataFrame(
                raw_data,
                ["date", "place", "mag", "status", "coordinates", "alert"])

            transformation_process = Transformation(raw_df)
            transformation_process.drop(["alert"])
            transformation_process.rename(
                {"mag": "magnitude", "place": "city"})
            transformation_process.replace_null_values(
                {"status": "Automatic"})
            transformation_process.lowercase(["status"])
            transformation_process.convert_data_type({"date": IntegerType()})
            transformation_process.split_content(
                "coordinates", ["longitude", "latitude", "depth"])
            transformed_df = transformation_process.dataframe

            csv_storage = CsvStorage(self.OUTPUT_FILEPATH)
            loading_process = Loading(csv_storage)
            loading_process.load(transformed_df)

            # Read the written CSV back with schema inference for comparison.
            current_result = self.spark \
                .read \
                .csv(self.OUTPUT_FILEPATH, header=True, inferSchema=True) \
                .collect()
            expected_columns = [
                "date", "city", "magnitude", "status",
                "longitude", "latitude", "depth",
            ]
            expected_result = self.spark \
                .createDataFrame(self.FAKE_EXPECTED_DATA, expected_columns) \
                .collect()

            self.assertEqual(current_result, expected_result)
        finally:
            self.delete_test_file()