Example #1
        def test_overwrite_static_partition(self, input_df, default_params,
                                            partition, full_table_name,
                                            spark_session):
            """Overwriting an existing static partition keeps the total row count of the table unchanged."""
            (
                default_params["partition_definitions"][0]["default_value"],
                default_params["partition_definitions"][1]["default_value"],
            ) = partition
            loader = HiveLoader(**default_params)
            where_clause = construct_partition_query(
                loader.partition_definitions).replace(", ", " and ")
            df_to_load = input_df.where(where_clause)

            count_pre_total = spark_session.table(full_table_name).count()
            count_to_load = df_to_load.count()
            count_post_total = input_df.count()
            assert (
                count_post_total == count_pre_total
            ), "Something went wrong in the test setup of the input DataFrame (input_df)"

            assert count_to_load > 0, "DataFrame to load is empty!"
            loader.load(df_to_load)

            spark_session.catalog.refreshTable(full_table_name)

            assert (
                spark_session.table(
                    full_table_name).count() == count_post_total
            ), "test partition was not successfully loaded to output hive table"
Example #2
        def test_add_new_static_partition_with_overwritten_partition_value(
                self, input_df, default_params, partition, full_table_name,
                spark_session):
            """With clear_partition=False, loading appends: the table count doubles and the target partition grows by the loaded rows."""
            (
                default_params["partition_definitions"][0]["default_value"],
                default_params["partition_definitions"][1]["default_value"],
            ) = partition
            default_params["clear_partition"] = False
            loader = HiveLoader(**default_params)
            where_clause = construct_partition_query(
                loader.partition_definitions).replace(", ", " and ")
            output_table = spark_session.table(full_table_name)

            count_pre_partition = output_table.where(where_clause).count()
            count_post_partition = input_df.count()
            count_post_total = input_df.count() * 2

            assert input_df.count() > 0, "DataFrame to load is empty!"
            loader.load(input_df)

            assert (
                output_table.count() == count_post_total
            ), "input DataFrame was not appended to the output hive table"

            assert (
                output_table.where(where_clause).count()
                == count_post_partition + count_pre_partition
            ), "rows were not appended to the existing partition of the output hive table"
Example #3
        def test_append_to_static_partition(self, input_df, default_params,
                                            partition, full_table_name,
                                            spark_session):
            """Appending to an existing static partition (clear_partition=False) grows the table by the loaded row count."""
            (
                default_params["partition_definitions"][0]["default_value"],
                default_params["partition_definitions"][1]["default_value"],
            ) = partition
            default_params["clear_partition"] = False
            loader = HiveLoader(**default_params)
            where_clause = construct_partition_query(
                loader.partition_definitions).replace(", ", " and ")
            df_to_load = input_df.where(where_clause)

            count_pre_total = spark_session.table(full_table_name).count()
            count_to_load = df_to_load.count()
            count_post_total = count_pre_total + count_to_load

            assert count_to_load > 0, "DataFrame to load is empty!"
            loader.load(df_to_load)

            spark_session.catalog.refreshTable(full_table_name)

            assert (
                spark_session.table(
                    full_table_name).count() == count_post_total
            ), "test partition was not successfully loaded to output hive table"
Example #4
        def test_add_new_static_partition(self, input_df, default_params,
                                          partition, full_table_name,
                                          spark_session):
            """Loading into a freshly dropped partition re-adds exactly the dropped rows."""
            default_params["partition_definitions"][0][
                "default_value"] = partition
            loader = HiveLoader(**default_params)
            partition_query = construct_partition_query(
                default_params["partition_definitions"])
            inverted_partition_query = partition_query.replace(
                "=", "!=").replace(", ", " and ")
            df_to_load = input_df.where(partition_query)

            count_pre_total = input_df.where(inverted_partition_query).count()
            count_to_load = df_to_load.count()
            count_post_total = input_df.count()
            assert (
                count_post_total == count_pre_total + count_to_load
            ), "Something went wrong in the test setup of the input dataframe (input_df)"

            spark_session.sql(
                "alter table {tbl} drop partition ({part_def})".format(
                    tbl=full_table_name, part_def=partition_query))
            assert (
                spark_session.table(full_table_name).count() == count_pre_total
            ), "test partition was not successfully dropped from output hive table"

            assert count_to_load > 0, "DataFrame to load is empty!"
            loader.load(df_to_load)

            spark_session.catalog.refreshTable(full_table_name)

            assert (
                spark_session.table(
                    full_table_name).count() == count_post_total
            ), "test partition was not successfully loaded to output hive table"
Example #5
    def test_default_value_is_missing(self, default_params, input_df):
        default_params["partition_definitions"][1].pop("default_value")
        with pytest.raises(AssertionError) as excinfo:
            loader = HiveLoader(**default_params)
            loader.load(input_df)
        assert "No default partition value set for partition column" in str(
            excinfo.value)
Example #6
    def test_default_value_is_empty(self, default_value, default_params,
                                    input_df):
        (
            default_params["partition_definitions"][0]["default_value"],
            default_params["partition_definitions"][1]["default_value"],
        ) = (3, default_value)
        with pytest.raises(AssertionError) as excinfo:
            loader = HiveLoader(**default_params)
            loader.load(input_df)
        assert "No default partition value set for partition column" in str(
            excinfo.value)
Example #7
    def test_column_name_is_missing(self, default_params):
        (
            default_params["partition_definitions"][0]["column_name"],
            default_params["partition_definitions"][1]["column_name"],
        ) = (None, "f")
        with pytest.raises(AssertionError) as excinfo:
            HiveLoader(**default_params)
        assert "No column name set!" in str(excinfo.value)
Example #8
    def test_input_is_not_a_list(self, partition_definitions, default_params):
        default_params["partition_definitions"] = partition_definitions
        with pytest.raises(AssertionError) as excinfo:
            HiveLoader(**default_params)
        assert "partition_definitions has to be a list containing dicts" in str(
            excinfo.value)
Example #9
        def test_clear_partition(self, spark_session, input_df, partition,
                                 default_params, full_table_name):
            """Partition is dropped"""
            (
                default_params["partition_definitions"][0]["default_value"],
                default_params["partition_definitions"][1]["default_value"],
            ) = partition
            loader = HiveLoader(**default_params)
            partition_query = construct_partition_query(
                loader.partition_definitions).replace(", ", " and ")
            inverted_partition_query = partition_query.replace("=", "!=")
            expected_count = input_df.where(inverted_partition_query).count()
            loader._clear_hive_partition()
            actual_count = spark_session.table(full_table_name).count()

            assert actual_count == expected_count
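
The private helper _clear_hive_partition is never shown in these snippets; the assertion above only tells us that it removes the rows of the configured partition. A hedged sketch of one possible implementation (the attribute names spark, db_name, and table_name are assumptions):

    def _clear_hive_partition(self):
        # Hypothetical: drop the configured partition so a subsequent
        # write starts from an empty partition.
        partition_spec = construct_partition_query(self.partition_definitions)
        self.spark.sql(
            "alter table {db}.{tbl} drop if exists partition ({spec})".format(
                db=self.db_name, tbl=self.table_name, spec=partition_spec))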
Example #10
    def test_list_input_contains_non_dict_items(self, partition_definitions,
                                                default_params):
        default_params["partition_definitions"] = [partition_definitions]
        with pytest.raises(AssertionError) as excinfo:
            HiveLoader(**default_params)
        assert "Items of partition_definitions have to be dictionaries" in str(
            excinfo.value)
Example #11
    def test_column_type_not_a_valid_spark_sql_type(self, data_type,
                                                    default_params):
        default_params["partition_definitions"][0]["column_type"] = data_type
        with pytest.raises(AssertionError) as excinfo:
            HiveLoader(**default_params)
        assert "Not a valid (PySpark) datatype for the partition column" in str(
            excinfo.value)
Example #12
        def test_create_partitioned_table(self, input_df, default_params,
                                          partition, full_table_name,
                                          spark_session):
            """With auto_create_table=True, load() creates the partitioned output table when it does not already exist."""
            (
                default_params["partition_definitions"][0]["default_value"],
                default_params["partition_definitions"][1]["default_value"],
            ) = partition
            default_params["auto_create_table"] = True
            loader = HiveLoader(**default_params)
            spark_session.sql("drop table if exists " + full_table_name)

            spark_session.catalog.setCurrentDatabase(default_params["db_name"])
            assert default_params["table_name"] not in [
                tbl.name for tbl in spark_session.catalog.listTables()
            ], "Test setup of database is not clean. Table already exists!"

            where_clause = construct_partition_query(
                loader.partition_definitions).replace(", ", " and ")
            df_to_load = input_df.where(where_clause)

            count_to_load = df_to_load.count()

            assert count_to_load > 0, "DataFrame to load is empty!"
            loader.load(df_to_load)

            spark_session.catalog.refreshTable(full_table_name)
            assert default_params["table_name"] in [
                tbl.name for tbl in spark_session.catalog.listTables()
            ], "Table was not created!"

            assert (
                spark_session.table(full_table_name).count() == count_to_load
            ), "test partition was not successfully loaded to automatically created output hive table"

            try:
                # "show partitions" fails on a non-partitioned table; in this
                # setup the error surfaces as a Py4JJavaError.
                assert spark_session.sql("show partitions " +
                                         full_table_name).count() > 0
            except Py4JJavaError as e:
                raise AssertionError("Created table is not partitioned. " +
                                     str(e))
Example #13
# Presumably registered as a pytest fixture in the source module
# (the decorator is not shown in this snippet).
def default_loader(default_params):
    return HiveLoader(**default_params)
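
All of the examples are methods cut out of a larger pytest suite; the imports and the fixtures they consume (default_params, input_df, partition, full_table_name, spark_session) are not shown. Below is a minimal, hypothetical scaffolding that matches how the tests use them; every name and value is an assumption, not taken from the source project:

import pytest
from py4j.protocol import Py4JJavaError  # needed by Example #12
from pyspark.sql import SparkSession

# HiveLoader and construct_partition_query come from the project
# under test; their import paths are not shown in the snippets.


@pytest.fixture(scope="session")
def spark_session():
    return SparkSession.builder.enableHiveSupport().getOrCreate()


@pytest.fixture()
def default_params():
    # Hypothetical defaults; the real fixture supplies whatever
    # HiveLoader expects for database, table, and partitioning.
    return {
        "db_name": "test_db",
        "table_name": "test_table",
        "partition_definitions": [
            {"column_name": "dt", "column_type": "IntegerType",
             "default_value": 20200101},
            {"column_name": "country", "column_type": "IntegerType",
             "default_value": 1},
        ],
        "clear_partition": True,
        "auto_create_table": False,
    }


@pytest.fixture()
def full_table_name(default_params):
    return "{db}.{tbl}".format(db=default_params["db_name"],
                               tbl=default_params["table_name"])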