Ejemplo n.º 1
0
        def test_append_to_static_partition(self, input_df, default_params,
                                            partition, full_table_name,
                                            spark_session):
            """Check that loading with ``clear_partition=False`` appends rows.

            The two partition default values are taken from ``partition``,
            ``input_df`` is filtered with the resulting partition query and
            the filtered rows are loaded. Afterwards the table is expected to
            contain its previous row count plus the number of loaded rows.
            """
            (
                default_params["partition_definitions"][0]["default_value"],
                default_params["partition_definitions"][1]["default_value"],
            ) = partition
            default_params["clear_partition"] = False
            loader = HiveLoader(**default_params)
            # Swap the ", " separators of the partition query for " and " so
            # the string can be used as a DataFrame filter expression.
            where_clause = construct_partition_query(
                loader.partition_definitions).replace(", ", " and ")
            df_to_load = input_df.where(where_clause)

            count_pre_total = spark_session.table(full_table_name).count()
            count_to_load = df_to_load.count()
            count_post_total = count_pre_total + count_to_load

            # Reuse the count computed above instead of running it again.
            assert count_to_load > 0, "DataFrame to load is empty!"
            loader.load(df_to_load)

            spark_session.catalog.refreshTable(full_table_name)

            assert (
                spark_session.table(
                    full_table_name).count() == count_post_total
            ), "test partition was not successfully loaded to output hive table"
Ejemplo n.º 2
0
        def test_add_new_static_partition_with_overwritten_partition_value(
                self, input_df, default_params, partition, full_table_name,
                spark_session):
            """Check row counts after loading all of ``input_df`` without clearing.

            The two partition default values are taken from ``partition`` and
            the complete, unfiltered ``input_df`` is loaded with
            ``clear_partition=False``. The test then expects the table to hold
            twice the input row count, and the target partition to have grown
            by the full input row count.
            """
            (
                default_params["partition_definitions"][0]["default_value"],
                default_params["partition_definitions"][1]["default_value"],
            ) = partition
            default_params["clear_partition"] = False
            loader = HiveLoader(**default_params)
            # Swap the ", " separators of the partition query for " and " so
            # the string can be used as a DataFrame filter expression.
            where_clause = construct_partition_query(
                loader.partition_definitions).replace(", ", " and ")
            output_table = spark_session.table(full_table_name)

            count_pre_partition = output_table.where(where_clause).count()
            # Count the input once, before the load, and reuse the value for
            # every expectation below.
            count_input = input_df.count()
            # Presumably the table already holds one copy of input_df from the
            # fixture setup, hence the factor of two -- TODO confirm.
            count_post_total = count_input * 2

            assert count_input > 0, "Dataframe to load is empty!"
            loader.load(input_df)

            # NOTE(review): no catalog.refreshTable() call is made here before
            # counting; confirm the pre-built ``output_table`` sees new rows.
            assert (
                output_table.count() == count_post_total
            ), "test partition was not successfully loaded to output hive table"

            assert (
                output_table.where(where_clause).count() == count_input +
                count_pre_partition
            ), "test partition was not successfully loaded to output hive table"
Ejemplo n.º 3
0
        def test_overwrite_static_partition(self, input_df, default_params,
                                            partition, full_table_name,
                                            spark_session):
            """Check that re-loading one partition leaves the total row count unchanged.

            The two partition default values are taken from ``partition`` and
            the matching slice of ``input_df`` is loaded with the loader's
            default parameters. The table must start with as many rows as
            ``input_df`` and must still have that many rows after the load.
            """
            (
                default_params["partition_definitions"][0]["default_value"],
                default_params["partition_definitions"][1]["default_value"],
            ) = partition
            loader = HiveLoader(**default_params)
            # Swap the ", " separators of the partition query for " and " so
            # the string can be used as a DataFrame filter expression.
            where_clause = construct_partition_query(
                loader.partition_definitions).replace(", ", " and ")
            df_to_load = input_df.where(where_clause)

            count_pre_total = spark_session.table(full_table_name).count()
            count_to_load = df_to_load.count()
            count_post_total = input_df.count()
            # Guard against a broken fixture: the table has to mirror input_df.
            assert (
                count_post_total == count_pre_total
            ), "Something went wrong in the test setup of the input DataFrame (input_df)"

            # Reuse the count computed above instead of running it again.
            assert count_to_load > 0, "DataFrame to load is empty!"
            loader.load(df_to_load)

            spark_session.catalog.refreshTable(full_table_name)

            assert (
                spark_session.table(
                    full_table_name).count() == count_post_total
            ), "test partition was not successfully loaded to output hive table"
Ejemplo n.º 4
0
 def test_default_value_is_missing(self, default_params, input_df):
     """Expect an AssertionError when a partition has no ``default_value``.

     The ``default_value`` key is removed from the second partition
     definition; constructing the loader or calling ``load`` must then
     raise an AssertionError carrying the expected message.
     """
     second_partition = default_params["partition_definitions"][1]
     second_partition.pop("default_value")
     with pytest.raises(AssertionError) as excinfo:
         HiveLoader(**default_params).load(input_df)
     expected_message = "No default partition value set for partition column"
     assert expected_message in str(excinfo.value)
Ejemplo n.º 5
0
        def test_add_new_static_partition(self, input_df, default_params,
                                          partition, full_table_name,
                                          spark_session):
            """Check that a dropped partition is restored by loading it again.

            The first partition definition's default value is set to
            ``partition``. The matching partition is dropped from the table
            via SQL, the corresponding slice of ``input_df`` is loaded, and
            the table is expected to end up with as many rows as ``input_df``.
            """
            default_params["partition_definitions"][0][
                "default_value"] = partition
            loader = HiveLoader(**default_params)
            partition_query = construct_partition_query(
                default_params["partition_definitions"])
            # Negate the query ("=" -> "!=") and join its parts with " and "
            # to select the rows that are NOT in the partition under test.
            inverted_partition_query = partition_query.replace(
                "=", "!=").replace(", ", " and ")
            df_to_load = input_df.where(partition_query)

            count_pre_total = input_df.where(inverted_partition_query).count()
            count_to_load = df_to_load.count()
            count_post_total = input_df.count()
            # Guard against a broken fixture: both slices must add up.
            assert (
                count_post_total == count_pre_total + count_to_load
            ), "Something went wrong in the test setup of the input dataframe (input_df)"

            spark_session.sql(
                "alter table {tbl} drop partition ({part_def})".format(
                    tbl=full_table_name, part_def=partition_query))
            assert (
                spark_session.table(full_table_name).count() == count_pre_total
            ), "test partition was not successfully dropped from output hive table"

            # Reuse the count computed above instead of running it again.
            assert count_to_load > 0, "Dataframe to load is empty!"
            loader.load(df_to_load)

            spark_session.catalog.refreshTable(full_table_name)

            assert (
                spark_session.table(
                    full_table_name).count() == count_post_total
            ), "test partition was not successfully loaded to output hive table"
Ejemplo n.º 6
0
 def test_default_value_is_empty(self, default_value, default_params,
                                 input_df):
     """Expect an AssertionError when a partition's ``default_value`` is empty.

     The first partition definition's ``default_value`` is replaced with
     the ``default_value`` argument; constructing the loader or calling
     ``load`` must then raise an AssertionError carrying the expected
     message.
     """
     # NOTE(review): this used to be a tuple assignment that wrote 3 and
     # then default_value to this very same key, so the 3 was a dead store.
     # If the intent was [0] = 3 and [1] = default_value, confirm and adjust.
     default_params["partition_definitions"][0][
         "default_value"] = default_value
     with pytest.raises(AssertionError) as excinfo:
         loader = HiveLoader(**default_params)
         loader.load(input_df)
     assert "No default partition value set for partition column" in str(
         excinfo.value)
Ejemplo n.º 7
0
        def test_create_partitioned_table(self, input_df, default_params,
                                          partition, full_table_name,
                                          spark_session):
            """Check that ``auto_create_table=True`` creates a partitioned table.

            The output table is dropped first, then the slice of ``input_df``
            matching ``partition`` is loaded. Afterwards the table must exist
            in the database named by ``default_params["db_name"]``, contain
            exactly the loaded rows and report at least one partition.
            """
            (
                default_params["partition_definitions"][0]["default_value"],
                default_params["partition_definitions"][1]["default_value"],
            ) = partition
            default_params["auto_create_table"] = True
            loader = HiveLoader(**default_params)
            spark_session.sql("drop table if exists " + full_table_name)

            # listTables() below is called without arguments, so switch to
            # the target database first.
            spark_session.catalog.setCurrentDatabase(default_params["db_name"])
            assert default_params["table_name"] not in [
                tbl.name for tbl in spark_session.catalog.listTables()
            ], "Test setup of database is not clean. Table already exists!"

            # Swap the ", " separators of the partition query for " and " so
            # the string can be used as a DataFrame filter expression.
            where_clause = construct_partition_query(
                loader.partition_definitions).replace(", ", " and ")
            df_to_load = input_df.where(where_clause)

            count_to_load = df_to_load.count()

            # Reuse the count computed above instead of running it again.
            assert count_to_load > 0, "DataFrame to load is empty!"
            loader.load(df_to_load)

            spark_session.catalog.refreshTable(full_table_name)
            assert default_params["table_name"] in [
                tbl.name for tbl in spark_session.catalog.listTables()
            ], "Table was not created!"

            assert (
                spark_session.table(full_table_name).count() == count_to_load
            ), "test partition was not successfully loaded to automatically created output hive table"

            # A Py4JJavaError from "show partitions" is reported as a test
            # failure saying the table is not partitioned.
            try:
                assert spark_session.sql("show partitions " +
                                         full_table_name).count() > 0
            except Py4JJavaError as e:
                raise AssertionError("Created table is not partitioned. " +
                                     str(e))