def test_insert_to_non_existent_table(self):
    dataset = SparkHiveDataSet(
        database="default_1", table="table_not_yet_created", write_mode="insert"
    )
    dataset.save(_generate_spark_df_one())
    assert_df_equal(
        dataset.load().sort("name"), _generate_spark_df_one().sort("name")
    )
def test_overwrite_empty_table(self, spark_hive_session):
    spark_hive_session.sql(
        "create table default_1.test_overwrite_empty_table (name string, age integer)"
    ).take(1)
    dataset = SparkHiveDataSet(
        database="default_1",
        table="test_overwrite_empty_table",
        write_mode="overwrite",
    )
    dataset.save(_generate_spark_df_one())
    assert_df_equal(dataset.load(), _generate_spark_df_one())
def test_invalid_schema_insert(self, spark_hive_session):
    spark_hive_session.sql(
        "create table default_1.test_invalid_schema_insert "
        "(name string, additional_column_on_hive integer)"
    ).take(1)
    dataset = SparkHiveDataSet(
        database="default_1",
        table="test_invalid_schema_insert",
        write_mode="insert",
    )
    with pytest.raises(
        DataSetError,
        match=r"dataset does not match hive table schema\.\n"
        r"Present on insert only: \[\('age', 'int'\)\]\n"
        r"Present on schema only: \[\('additional_column_on_hive', 'int'\)\]",
    ):
        dataset.save(_generate_spark_df_one())
def test_upsert_not_empty_table(self, spark_hive_session):
    spark_hive_session.sql(
        "create table default_1.test_upsert_not_empty_table (name string, age integer)"
    ).take(1)
    dataset = SparkHiveDataSet(
        database="default_1",
        table="test_upsert_not_empty_table",
        write_mode="upsert",
        table_pk=["name"],
    )
    dataset.save(_generate_spark_df_one())
    dataset.save(_generate_spark_df_upsert())
    assert_df_equal(
        dataset.load().sort("name"),
        _generate_spark_df_upsert_expected().sort("name"),
    )
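
# The tests above call module-level helpers that are defined elsewhere in this
# file. A minimal sketch of what they could look like is given below, assuming a
# ("name", "age") schema as implied by the CREATE TABLE statements; the concrete
# row values and the assert_df_equal implementation are illustrative assumptions,
# not the project's actual helpers.

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

_SCHEMA = StructType(
    [
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
    ]
)


def _generate_spark_df_one():
    # Base dataset written by every test (rows are illustrative).
    data = [("Alex", 31), ("Bob", 12)]
    return SparkSession.builder.getOrCreate().createDataFrame(data, _SCHEMA)


def _generate_spark_df_upsert():
    # Overlaps _generate_spark_df_one on the primary key "name" so the upsert
    # test exercises both the update path and the insert path.
    data = [("Alex", 32), ("Evan", 23)]
    return SparkSession.builder.getOrCreate().createDataFrame(data, _SCHEMA)


def _generate_spark_df_upsert_expected():
    # Expected table state after upserting _generate_spark_df_upsert into
    # _generate_spark_df_one: "Alex" updated, "Bob" unchanged, "Evan" added.
    data = [("Alex", 32), ("Bob", 12), ("Evan", 23)]
    return SparkSession.builder.getOrCreate().createDataFrame(data, _SCHEMA)


def assert_df_equal(expected, result):
    # Simplified row-by-row equality check; callers sort by "name" first when
    # row order is not guaranteed.
    assert expected.collect() == result.collect()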