def test_insert_to_non_existent_table(self):
    dataset = SparkHiveDataSet(
        database="default_1", table="table_not_yet_created", write_mode="insert"
    )
    dataset.save(_generate_spark_df_one())
    assert_df_equal(
        dataset.load().sort("name"), _generate_spark_df_one().sort("name")
    )

def test_read_from_non_existent_table(self):
    dataset = SparkHiveDataSet(
        database="default_1", table="table_doesnt_exist", write_mode="insert"
    )
    with pytest.raises(
        DataSetError,
        match="requested table not found: default_1.table_doesnt_exist",
    ):
        dataset.load()

def test_overwrite_empty_table(self, spark_hive_session):
    spark_hive_session.sql(
        "create table default_1.test_overwrite_empty_table (name string, age integer)"
    ).take(1)
    dataset = SparkHiveDataSet(
        database="default_1",
        table="test_overwrite_empty_table",
        write_mode="overwrite",
    )
    dataset.save(_generate_spark_df_one())
    assert_df_equal(dataset.load(), _generate_spark_df_one())

def test_invalid_schema_insert(self, spark_hive_session):
    spark_hive_session.sql(
        "create table default_1.test_invalid_schema_insert "
        "(name string, additional_column_on_hive integer)"
    ).take(1)
    dataset = SparkHiveDataSet(
        database="default_1",
        table="test_invalid_schema_insert",
        write_mode="insert",
    )
    with pytest.raises(
        DataSetError,
        match=r"dataset does not match hive table schema\.\n"
        r"Present on insert only: \[\('age', 'int'\)\]\n"
        r"Present on schema only: \[\('additional_column_on_hive', 'int'\)\]",
    ):
        dataset.save(_generate_spark_df_one())

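# A minimal sketch (assumption, not the dataset's actual implementation) of the
# schema check the test above exercises: PySpark's DataFrame.dtypes yields
# (column, dtype) pairs, so a set difference in each direction reproduces the
# two "Present on ... only" lists in the error message.
def _schema_diff_sketch(insert_dtypes, table_dtypes):
    # e.g. insert_dtypes = [("name", "string"), ("age", "int")]
    #      table_dtypes  = [("name", "string"), ("additional_column_on_hive", "int")]
    insert_only = sorted(set(insert_dtypes) - set(table_dtypes))
    schema_only = sorted(set(table_dtypes) - set(insert_dtypes))
    return insert_only, schema_only
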
def test_upsert_config_err(self):
    # no pk provided should prompt config error
    with pytest.raises(
        DataSetError, match="table_pk must be set to utilise upsert read mode"
    ):
        SparkHiveDataSet(database="default_1", table="table_1", write_mode="upsert")

def test_cant_pickle(self):
    import pickle  # pylint: disable=import-outside-toplevel

    with pytest.raises(pickle.PicklingError):
        pickle.dumps(
            SparkHiveDataSet(
                database="default_1", table="table_1", write_mode="overwrite"
            )
        )

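# A likely cause of the PicklingError above (assumption; not confirmed by this
# section) is that the dataset holds a live SparkSession, whose py4j gateway
# cannot be serialised. Implementations often make the failure explicit, e.g.:
#
#     def __getstate__(self):
#         raise pickle.PicklingError("SparkHiveDataSet objects cannot be pickled")
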
def test_invalid_write_mode_provided(self):
    with pytest.raises(
        DataSetError, match="Invalid write_mode provided: not_a_write_mode"
    ):
        SparkHiveDataSet(
            database="default_1",
            table="table_1",
            write_mode="not_a_write_mode",
            table_pk=["name"],
        )

def test_invalid_pk_provided(self):
    with pytest.raises(
        DataSetError,
        match=r"columns \[column_doesnt_exist\] selected as PK not "
        r"found in table default_1\.table_1",
    ):
        SparkHiveDataSet(
            database="default_1",
            table="table_1",
            write_mode="upsert",
            table_pk=["column_doesnt_exist"],
        )

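# Sketch (assumption) of the PK validation exercised above: at construction
# time the requested key columns are presumably checked against the existing
# table's columns, along the lines of:
#
#     missing = [col for col in table_pk if col not in table_columns]
#     if missing:
#         raise DataSetError(f"columns {missing} selected as PK not found ...")
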
def test_upsert_not_empty_table(self, spark_hive_session):
    spark_hive_session.sql(
        "create table default_1.test_upsert_not_empty_table (name string, age integer)"
    ).take(1)
    dataset = SparkHiveDataSet(
        database="default_1",
        table="test_upsert_not_empty_table",
        write_mode="upsert",
        table_pk=["name"],
    )
    dataset.save(_generate_spark_df_one())
    dataset.save(_generate_spark_df_upsert())
    assert_df_equal(
        dataset.load().sort("name"),
        _generate_spark_df_upsert_expected().sort("name"),
    )

def test_read_existing_table(self):
    dataset = SparkHiveDataSet(
        database="default_1", table="table_1", write_mode="overwrite"
    )
    assert_df_equal(_generate_spark_df_one(), dataset.load())

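# The helper fixtures used throughout these tests are not shown in this
# section. Below is a minimal sketch: the "name string, age int" schema is
# taken from the CREATE TABLE statements above, and the upsert frames are built
# so that _generate_spark_df_upsert_expected() equals _generate_spark_df_one()
# with PK-matching rows replaced and new rows appended. The row values
# themselves are illustrative assumptions, not the suite's real data.
from pyspark.sql import SparkSession  # sketch-only import


def _generate_spark_df_one():
    data = [("Alex", 31), ("Bob", 12)]  # illustrative rows (assumption)
    return SparkSession.builder.getOrCreate().createDataFrame(
        data, schema="name string, age int"
    )


def _generate_spark_df_upsert():
    # updates an existing PK ("Alex") and introduces a new one ("Evan")
    data = [("Alex", 99), ("Evan", 23)]
    return SparkSession.builder.getOrCreate().createDataFrame(
        data, schema="name string, age int"
    )


def _generate_spark_df_upsert_expected():
    # merge of the two frames above, with the upsert frame winning on PK clashes
    data = [("Alex", 99), ("Bob", 12), ("Evan", 23)]
    return SparkSession.builder.getOrCreate().createDataFrame(
        data, schema="name string, age int"
    )


def assert_df_equal(expected, result):
    # order-insensitive row equality; sufficient for these small frames
    assert sorted(expected.collect()) == sorted(result.collect())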