def test_check_hql(self, add_tags_patch, _):
    logging.info("Starting TestCreateTableS3.test_check_hql()")

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    m3d_config_file, _, _, _, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {
        "emr_cluster_id": self.emr_cluster_id
    }

    logging.info("Calling M3D.create_table().")
    M3D.create_table(*table_config, **table_config_kwargs)

    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(fake_cluster.steps)

    hive_step = fake_cluster.steps[0]

    assert hive_step.args[0] == "hive"
    assert hive_step.args[1] == "--silent"
    assert hive_step.args[2] == "-f"

    db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
    db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

    ddl_landing = "CREATE DATABASE IF NOT EXISTS dev_landing;\n" \
                  "CREATE DATABASE IF NOT EXISTS dev_lake;\n" \
                  "CREATE EXTERNAL TABLE dev_landing.bi_test101_stg1(name1 varchar(21), name2 varchar(6), " \
                  "name3 varchar(4))\n" \
                  "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n" \
                  "LOCATION 's3://m3d-dev-landing/dev/bi/test101/data/'\n" \
                  "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

    ddl_lake = "CREATE EXTERNAL TABLE dev_lake.bi_test101(name1 varchar(21), name2 varchar(6), " \
               "name3 varchar(4))\n" \
               "PARTITIONED BY (year smallint, month smallint)\n" \
               "STORED AS PARQUET\n" \
               "LOCATION 's3://m3d-dev-lake/dev/bi/test101/data/'\n" \
               "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

    # Get content of hql in s3 bucket
    actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])

    expected_hql = \
        ddl_landing + "\n" + \
        "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table) + "\n" + \
        ddl_lake + "\n" + \
        "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)

    logging.info("Expected: {0}\n".format(expected_hql))
    logging.info("Actual: {0}\n".format(actual_hql_content_in_bucket))

    assert actual_hql_content_in_bucket == expected_hql

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert add_tags_patch_call_args_list[0][0][0] == [{
        "Key": "ApiMethod",
        "Value": "create_table"
    }]
    assert add_tags_patch_call_args_list[1][0][0] == [{
        "Key": "TargetTable",
        "Value": "dev_lake.bi_test101"
    }]
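
# The "inspect the submitted Hive step" pattern above recurs in every test in this
# class. A minimal sketch of a shared helper is below, assuming the same moto EMR
# backend and the existing get_object_content_from_s3() utility; the name
# _get_hive_step_hql is hypothetical and not part of the current test base.
def _get_hive_step_hql(self, step):
    # M3D submits each Hive step as ["hive", "--silent", "-f", <s3-path-to-hql>].
    assert step.args[0:3] == ["hive", "--silent", "-f"]
    # The fourth argument points at the generated HQL file staged in S3.
    return self.get_object_content_from_s3(step.args[3])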
def test_check_s3_cleanup(self, add_tags_patch, _):
    logging.info("Starting TestCreateTableS3.test_check_s3_cleanup()")

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    table_config_args = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {
        "emr_cluster_id": self.emr_cluster_id
    }

    db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
    db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

    bucket_landing = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["landing"]
    bucket_lake = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["lake"]

    test_content = "sample content"
    test_lake_key_filename = "test_lake_key"
    test_land_key_filename = "test_land_key"

    source_system = "bi"
    table = "test101"

    test_land_key = "{environment}/{source_system}/{table}/data/{obj_name}".format(
        environment=destination_environment,
        source_system=source_system,
        table=table,
        obj_name=test_land_key_filename
    )

    test_lake_key = "{environment}/{source_system}/{table}/data/{obj_name}".format(
        environment=destination_environment,
        source_system=source_system,
        table=table,
        obj_name=test_lake_key_filename
    )

    # adding data to landing and lake directories
    self.s3_resource.Bucket(bucket_landing).put_object(Key=test_land_key, Body=test_content)
    self.s3_resource.Bucket(bucket_lake).put_object(Key=test_lake_key, Body=test_content)

    logging.info("Calling M3D.create_table()")
    M3D.create_table(*table_config_args, **table_config_kwargs)

    logging.info("Calling M3D.drop_table()")
    M3D.drop_table(*table_config_args, **table_config_kwargs)

    # One step from create_table() plus two DROP steps from drop_table().
    emr_backend = self.mock_emr.backends[self.default_aws_region]
    fake_cluster = emr_backend.clusters[self.emr_cluster_id]

    assert 3 == len(fake_cluster.steps)

    # Get actual HQL statements
    actual_hqls = []

    for step in fake_cluster.steps:
        assert ["hive", "--silent", "-f"] == step.args[0:3]

        hql_file = step.args[3]
        hql_content = self.get_object_content_from_s3(hql_file)
        actual_hqls.append(hql_content)

    expected_hqls = [
        'DROP TABLE {}.{}{};'.format(db_landing, destination_table, m3d_config_dict["tags"]["table_suffix_stage"]),
        'DROP TABLE {}.{};'.format(db_lake, destination_table)
    ]

    assert expected_hqls == actual_hqls[1:3]

    # checking landing directory: drop_table() must leave the seeded data in place
    landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
    assert len(landing_files) == 1
    assert landing_files[0] == test_land_key

    # checking lake directory
    lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
    assert len(lake_files) == 1
    assert lake_files[0] == test_lake_key

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 4
    assert add_tags_patch_call_args_list[0][0][0] == [{
        "Key": "ApiMethod",
        "Value": "create_table"
    }]
    assert add_tags_patch_call_args_list[1][0][0] == [{
        "Key": "TargetTable",
        "Value": "dev_lake.bi_test101"
    }]
    assert add_tags_patch_call_args_list[2][0][0] == [{
        "Key": "ApiMethod",
        "Value": "drop_table"
    }]
    assert add_tags_patch_call_args_list[3][0][0] == [{
        "Key": "TargetTable",
        "Value": "dev_lake.bi_test101"
    }]
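
# test_check_s3_cleanup seeds one object per bucket and then asserts drop_table()
# left the data untouched. A small sketch of that listing step, assuming a boto3
# S3 resource like self.s3_resource; list_bucket_keys is a hypothetical helper,
# not part of the existing suite.
def list_bucket_keys(s3_resource, bucket_name):
    # Collect every key in the bucket so a test can assert on its exact contents.
    return sorted(obj.key for obj in s3_resource.Bucket(bucket_name).objects.all())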
def test_check_hql_single_partitioning(self, add_tags_patch, _):
    logging.info("Starting TestCreateTableS3.test_check_hql_single_partitioning()")

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test103"

    m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    TconxHelper.setup_tconx_from_file(
        m3d_config_dict["tags"]["config"],
        destination_system,
        destination_database,
        destination_environment,
        destination_table,
        S3TableTestBase.single_partition_tconx
    )

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {
        "emr_cluster_id": self.emr_cluster_id
    }

    logging.info("Calling M3D.create_table().")
    M3D.create_table(*table_config, **table_config_kwargs)

    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

    executed_steps = fake_cluster.steps
    assert len(executed_steps) == 1

    hive_step = executed_steps[0]

    assert hive_step.args[0] == "hive"
    assert hive_step.args[1] == "--silent"
    assert hive_step.args[2] == "-f"

    db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
    db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

    ddl_landing = "CREATE DATABASE IF NOT EXISTS dev_landing;\n" \
                  "CREATE DATABASE IF NOT EXISTS dev_lake;\n" \
                  "CREATE EXTERNAL TABLE dev_landing.bi_test103_stg1(name1 varchar(21), name2 varchar(6), " \
                  "name3 varchar(4))\n" \
                  "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n" \
                  "LOCATION 's3://m3d-dev-landing/dev/bi/test103/data/'\n" \
                  "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

    # With single partitioning, name1 moves out of the column list into PARTITIONED BY.
    ddl_lake = "CREATE EXTERNAL TABLE dev_lake.bi_test103(name2 varchar(6), name3 varchar(4))\n" \
               "PARTITIONED BY (name1 varchar(21))\n" \
               "STORED AS PARQUET\n" \
               "LOCATION 's3://m3d-dev-lake/dev/bi/test103/data/'\n" \
               "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

    # Get content of hql in s3 bucket
    actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])

    expected_hql = \
        ddl_landing + "\n" + \
        "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table) + "\n" + \
        ddl_lake + "\n" + \
        "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)

    logging.info("Expected: {0}\n".format(expected_hql))
    logging.info("Actual: {0}\n".format(actual_hql_content_in_bucket))

    assert actual_hql_content_in_bucket == expected_hql
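
# Comparing multi-line HQL with a bare "==" gives unhelpful failure output. Below
# is a sketch of a diff-based variant built on the standard library's difflib;
# assert_hql_equal is a hypothetical helper, not part of the existing suite.
def assert_hql_equal(expected, actual):
    import difflib

    # Produce a unified diff so a failing test shows exactly which HQL lines differ.
    if expected != actual:
        diff = "\n".join(difflib.unified_diff(
            expected.splitlines(), actual.splitlines(),
            fromfile="expected", tofile="actual", lineterm=""
        ))
        raise AssertionError("HQL mismatch:\n" + diff)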