def test_empty_table_lakeout(self):
    tconx_src_path = \
        "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    m3d_config_file, _, tconx_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    # Use test case specific tconx
    py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

    with pytest.raises(M3DDatabaseException) as exc_info:
        M3D.create_lake_out_view(*table_config, **table_config_kwargs)

    assert "lake_out view name does not exist" == str(exc_info.value)
def load_table(self, emr_cluster_id, spark_parameters=None):
    if spark_parameters is None:
        M3D.load_table(*(self.table_config + [self.load_type, emr_cluster_id]))
    else:
        M3D.load_table(*(self.table_config + [self.load_type, emr_cluster_id, spark_parameters]))
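# Hypothetical usage sketch (not part of the test suite) of the load_table() helper above:
# one call relying on the default spark_parameters=None branch, one passing an explicit
# Spark settings JSON string. The JSON values mirror the spark_external_parameters used by
# the full-load tests further down; emr_cluster_id is assumed to come from the test fixture.
#
#     self.load_table(self.emr_cluster_id)
#     self.load_table(
#         self.emr_cluster_id,
#         '{"spark.driver.memory": "99G", "spark.executor.instances": "99"}'
#     )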
def test_empty_columns_lakeout(self):
    tconx_src_path = \
        "test/resources/test_create_out_view_hive/test_empty_columns_lakeout/config/empty_cols_cd_lakeout.json"

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    m3d_config_file, _, tconx_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    # Use test case specific tconx
    py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {
        "emr_cluster_id": self.emr_cluster_id
    }

    # Value of TABLE_LAKEOUT column in tconx file
    table_lakeout = "bi_retail_test"
    # DB for lake_out
    db_lake_out = scon_emr_dict["environments"][destination_environment]["schemas"]["lake_out"]
    db_view_lake_out = db_lake_out + "." + table_lakeout

    with pytest.raises(M3DDatabaseException) as exc_info:
        M3D.create_out_view(*table_config, **table_config_kwargs)

    err_msg = "View {} cannot be created. The view would have no columns.".format(db_view_lake_out)
    assert err_msg == str(exc_info.value)
def test_lakeout_view_hql(self, add_tags_patch):
    tconx_src_path = "test/resources/test_create_out_view_hive/test_lakeout_view_structure/config/tconx.json"

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    m3d_config_file, _, tconx_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    # Use test case specific tconx
    py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {
        "emr_cluster_id": self.emr_cluster_id
    }

    emr_steps_completer = self.create_emr_steps_completer(expected_steps_count=1, timeout_seconds=3)

    with ConcurrentExecutor(emr_steps_completer, delay_sec=0.4):
        logging.info("Calling M3D.create_out_view().")
        M3D.create_out_view(*table_config, **table_config_kwargs)

    emr_system = EMRSystem(*table_config[:5])
    s3_table = S3Table(emr_system, destination_table)

    mock_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(mock_cluster.steps)

    hive_step = mock_cluster.steps[0]

    assert hive_step.args[0] == "hive"
    assert hive_step.args[1] == "--silent"
    assert hive_step.args[2] == "-f"

    actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])

    column_name_pairs = [
        ("record_date", "v_record_date"),
        ("p_string", "v_string"),
        ("p_int", "v_int"),
        ("p_bigint", "v_bigint"),
        ("p_float", "v_float"),
        ("p_varchar_1", "v_varchar_10"),
        ("p_varchar_2", "v_varchar_100"),
        ("p_char_1", "v_char"),
        ("p_boolean", "v_boolean"),
        ("year", "year"),
        ("month", "month")
    ]
    columns_str = ", ".join(map(lambda x: "{} AS {}".format(x[0], x[1]), column_name_pairs))

    drop_view = "DROP VIEW IF EXISTS {};".format(s3_table.db_view_lake_out)

    # S3Table is partitioned by year and month
    create_view = "\n".join([
        "CREATE VIEW {}".format(s3_table.db_view_lake_out),
        "AS",
        "SELECT {}".format(columns_str),
        "FROM {};".format(s3_table.db_table_lake)
    ])

    expected_hql = "\n".join([drop_view, create_view])

    assert actual_hql_content_in_bucket == expected_hql

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert add_tags_patch_call_args_list[0][0][0] == [{
        "Key": "ApiMethod",
        "Value": "create_out_view"
    }]
    assert add_tags_patch_call_args_list[1][0][0] == [{
        "Key": "TargetView",
        "Value": "dev_lake_out.bi_test101"
    }]
def test_check_s3_cleanup(self, add_tags_patch, _):
    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    table_config_args = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

    db_lake_out = scon_emr_dict["environments"][destination_environment]["schemas"]["lake_out"]
    lake_out = "bi_test101"

    logging.info("Calling M3D.drop_out_view()")
    M3D.drop_out_view(*table_config_args, **table_config_kwargs)

    emr_backend = self.mock_emr.backends[self.default_aws_region]
    fake_cluster = emr_backend.clusters[self.emr_cluster_id]

    assert 1 == len(fake_cluster.steps)

    hive_step = fake_cluster.steps[0]

    assert hive_step.args[0] == "hive"
    assert hive_step.args[1] == "--silent"
    assert hive_step.args[2] == "-f"

    actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])
    expected_hql = "DROP VIEW IF EXISTS {}.{};".format(db_lake_out, lake_out)

    assert expected_hql == actual_hql_content_in_bucket

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert add_tags_patch_call_args_list[0][0][0] == [{
        "Key": "ApiMethod",
        "Value": "drop_out_view"
    }]
    assert add_tags_patch_call_args_list[1][0][0] == [{
        "Key": "TargetView",
        "Value": "dev_lake_out.bi_test101"
    }]
def test_check_s3_cleanup(self, add_tags_patch, _):
    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_dataset = "nest_nest_test"
    source_system = "nest"
    short_dataset_name = "nest_test"

    m3d_config_file, _, m3d_config_dict, scon_emr_dict = self.env_setup(
        self.local_run_dir,
        destination_system,
        destination_database,
        destination_environment
    )

    dataset_config_args = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_dataset
    ]

    dataset_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

    db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

    bucket_landing = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["landing"]
    bucket_lake = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["lake"]

    test_content = "sample content"

    landing_dir = "{environment}/{source_system}/{dataset}".format(
        environment=destination_environment,
        source_system=source_system,
        dataset=short_dataset_name
    )

    landing_data_dir = os.path.join(landing_dir, "data")
    landing_data_key = os.path.join(landing_data_dir, "new_landing_dump")

    lake_dir = "{environment}/{source_system}/{dataset}".format(
        environment=destination_environment,
        db_cd=db_lake,
        source_system=source_system,
        dataset=short_dataset_name
    )

    lake_data_dir = os.path.join(lake_dir, "data")
    lake_data_key = os.path.join(lake_data_dir, "new_lake_dump")

    # adding data to landing and lake directories
    self.s3_resource.Bucket(bucket_landing).put_object(Key=landing_data_key, Body=test_content)
    self.s3_resource.Bucket(bucket_lake).put_object(Key=lake_data_key, Body=test_content)

    # checking if landing and lake directories contain the uploaded files
    landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
    assert len(landing_files) == 1

    lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
    assert len(lake_files) == 1

    logging.info("Calling M3D.drop_dataset()")
    M3D.drop_dataset(*dataset_config_args, **dataset_config_kwargs)

    # checking if the files were removed
    landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
    assert len(landing_files) == 0

    lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
    assert len(lake_files) == 0

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert add_tags_patch_call_args_list[0][0][0] == [{
        "Key": "ApiMethod",
        "Value": "drop_dataset"
    }]
    assert add_tags_patch_call_args_list[1][0][0] == [{
        "Key": "TargetDataset",
        "Value": "{}.{}".format(db_lake, destination_dataset)
    }]
def test_check_s3_cleanup(self, add_tags_patch, _): logging.info("Starting s3 Checkup cleanup") destination_system = "bdp" destination_database = "emr_test" destination_environment = "dev" destination_table = "bi_test101" m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = \ self.env_setup( self.local_run_dir, destination_system, destination_database, destination_environment, destination_table ) table_config_args = [ m3d_config_file, destination_system, destination_database, destination_environment, destination_table ] table_config_kwargs = { "emr_cluster_id": self.emr_cluster_id } db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"] db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"] bucket_landing = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["landing"] bucket_lake = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["lake"] test_content = "sample content" test_lake_key_filename = "test_lake_key" test_land_key_filename = "test_land_key" source_system = "bi" table = "test101" test_land_key = "{environment}/{source_system}/{table}/data/{obj_name}".format( environment=destination_environment, source_system=source_system, table=table, obj_name=test_land_key_filename ) test_lake_key = "{environment}/{source_system}/{table}/data/{obj_name}".format( environment=destination_environment, source_system=source_system, table=table, obj_name=test_lake_key_filename ) # adding data to landing and lake directories self.s3_resource.Bucket(bucket_landing).put_object(Key=test_land_key, Body=test_content) self.s3_resource.Bucket(bucket_lake).put_object(Key=test_lake_key, Body=test_content) logging.info("Calling M3D.create_table()") M3D.create_table(*table_config_args, **table_config_kwargs) logging.info("Calling M3D.drop_table()") M3D.drop_table(*table_config_args, **table_config_kwargs) emr_backend = self.mock_emr.backends[self.default_aws_region] fake_cluster = emr_backend.clusters[self.emr_cluster_id] assert 3 == len(fake_cluster.steps) # Get actual HQL statements actual_hqls = [] for step in fake_cluster.steps: assert ["hive", "--silent", "-f"] == step.args[0:3] hql_file = step.args[3] hql_content = self.get_object_content_from_s3(hql_file) actual_hqls.append(hql_content) expected_hqls = [ 'DROP TABLE {}.{}{};'.format(db_landing, destination_table, m3d_config_dict["tags"]["table_suffix_stage"]), 'DROP TABLE {}.{};'.format(db_lake, destination_table) ] assert expected_hqls == actual_hqls[1:3] # checking landing directory landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()] assert len(landing_files) == 1 assert landing_files[0] == test_land_key # checking lake directory lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()] assert len(lake_files) == 1 assert lake_files[0] == test_lake_key add_tags_patch_call_args_list = add_tags_patch.call_args_list assert len(add_tags_patch_call_args_list) == 4 assert add_tags_patch_call_args_list[0][0][0] == [{ "Key": "ApiMethod", "Value": "create_table" }] assert add_tags_patch_call_args_list[1][0][0] == [{ "Key": "TargetTable", "Value": "dev_lake.bi_test101" }] assert add_tags_patch_call_args_list[2][0][0] == [{ "Key": "ApiMethod", "Value": "drop_table" }] assert add_tags_patch_call_args_list[3][0][0] == [{ "Key": "TargetTable", "Value": "dev_lake.bi_test101" }]
def test_check_hql(self, add_tags_patch, _):
    logging.info("Starting TestCreateTableS3.test_check_hql()")

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    m3d_config_file, _, _, _, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {
        "emr_cluster_id": self.emr_cluster_id
    }

    logging.info("Calling M3D.create_table().")
    M3D.create_table(*table_config, **table_config_kwargs)

    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(fake_cluster.steps)

    hive_step = fake_cluster.steps[0]

    assert hive_step.args[0] == "hive"
    assert hive_step.args[1] == "--silent"
    assert hive_step.args[2] == "-f"

    db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
    db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

    ddl_landing = "CREATE DATABASE IF NOT EXISTS dev_landing;\n" \
                  "CREATE DATABASE IF NOT EXISTS dev_lake;\n" \
                  "CREATE EXTERNAL TABLE dev_landing.bi_test101_stg1(name1 varchar(21), name2 varchar(6), " \
                  "name3 varchar(4))\n" \
                  "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n" \
                  "LOCATION 's3://m3d-dev-landing/dev/bi/test101/data/'\n" \
                  "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

    ddl_lake = "CREATE EXTERNAL TABLE dev_lake.bi_test101(name1 varchar(21), name2 varchar(6), " \
               "name3 varchar(4))\n" \
               "PARTITIONED BY (year smallint, month smallint)\n" \
               "STORED AS PARQUET\n" \
               "LOCATION 's3://m3d-dev-lake/dev/bi/test101/data/'\n" \
               "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

    # Get content of hql in s3 bucket
    actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])

    expected_hql = \
        ddl_landing + "\n" + \
        "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table) + "\n" + \
        ddl_lake + "\n" + \
        "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)

    logging.info("Expected: {0}\n".format(expected_hql))
    logging.info("Actual: {0}\n".format(actual_hql_content_in_bucket))

    assert actual_hql_content_in_bucket == expected_hql

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert add_tags_patch_call_args_list[0][0][0] == [{
        "Key": "ApiMethod",
        "Value": "create_table"
    }]
    assert add_tags_patch_call_args_list[1][0][0] == [{
        "Key": "TargetTable",
        "Value": "dev_lake.bi_test101"
    }]
def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
    # responses.add_passthru(self.default_server_url)

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_active_table = "bi_test101"
    destination_changelog_table = "bi_test101_cl"

    load_type = "DeltaLoad"

    src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
    src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

    spark_external_parameters = '''{
        "spark.driver.memory": "99G",
        "spark.executor.instances": "99",
        "spark.executor.memory": "90G"
    }
    '''

    # pass desired content of tconx files for active and changelog tables to self.env_setup()
    src_tconx_content = py.path.local(src_tconx_path).read()
    src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

    m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_active_table,
            src_tconx_content,
            src_tconx_cl_content
        )

    emr_system = EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment
    )

    s3_table_active = S3Table(emr_system, destination_active_table)
    s3_table_changelog = S3Table(emr_system, destination_changelog_table)

    # Extract bucket names
    bucket_application = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["application"]

    # Put lake data for changelog table, this should be archived
    self.dump_data_to_s3(
        os.path.join(s3_table_changelog.dir_lake_final, "changelog.parquet"),
        "t|e|s|t|a|d|i|d|a|s|m|3|d|",
    )

    M3D.load_table(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_active_table,
        load_type,
        self.emr_cluster_id,
        spark_params=spark_external_parameters
    )

    filename_json = "delta_load-{environment}-{table}.json".format(
        environment=destination_environment,
        table=destination_active_table
    )

    # Checking configuration file for m3d-engine
    app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

    assert len(app_files) == 1
    assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

    delta_load_config_s3 = app_files[0]
    delta_load_config_content = self.get_object_content_from_s3(delta_load_config_s3)

    load_table_parameters = json.loads(delta_load_config_content)

    assert load_table_parameters["active_records_table_lake"] == s3_table_active.db_table_lake
    assert load_table_parameters["active_records_dir_lake"] == s3_table_active.dir_lake_final
    assert load_table_parameters["delta_records_file_path"] == s3_table_active.dir_landing_data
    assert load_table_parameters["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
    assert load_table_parameters["business_key"] == s3_table_active.business_key

    if s3_table_active.partitioned_by in Util.defined_partitions:
        target_partitions = Util.get_target_partitions_list(s3_table_active.partitioned_by)
    else:
        target_partitions = s3_table_active.partitioned_by

    assert load_table_parameters["target_partitions"] == target_partitions
    assert load_table_parameters["partition_column"] == s3_table_active.partition_column
    assert load_table_parameters["partition_column_format"] == s3_table_active.partition_column_format

    # Check EMR steps.
    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(fake_cluster.steps)

    expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
        scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
        destination_environment,
        scon_emr_dict["subdir"]["m3d"],
        m3d_config_dict["subdir_projects"]["m3d_api"],
        scon_emr_dict["spark"]["jar_name"]
    )

    delta_load_step = fake_cluster.steps[0]

    assert delta_load_step.jar == "command-runner.jar"
    assert delta_load_step.args[0] == "spark-submit"

    assert delta_load_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert delta_load_step.args[-4] == expected_algorithms_jar_path
    assert delta_load_step.args[-3] == "DeltaLoad"
    assert delta_load_step.args[-2] == delta_load_config_s3
    assert delta_load_step.args[-1] == "s3"

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 1
    assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "ApiMethod", "Value": "load_table"},
        {"Key": "LoadType", "Value": "DeltaLoad"},
        {"Key": "TargetTable", "Value": "bi_test101"}
    ], key=lambda x: x["Key"])

    remove_json_patch.assert_called_once()
    assert remove_json_patch.call_args_list[0][0][0] == app_files[0]
def test_run_algorithm(self, email_patch, delete_object_patch, add_tags_patch):
    m3d_config_file, _, acon_path, _, scon_emr_dict = self.env_setup(
        self.local_run_dir,
        self.destination_system,
        self.destination_database,
        self.destination_environment
    )

    schema_lake = scon_emr_dict["environments"][self.destination_environment]["schemas"]["lake"]
    bucket_lake = scon_emr_dict["environments"][self.destination_environment]["s3_buckets"]["lake"]

    spark_options = {
        "spark.driver.memory": "5G",
        "spark.executor.memory": "20G",
        "spark.executor.instances": 10,
        "spark.executor.cores": 1,
        "spark.scheduler.mode": "FAIR"
    }

    ext_params_dict = {"environment": {"spark": spark_options}}

    algorithm_args = [
        m3d_config_file,
        self.destination_system,
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance,
        self.emr_cluster_id,
        json.dumps(ext_params_dict)
    ]

    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

    expected_step_count = 1
    timeout_seconds = 6

    emr_steps_completer = self.create_emr_steps_completer(
        expected_steps_count=expected_step_count,
        timeout_seconds=timeout_seconds
    )

    with ConcurrentExecutor(emr_steps_completer):
        M3D.run_algorithm(*algorithm_args)

    logging.info("Number of steps after execution: {}".format(len(fake_cluster.steps)))

    # Check the successful execution of algorithm
    email_patch.assert_called_once()
    call_args, _ = email_patch.call_args
    assert str(call_args[1]).startswith("Success")

    assert len(fake_cluster.steps) == expected_step_count

    spark_step = fake_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"
    assert spark_step.args[5] == "--conf"
    assert spark_step.args[7] == "--conf"
    assert spark_step.args[9] == "--conf"
    assert spark_step.args[11] == "--conf"
    assert spark_step.args[13] == "--conf"

    expected_spark_conf_options = set(map(lambda p: "{}={}".format(p[0], p[1]), spark_options.items()))
    actual_spark_conf_options = set(map(lambda x: spark_step.args[x], [6, 8, 10, 12, 14]))
    assert expected_spark_conf_options == actual_spark_conf_options

    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-3] == "NestedFlattener"
    spark_json_s3 = spark_step.args[-2]
    assert spark_step.args[-1] == "s3"

    logging.info("Checking {}".format(spark_json_s3))

    # check that we tried to delete it
    delete_object_patch.assert_called_once()
    delete_object_call_args, _ = delete_object_patch.call_args
    assert str(delete_object_call_args[0]) == spark_json_s3

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "SourceTable", "Value": "s3://m3d-dev-lake/nest/nest_test/data"},
        {"Key": "TargetTable", "Value": "dev_lake.nest_flattened"}
    ], key=lambda x: x["Key"])
    assert sorted(add_tags_patch_call_args_list[1][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "ApiMethod", "Value": "run_algorithm"},
        {"Key": "AlgorithmClass", "Value": "AlgorithmNestedFlattener"},
        {"Key": "AlgorithmInstance", "Value": "nested_flattener"}
    ], key=lambda x: x["Key"])

    # check content of config.json file
    spark_json_content = self.get_object_content_from_s3(spark_json_s3)
    spark_json_dict = json.loads(spark_json_content)

    assert spark_json_dict["source_location"] == os.path.join(
        ConfigService.Protocols.S3, bucket_lake, "nest/nest_test/data")
    assert spark_json_dict["target_table"] == schema_lake + "." + "nest_flattened"
    assert spark_json_dict["fields_to_flatten"] == [
        "user_attributes",
        "device_info",
        "events",
        "events__data",
        "events__data__device_current_state"
    ]
    assert spark_json_dict["column_mapping"] == {
        "batch_id": "batch_id",
        "environment": "environment",
        "timestamp_unixtime_ms": "event_timestamp",
        "message_type": "message_type",
        "device_info__brand": "device_brand",
        "device_info__network_country": "network_country",
        "events__event_type": "event_type",
        "events__data__screen_name": "screen_name",
        "events__data__device_current_state__total_system_memory_usage_bytes": "memory_usage_bytes"
    }
    assert spark_json_dict["chars_to_replace"] == "[.:#]+"
    assert spark_json_dict["replacement_char"] == "_"
def test_run_algorithm(self, add_tags_patch, delete_object_patch, send_email_patch):
    parameters_dict = {
        "scala_class": "CustomScalaClass",
        "key_el": "val",
        "key_list": ["x", 15],
        "key_dict": {
            "first": 1,
            "second": "2nd"
        }
    }

    acon_dict = {
        "algorithm": {
            "python_class": "AlgorithmScalaRunner",
            "parameters": parameters_dict
        }
    }

    m3d_config_file, scon_emr_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.destination_system,
            self.destination_database,
            self.destination_environment,
            self.algorithm_instance,
            acon_dict
        )

    algorithm_args = [
        m3d_config_file,
        self.cluster_mode,
        self.destination_system,
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance
    ]

    spark_options = {
        "spark.driver.memory": "5G",
        "spark.executor.memory": "35G",
        "spark.executor.instances": 12,
        "spark.executor.cores": 2,
        "spark.scheduler.mode": "FAIR"
    }

    ext_params_dict = {
        "environment": {
            "emr_cluster_id": self.emr_cluster_id,
            "spark": spark_options
        }
    }

    algorithm_kwargs = {"ext_params": json.dumps(ext_params_dict)}

    emr_steps_completer = self.create_emr_steps_completer(expected_steps_count=1, timeout_seconds=3)

    with ConcurrentExecutor(emr_steps_completer):
        M3D.run_algorithm(*algorithm_args, **algorithm_kwargs)

    # Check EMR step
    mock_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert len(mock_cluster.steps) == 1

    spark_step = mock_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"
    assert spark_step.args[5] == "--conf"
    assert spark_step.args[7] == "--conf"
    assert spark_step.args[9] == "--conf"
    assert spark_step.args[11] == "--conf"
    assert spark_step.args[13] == "--conf"

    expected_spark_conf_options = set(map(lambda p: "{}={}".format(p[0], p[1]), spark_options.items()))
    actual_spark_conf_options = set(map(lambda x: spark_step.args[x], [6, 8, 10, 12, 14]))
    assert expected_spark_conf_options == actual_spark_conf_options

    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-3] == "CustomScalaClass"
    config_json_s3 = spark_step.args[-2]
    assert spark_step.args[-1] == "s3"

    # Check config.json file content
    config_json_content = self.get_object_content_from_s3(config_json_s3)
    config_json_dict = json.loads(config_json_content)
    assert config_json_dict == parameters_dict

    # Check that config.json was removed in the end
    delete_object_patch.assert_called_once()
    delete_object_patch_call_args, _ = delete_object_patch.call_args
    assert delete_object_patch_call_args == (config_json_s3,)

    # Check the successful execution of algorithm
    send_email_patch.assert_called_once()
    send_email_patch_call_args, _ = send_email_patch.call_args
    assert str(send_email_patch_call_args[1]).startswith("Success")

    add_tags_patch.assert_called_once()
    add_tags_patch_call_args, _ = add_tags_patch.call_args
    assert sorted(add_tags_patch_call_args[0], key=lambda x: x["Key"]) == sorted([
        {"Key": "ApiMethod", "Value": "run_algorithm"},
        {"Key": "AlgorithmClass", "Value": "AlgorithmScalaRunner"},
        {"Key": "AlgorithmInstance", "Value": "scala_runner_custom"}
    ], key=lambda x: x["Key"])
def test_full_load_emr(self, _0, _1):
    tconx_src_path = \
        "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    load_type = "FullLoad"
    landing_dataset = "landing-dataset.psv"

    spark_external_parameters = '''{
        "spark.driver.memory": "99G",
        "spark.executor.instances": "99",
        "spark.executor.memory": "90G"
    }
    '''

    m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
        super(TestLoadTableFullS3, self).env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table,
        load_type,
        self.emr_cluster_id,
        spark_external_parameters
    ]

    # Extract bucket names
    bucket_application = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["application"]

    emr_system = EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment
    )
    test_s3_table = S3Table(emr_system, destination_table)

    # Put landing data
    self.dump_data_to_s3(
        os.path.join(test_s3_table.dir_landing_final, landing_dataset),
        "t|e|s|t|a|d|i|d|a|s|m|3|d|"
    )

    M3D.load_table(*table_config)

    # Since we have offloaded data move operations to EMR Steps dir_landing_final will still have
    # old files in it and dir_landing_archive will not have new files
    landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
    assert len(landing_files) == 1
    assert landing_files[0] == os.path.join(test_s3_table.dir_landing_final, landing_dataset)

    landing_archive_files = self.get_child_objects(test_s3_table.dir_landing_archive)
    assert len(landing_archive_files) == 0

    # Check EMR steps.
    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(fake_cluster.steps)

    expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
        scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
        destination_environment,
        scon_emr_dict["subdir"]["m3d"],
        m3d_config_dict["subdir_projects"]["m3d_api"],
        scon_emr_dict["spark"]["jar_name"]
    )

    # Check args of spark-submit EMR step
    spark_step = fake_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"

    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-4] == expected_algorithms_jar_path
    assert spark_step.args[-3] == "FullLoad"
    assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                  "full_load/full_load-dev-bi_test101.json"
    assert spark_step.args[-1] == "s3"
def test_full_load_emr_external_spark_parameters(self, _0):
    tconx_src_path = \
        "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"
    acon_src_path = "test/resources/test_load_table_full_s3/acon-emr_test-bi_test101.json"

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    spark_external_parameters = {
        "spark.driver.memory": "99G",
        "spark.executor.instances": "99",
        "spark.executor.memory": "90G"
    }

    load_type = "FullLoad"
    landing_dataset = "landing-dataset.psv"

    m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
        super(TestLoadTableFullS3, self).env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    AconHelper.setup_acon_from_file(
        m3d_config_dict["tags"]["config"],
        destination_database,
        destination_environment,
        destination_table,
        acon_src_path
    )

    py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table,
        load_type,
        self.emr_cluster_id
    ]

    # Extract bucket names
    bucket_application = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["application"]

    emr_system = EMRSystem(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment
    )
    test_s3_table = S3Table(emr_system, destination_table)

    # Put landing data
    self.dump_data_to_s3(
        os.path.join(test_s3_table.dir_landing_final, landing_dataset),
        "t|e|s|t|a|d|i|d|a|s|m|3|d|"
    )

    M3D.load_table(*table_config, spark_params=json.dumps(spark_external_parameters))

    # psv file will still be in landing since move operation should be
    # performed by EMR Step which we mock here. Accordingly archive will
    # still be empty.
    landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
    assert len(landing_files) == 1
    assert landing_files[0] == os.path.join(test_s3_table.dir_landing_final, landing_dataset)

    landing_archive_files = self.get_child_objects(test_s3_table.dir_landing_archive)
    assert len(landing_archive_files) == 0

    # Check EMR steps.
    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
    assert 1 == len(fake_cluster.steps)

    expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
        scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
        destination_environment,
        scon_emr_dict["subdir"]["m3d"],
        m3d_config_dict["subdir_projects"]["m3d_api"],
        scon_emr_dict["spark"]["jar_name"]
    )

    spark_step = fake_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"
    assert spark_step.args[5] == "--conf"
    assert spark_step.args[7] == "--conf"
    assert spark_step.args[9] == "--conf"

    expected_spark_conf_options = set(
        map(lambda p: "{}={}".format(p[0], p[1]), spark_external_parameters.items()))
    actual_spark_conf_options = set(map(lambda x: spark_step.args[x], [6, 8, 10]))
    assert expected_spark_conf_options == actual_spark_conf_options

    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-4] == expected_algorithms_jar_path
    assert spark_step.args[-3] == "FullLoad"
    assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                  "full_load/full_load-dev-bi_test101.json"
    assert spark_step.args[-1] == "s3"
def test_run_algorithm(self, email_patch, delete_object_patch, add_tags_patch):
    m3d_config_file, _, acon_path, _, scon_emr_dict = self.env_setup(
        self.local_run_dir,
        self.destination_system,
        self.destination_database,
        self.destination_environment
    )

    schema_lake = scon_emr_dict["environments"][self.destination_environment]["schemas"]["lake"]

    spark_options = {
        "spark.driver.memory": "5G",
        "spark.executor.memory": "20G",
        "spark.executor.instances": 10,
        "spark.executor.cores": 1,
        "spark.scheduler.mode": "FAIR"
    }

    ext_params_dict = {"environment": {"spark": spark_options}}

    algorithm_args = [
        m3d_config_file,
        self.destination_system,
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance,
        self.emr_cluster_id,
        json.dumps(ext_params_dict)
    ]

    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

    expected_step_count = 1
    timeout_seconds = 6

    emr_steps_completer = self.create_emr_steps_completer(
        expected_steps_count=expected_step_count,
        timeout_seconds=timeout_seconds
    )

    with ConcurrentExecutor(emr_steps_completer):
        M3D.run_algorithm(*algorithm_args)

    logging.info("Number of steps after execution: {}".format(len(fake_cluster.steps)))

    # Check the successful execution of algorithm
    email_patch.assert_called_once()
    call_args, _ = email_patch.call_args
    assert str(call_args[1]).startswith("Success")

    assert len(fake_cluster.steps) == expected_step_count

    spark_step = fake_cluster.steps[0]

    assert spark_step.jar == "command-runner.jar"
    assert spark_step.args[0] == "spark-submit"
    assert spark_step.args[5] == "--conf"
    assert spark_step.args[7] == "--conf"
    assert spark_step.args[9] == "--conf"
    assert spark_step.args[11] == "--conf"
    assert spark_step.args[13] == "--conf"

    expected_spark_conf_options = set(map(lambda p: "{}={}".format(p[0], p[1]), spark_options.items()))
    actual_spark_conf_options = set(map(lambda x: spark_step.args[x], [6, 8, 10, 12, 14]))
    assert expected_spark_conf_options == actual_spark_conf_options

    assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert spark_step.args[-3] == "Transpose"
    spark_json_s3 = spark_step.args[-2]
    assert spark_step.args[-1] == "s3"

    logging.info("Checking {}".format(spark_json_s3))

    # check that we tried to delete it
    delete_object_patch.assert_called_once()
    delete_object_call_args, _ = delete_object_patch.call_args
    assert str(delete_object_call_args[0]) == spark_json_s3

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "SourceTable", "Value": schema_lake + "." + "pretranspose"},
        {"Key": "TargetTable", "Value": schema_lake + "." + "transpose"}
    ], key=lambda x: x["Key"])
    assert sorted(add_tags_patch_call_args_list[1][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "ApiMethod", "Value": "run_algorithm"},
        {"Key": "AlgorithmClass", "Value": "AlgorithmTranspose"},
        {"Key": "AlgorithmInstance", "Value": "transpose"}
    ], key=lambda x: x["Key"])

    # check content of config.json file
    spark_json_content = self.get_object_content_from_s3(spark_json_s3)
    spark_json_dict = json.loads(spark_json_content)

    assert spark_json_dict["source_table"] == schema_lake + "." + "pretranspose"
    assert spark_json_dict["target_table"] == schema_lake + "." + "transpose"
    assert spark_json_dict["group_by_column"] == ["product", "articleNo", "FactoryID"]
    assert spark_json_dict["pivot_column"] == "name"
    assert spark_json_dict["aggregation_column"] == "value"
def test_run_algorithm(self):
    m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            self.destination_system,
            self.destination_database,
            self.destination_environment,
            self.destination_table
        )

    _, acon_dict = AconHelper.setup_acon_from_file(
        m3d_config_dict["tags"]["config"],
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance,
        self.test_acon
    )

    algorithm_args = [
        m3d_config_file,
        self.destination_system,
        self.destination_database,
        self.destination_environment,
        self.algorithm_instance,
    ]

    algorithm_kwargs = {
        "emr_cluster_id": self.emr_cluster_id,
        "ext_params": json.dumps({
            "environment": {
                "spark": {
                    "spark.driver.memory": "5G",
                    "spark.executor.memory": "20G",
                    "spark.executor.instances": 10,
                    "spark.executor.cores": 1,
                    "spark.scheduler.mode": "FAIR"
                }
            },
            "algorithm": {
                "destination_table": self.destination_table,
            }
        })
    }

    bucket_landing = scon_emr_dict["environments"][self.destination_environment]["s3_buckets"]["landing"]

    expected_param_dict = {
        "directory": "s3://{bucket}/dev/bi/{table}/data/".format(
            bucket=bucket_landing,
            table=self.table
        ),
        "format": "csv",
        "thread_pool_size": 8
    }

    def run_command_in_cluster_patch(cmd, name):
        # Check command name
        assert "Running Spark Application" in str(name)
        logging.info("Command is: {0}".format(cmd))
        command_components = cmd.split()

        # Check algorithm name from the spark command
        algorithm_class_name = command_components[-3]
        assert algorithm_class_name == ScalaClasses.GZIP_DECOMPRESSOR

        # Check configuration file content
        algorithm_config_file_name = command_components[-2]
        actual_config_file_content = self.get_object_content_from_s3(algorithm_config_file_name)
        logging.info("Actual config content: {0}".format(actual_config_file_content))

        algorithm_config_file_dict = json.loads(actual_config_file_content)
        assert algorithm_config_file_dict == expected_param_dict

    with patch("m3d.hadoop.emr.emr_system.EMRSystem.run_command_in_cluster",
               side_effect=run_command_in_cluster_patch):
        with patch("m3d.util.util.Util.send_email") as email_patch:
            M3D.run_algorithm(*algorithm_args, **algorithm_kwargs)

    # Check the successful execution of algorithm
    call_args, _ = email_patch.call_args
    assert str(call_args[1]).startswith("Success")
def test_check_s3_cleanup(self, add_tags_patch, _):
    cluster_mode = False

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test101"

    source_system = "bi"
    table = "test101"

    m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = self.env_setup(
        self.local_run_dir,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    )

    table_config_args = [
        m3d_config_file,
        cluster_mode,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

    db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
    db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

    bucket_landing = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["landing"]
    bucket_lake = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["lake"]

    test_content = "sample content"

    landing_dir = "{environment}/{source_system}/{table}".format(
        environment=destination_environment,
        source_system=source_system,
        table=table
    )

    landing_data_dir = os.path.join(landing_dir, "data")
    landing_archive_dir = os.path.join(landing_dir, "archive")
    landing_work_dir = os.path.join(landing_dir, "work")

    landing_data_key = os.path.join(landing_data_dir, "new_landing_dump")
    landing_archive_key = os.path.join(landing_archive_dir, "old_dump.gz")
    landing_work_key = os.path.join(landing_work_dir, "temporary_data")

    lake_dir = "{environment}/{source_system}/{table}".format(
        environment=destination_environment,
        db_cd=db_lake,
        source_system=source_system,
        table=table
    )

    lake_data_dir = os.path.join(lake_dir, "data")
    lake_data_key = os.path.join(lake_data_dir, "new_lake_dump")

    # adding data to landing and lake directories
    self.s3_resource.Bucket(bucket_landing).put_object(Key=landing_data_key, Body=test_content)
    self.s3_resource.Bucket(bucket_landing).put_object(Key=landing_archive_key, Body=test_content)
    self.s3_resource.Bucket(bucket_landing).put_object(Key=landing_work_key, Body=test_content)
    self.s3_resource.Bucket(bucket_lake).put_object(Key=lake_data_key, Body=test_content)

    logging.info("Calling M3D.truncate_table()")
    M3D.truncate_table(*table_config_args, **table_config_kwargs)

    emr_backend = self.mock_emr.backends[self.default_aws_region]
    fake_cluster = emr_backend.clusters[self.emr_cluster_id]

    assert len(fake_cluster.steps) == 2

    # Get actual HQL statements
    actual_hqls = []

    for step in fake_cluster.steps:
        assert ["hive", "--silent", "-f"] == step.args[0:3]

        hql_file = step.args[3]
        hql_content = self.get_object_content_from_s3(hql_file)
        actual_hqls.append(hql_content)

    db_table_landing = "{}.{}{}".format(
        db_landing,
        destination_table,
        m3d_config_dict["tags"]["table_suffix_stage"]
    )
    landing_table_location = os.path.join("s3://", bucket_landing, landing_data_dir, "")

    db_table_lake = "{}.{}".format(db_lake, destination_table)
    lake_table_location = os.path.join("s3://", bucket_lake, lake_data_dir, "")

    landing_hql = "ALTER TABLE {} SET LOCATION \"{}\";".format(db_table_landing, landing_table_location)
    lake_hql = "\n".join([
        "DROP TABLE {};".format(db_table_lake),
        TestTruncateTableS3Integration._get_table_ddl_lake(db_table_lake, lake_table_location),
        "MSCK REPAIR TABLE {};".format(db_table_lake)
    ])

    expected_hqls = [landing_hql, lake_hql]

    assert actual_hqls == expected_hqls

    # checking landing directory
    landing_files = [k.key for k in self.s3_resource.Bucket(bucket_landing).objects.all()]
    assert len(landing_files) == 0

    # checking lake directory
    lake_files = [k.key for k in self.s3_resource.Bucket(bucket_lake).objects.all()]
    assert len(lake_files) == 0

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 2
    assert add_tags_patch_call_args_list[0][0][0] == [{
        "Key": "ApiMethod",
        "Value": "truncate_table"
    }]
    assert add_tags_patch_call_args_list[1][0][0] == [{
        "Key": "TargetTable",
        "Value": "dev_lake.bi_test101"
    }]
def test_check_hql_single_partitioning(self, add_tags_patch, _):
    logging.info("Starting TestCreateTableS3.test_check_hql_single_partitioning()")

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_table = "bi_test103"

    m3d_config_file, _, _, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        )

    TconxHelper.setup_tconx_from_file(
        m3d_config_dict["tags"]["config"],
        destination_system,
        destination_database,
        destination_environment,
        destination_table,
        S3TableTestBase.single_partition_tconx
    )

    table_config = [
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_table
    ]

    table_config_kwargs = {"emr_cluster_id": self.emr_cluster_id}

    logging.info("Calling M3D.create_table().")
    M3D.create_table(*table_config, **table_config_kwargs)

    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

    executed_steps = fake_cluster.steps
    assert len(executed_steps) == 1

    hive_step = executed_steps[0]

    assert hive_step.args[0] == "hive"
    assert hive_step.args[1] == "--silent"
    assert hive_step.args[2] == "-f"

    db_landing = scon_emr_dict["environments"][destination_environment]["schemas"]["landing"]
    db_lake = scon_emr_dict["environments"][destination_environment]["schemas"]["lake"]

    ddl_landing = "CREATE DATABASE IF NOT EXISTS dev_landing;\n" \
                  "CREATE DATABASE IF NOT EXISTS dev_lake;\n" \
                  "CREATE EXTERNAL TABLE dev_landing.bi_test103_stg1(name1 varchar(21), name2 varchar(6), " \
                  "name3 varchar(4))\n" \
                  "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\\\' LINES TERMINATED BY '\\n'\n" \
                  "LOCATION 's3://m3d-dev-landing/dev/bi/test103/data/'\n" \
                  "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

    ddl_lake = "CREATE EXTERNAL TABLE dev_lake.bi_test103(name2 varchar(6), name3 varchar(4))\n" \
               "PARTITIONED BY (name1 varchar(21))\n" \
               "STORED AS PARQUET\n" \
               "LOCATION 's3://m3d-dev-lake/dev/bi/test103/data/'\n" \
               "TBLPROPERTIES(\"serialization.encoding\"=\"UTF-8\");"

    # Get content of hql in s3 bucket
    actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])

    expected_hql = \
        ddl_landing + "\n" + \
        "MSCK REPAIR TABLE {}.{}_stg1;".format(db_landing, destination_table) + "\n" + \
        ddl_lake + "\n" + \
        "MSCK REPAIR TABLE {}.{};".format(db_lake, destination_table)

    print("Expected: {0}\n".format(expected_hql))
    print("Actual: {0}\n".format(actual_hql_content_in_bucket))

    assert actual_hql_content_in_bucket == expected_hql