def _generate_version_file(self):
    def read_deployment_history():
        if os.path.exists(self.version_filename):
            with open(self.version_filename, 'r') as version_file:
                return version_file.read().strip()
        else:
            return None

    def update_deployment_history(current_line, history=None):
        with open(self.version_filename, 'w') as version_file:
            if history is None:
                version_file.writelines([current_line + "\n"])
            else:
                version_file.writelines([current_line + "\n", history + "\n"])

    deployment_history = read_deployment_history()
    current_time = datetime.datetime.now()

    # Branch name and commit hash of the currently checked-out revision.
    branch_name = Util.execute_subprocess("git status | grep -E 'On branch .*' | tail -c +11")
    last_commit = Util.execute_subprocess("git log -1 | grep -E 'commit .*' | tail -c +8")

    current_deployment = "{date} {branch} ({commit})".format(
        date=current_time.strftime("%Y-%m-%d %H:%M:%S"),
        branch=branch_name.strip(),
        commit=last_commit.strip()
    )

    # Prepend the current deployment to the existing history (newest entry first).
    update_deployment_history(current_deployment, deployment_history)

def test_send_mail(self, os_system_patch):
    Util.send_email(["*****@*****.**"], "hello", "hello")

    os_system_patch.assert_has_calls([
        call('echo "hello" | mailx -s "hello" [email protected]')
    ])

def _report_error(self, name):
    error_subject = "Error for " + name
    exec_tb = traceback.format_exc()
    message = "Error in executing {}. \n Stacktrace: \n {}".format(name, exec_tb)

    logging.error(error_subject)
    Util.send_email(self._execution_system.config_service.emails, error_subject, message)

def test_get_target_partitions_string(self):
    """
    This method tests the correct functionality of get_target_partitions_string of the Util class
    :return:
    """
    assert Util.get_target_partitions_string("year") == "year"
    assert Util.get_target_partitions_string("month") == "year,month"
    assert Util.get_target_partitions_string("day") == "year,month,day"
    assert Util.get_target_partitions_string("") == ""
    assert Util.get_target_partitions_string("country") == "country"

def test_get_defined_partition_columns_hive(self):
    """
    This method tests the correct functionality of get_defined_target_partitions_hive of the Util class
    :return:
    """
    assert Util.get_defined_target_partitions_hive("year") == "year smallint"
    assert Util.get_defined_target_partitions_hive("month") == "year smallint,month smallint"
    assert Util.get_defined_target_partitions_hive("day") == "year smallint,month smallint,day smallint"
    assert Util.get_defined_target_partitions_hive("") == ""

def test_get_target_partitions_list(self):
    """
    This method tests the correct functionality of get_target_partitions_list of the Util class
    :return:
    """
    assert Util.get_target_partitions_list("year") == ["year"]
    assert Util.get_target_partitions_list("month") == ["year", "month"]
    assert Util.get_target_partitions_list("day") == ["year", "month", "day"]
    assert Util.get_target_partitions_list("") == []
    assert Util.get_target_partitions_list("country") == ["country"]

def create_with_emr_cluster_id(config_path, cluster_mode, destination_database,
                               destination_environment, algorithm_instance, emr_cluster_id):
    """
    Create an algorithm configuration object from an acon file. The acon file is discovered
    based on the parameters passed to this method.

    :return: algorithm configuration object of the type on which the method is called
    """
    # Create config service to get acon file path.
    config_service = ConfigService(config_path)
    acon_path = config_service.get_acon_path(cluster_mode, destination_database,
                                             destination_environment, algorithm_instance)
    acon_dict = Util.load_dict(acon_path)

    # Inject the EMR cluster id into the environment section of the acon.
    environment = acon_dict[AlgorithmConfigurationHadoop.Sections.ENVIRONMENT]
    environment[AlgorithmConfigurationHadoop.Keys.EMR_CLUSTER_ID] = emr_cluster_id

    return AlgorithmConfigurationHadoop(algorithm_instance, acon_dict)

def __init__(self, execution_system, algorithm_instance, algorithm_params):
    """
    Initialize generic Algorithm class

    :param execution_system: an instance of execution system
    :param algorithm_instance: name of the algorithm instance
    :param algorithm_params: algorithm configuration
    """
    self._execution_system = execution_system
    self._parameters = algorithm_params.get(AlgorithmConfigurationHadoop.Keys.PARAMETERS, {})

    param_file_basename = "{system}-{database}-{environment}.{algorithm}.{time}{extension}".format(
        system=self._execution_system.source_system,
        database=self._execution_system.database,
        environment=self._execution_system.environment,
        algorithm=algorithm_instance,
        time=Util.get_formatted_utc_now(EMRSystem.DATETIME_FORMAT),
        extension=ConfigService.Extensions.JSON
    )

    # derived
    dir_apps_algorithm_instance = os.path.join(
        self._execution_system.dir_apps_algorithm,
        algorithm_instance
    )

    self._params_uri_cluster = os.path.join(dir_apps_algorithm_instance, param_file_basename)
    self._params_uri_local = os.path.join(self._execution_system.config_service.dir_exec, param_file_basename)

def _get_create_lake_statement(self, table_location):
    def create_statement(_columns, _target_partitions=None):
        return HQLGenerator.CreateParquetTableStatementBuilder(self.db_table_lake, table_location, _columns) \
            .partitioned_by(_target_partitions) \
            .with_properties({"serialization.encoding": "UTF-8"}) \
            .build(is_external=True)

    if self.partitioned_by in Util.defined_partitions:
        return create_statement(
            self.columns_lake,
            Util.get_typed_target_partitions_hive(self.partitioned_by))
    elif len(self.partitioned_by) > 0:
        matched_columns = list(filter(lambda x: x[0] == self.partitioned_by, self.columns_lake))
        if len(matched_columns) > 0:
            # When the table is partitioned by one of its columns,
            # the partition column has to be excluded from the list of regular columns.
            columns = filter(lambda x: x[0] != self.partitioned_by, self.columns_lake)
            target_partitions = [(matched_columns[0][0], matched_columns[0][1])]
            return create_statement(columns, target_partitions)
        else:
            raise Exception(
                "Partitioned field {} doesn't match any column".format(self.partitioned_by))
    else:
        return create_statement(self.columns_lake)

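# The split performed in the branch above can be illustrated with a minimal, self-contained
# sketch (the column list and partition name below are made up for illustration): the column
# used for partitioning is removed from the regular columns and returned separately as the
# single target partition.
columns_lake = [("id", "int"), ("name", "string"), ("country", "string")]
partitioned_by = "country"

regular_columns = [column for column in columns_lake if column[0] != partitioned_by]
target_partitions = [column for column in columns_lake if column[0] == partitioned_by][:1]

print(regular_columns)    # [('id', 'int'), ('name', 'string')]
print(target_partitions)  # [('country', 'string')]
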
def get_projection_columns(self, src_column_names, destination_column_names):
    columns = list(filter(lambda x: x[1], zip(src_column_names, destination_column_names)))

    if self.partitioned_by in Util.defined_partitions:
        partition_columns = list(map(lambda x: (x, x), Util.get_partition_columns_list(self.partitioned_by)))
        return columns + partition_columns
    else:
        return columns

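# Minimal illustration of the pairing above (column names are made up): zip() pairs source and
# destination names, and the filter drops pairs whose destination name is empty.
src_column_names = ["id", "name", "obsolete_col"]
destination_column_names = ["id", "full_name", ""]

print(list(filter(lambda x: x[1], zip(src_column_names, destination_column_names))))
# [('id', 'id'), ('name', 'full_name')]
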
def read_acon_params(execution_system, table_name):
    config_service = ConfigService(execution_system.config)
    acon_path = config_service.get_acon_path(execution_system.database,
                                             execution_system.environment, table_name)
    acon_dict = Util.load_dict(acon_path)
    return acon_dict.get(LoadHadoop.PARAMETERS_KEY, {})

def create_with_ext_params(config_path, cluster_mode, destination_database,
                           destination_environment, algorithm_instance, ext_params_str):
    """
    Create an algorithm configuration object from an acon file. The acon file is discovered
    based on the parameters passed to this method.

    :return: algorithm configuration object of the type on which the method is called
    """
    # Create config service to get acon file path.
    config_service = ConfigService(config_path)
    acon_path = config_service.get_acon_path(cluster_mode, destination_database,
                                             destination_environment, algorithm_instance)
    acon_dict = Util.load_dict(acon_path)

    # External parameters, passed as a JSON string, override/extend values from the acon file.
    if ext_params_str:
        ext_params_dict = json.loads(ext_params_str)
        acon_dict = Util.merge_nested_dicts(acon_dict, ext_params_dict)

    return AlgorithmConfigurationHadoop(algorithm_instance, acon_dict)

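# A minimal sketch of the override step above, assuming Util.merge_nested_dicts performs a
# recursive merge in which values from the external parameters take precedence. The acon
# content and the JSON payload are made up for illustration; merge_nested_dicts_sketch is
# not the project's implementation.
import json


def merge_nested_dicts_sketch(base, override):
    # Dicts are merged key by key; non-dict values from 'override' replace those in 'base'.
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_nested_dicts_sketch(merged[key], value)
        else:
            merged[key] = value
    return merged


acon_dict = {"environment": {"emr_cluster_id": None, "spark": {"spark.executor.memory": "10G"}}}
ext_params_str = '{"environment": {"spark": {"spark.executor.memory": "20G"}}}'

print(merge_nested_dicts_sketch(acon_dict, json.loads(ext_params_str)))
# {'environment': {'emr_cluster_id': None, 'spark': {'spark.executor.memory': '20G'}}}
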
def setup_acon_from_file(config_dir_path, destination_database, destination_environment,
                         algorithm_instance, base_acon_path):
    acon_file_path = AconHelper.get_acon_file_path(
        config_dir_path, destination_database, destination_environment, algorithm_instance)

    if not os.path.isdir(os.path.dirname(acon_file_path)):
        os.makedirs(os.path.dirname(acon_file_path))

    py.path.local(acon_file_path).write(py.path.local(base_acon_path).read())
    acon_dict = Util.load_dict(base_acon_path)

    return acon_file_path, acon_dict

def test_get_target_partitions_string(self):
    """
    This method tests the correct functionality of get_target_partitions_string of the Util class
    :return:
    """
    assert Util.get_target_partitions_string("year") == "year"
    assert Util.get_target_partitions_string("month") == "year,month"
    assert Util.get_target_partitions_string("day") == "year,month,day"
    assert Util.get_target_partitions_string("") == ""

    with pytest.raises(Exception) as exc_info:
        Util.get_target_partitions_list("country")

    assert "Partition type country not supported" in str(exc_info.value)

def execute_hive(self, hql, return_output=False):
    # Put HQL statement to a file since it can be longer than allowed length of EMR step parameter.
    datetime_str = Util.get_formatted_utc_now(EMRSystem.DATETIME_FORMAT)
    id_str = EMRSystem._generate_random_id()
    hql_filename = "{}.{}{}".format(datetime_str, id_str, ConfigService.Extensions.HQL)

    hql_path_local = os.path.join(self.dir_tmp_local, hql_filename)
    hql_path_s3 = os.path.join(self.dir_tmp_s3, hql_filename)

    with open(hql_path_local, "w") as hql_file:
        hql_file.write(hql)

    self.s3_util.upload_object(hql_path_local, hql_path_s3)

    # Create hive command line.
    hive_cmd = "hive --silent -f {}".format(hql_path_s3)

    # Add step to EMR cluster.
    step_name = "Hive EMR Step: datetime=\"{}\", id=\"{}\"".format(datetime_str, id_str)
    emr_step_id = self.emr_cluster_client.add_step(step_name, hive_cmd)
    self.emr_cluster_client.wait_for_step_completion(emr_step_id)

    if return_output:
        output_file = self.emr_cluster_client.get_step_output_path(emr_step_id)
        logging.info("Waiting for availability of output file: '{}'.".format(output_file))

        self.s3_util.wait_for_file_availability(
            output_file,
            self.emr_cluster_client.polling_interval_seconds,
            EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS
        )
        file_content = self.s3_util.read_gzip_file_content(output_file)
        return file_content

    return None

def test_oracle_view_to_hive_view(self):
    oracle_view_ddl = \
        "\n CREATE OR REPLACE FORCE EDITIONABLE VIEW \"MART_MOD\".\"TEST_VIEW\" (" + \
        "\"GENDER\", \"GROUP_ARTICLE\", \"BRAND\", \"GROUP_MODEL\", " + \
        "\"RMH_PRODUCT_DIVISION\", \"RMH_GENDER\", \"RMH_CATEGORY\", \"RMH_PRODUCT_TYPE\", " + \
        "\"BUSINESS_SEGMENT\", \"BUSINESS_UNIT\", \"COLORWAY_NAME\", \"SEASON_ACTIVE\", " + \
        "\"SEASON_CREATE\", \"SIZE_PAGE\", \"KEY_CATEGORY\", \"SUB_BRAND\", " + \
        "\"CORPORATE_MARKETING_LINE\", \"PRODUCT_DIVISION\", \"ORDER_LOCKED\", \"PRODUCT_GROUP\", " + \
        "\"PRODUCT_TYPE\", \"SPORTS_CATEGORY\", \"SOURCING_SIZE_SCALE\", \"RMH_RETAIL_CLASS\", " + \
        "\"RMH_RETAIL_DEPARTMENT\", \"RMH_RETAIL_SUB_CLASS\", \"RMH_RETAIL_SUB_DEPT\", \"RMH_RETAIL_SECTION\", " + \
        "\"AGE_GROUP\", \"ALTERNATE_ARTICLE\", \"ARTICLE_TYPE\", \"COLORWAY_LONG_DESCR\", " + \
        "\"COLORWAY_SHORT_DESCR\", \"LIFECYCLE_STATUS_DATE\", \"ORIGINAL_ARTICLE\", \"ARTICLE_DESCR\", " + \
        "\"VENDOR_ARTICLE\", \"SALES_LINE\", \"CATEGORY_MARKETING_LINE\"" + \
        ") AS \n SELECT \n" + \
        "gender,\ngroup_article,\nbrand,\ngroup_model,\n" + \
        "rmh_product_division,\nrmh_gender,\nrmh_category,\nrmh_product_type,\n" + \
        "business_segment,\nbusiness_unit,\ncolorway_name,\nseason_active,\n" + \
        "season_create,\nsize_page,\nkey_category,\nsub_brand,\n" + \
        "corporate_marketing_line,\nproduct_division,\norder_locked,\nproduct_group,\n" + \
        "product_type,\nsports_category,\nsourcing_size_scale,\nrmh_retail_class,\n" + \
        "rmh_retail_department,\nrmh_retail_sub_class,\nrmh_retail_sub_dept,\nrmh_retail_section,\n" + \
        "age_group,\nalternate_article,\narticle_type,\ncolorway_long_descr,\n" + \
        "colorway_short_descr,\nlifecycle_status_date,\noriginal_article,\narticle_descr,\n" + \
        "vendor_article,\nSALES_LINE,\n" + \
        "category_marketing_line\n" + \
        "FROM \n" + \
        "lake_out.bi_test_view"

    hive_view_ddl = Util.oracle_view_to_hive_view(oracle_view_ddl)

    expected_hive_ddl = \
        "CREATE VIEW `MART_MOD`.`TEST_VIEW` " + \
        "(" + \
        "`GENDER`, `GROUP_ARTICLE`, `BRAND`, `GROUP_MODEL`, " + \
        "`RMH_PRODUCT_DIVISION`, `RMH_GENDER`, `RMH_CATEGORY`, `RMH_PRODUCT_TYPE`, " + \
        "`BUSINESS_SEGMENT`, `BUSINESS_UNIT`, `COLORWAY_NAME`, `SEASON_ACTIVE`, " + \
        "`SEASON_CREATE`, `SIZE_PAGE`, `KEY_CATEGORY`, `SUB_BRAND`, " + \
        "`CORPORATE_MARKETING_LINE`, `PRODUCT_DIVISION`, `ORDER_LOCKED`, `PRODUCT_GROUP`, " + \
        "`PRODUCT_TYPE`, `SPORTS_CATEGORY`, `SOURCING_SIZE_SCALE`, `RMH_RETAIL_CLASS`, " + \
        "`RMH_RETAIL_DEPARTMENT`, `RMH_RETAIL_SUB_CLASS`, `RMH_RETAIL_SUB_DEPT`, `RMH_RETAIL_SECTION`, " + \
        "`AGE_GROUP`, `ALTERNATE_ARTICLE`, `ARTICLE_TYPE`, `COLORWAY_LONG_DESCR`, " + \
        "`COLORWAY_SHORT_DESCR`, `LIFECYCLE_STATUS_DATE`, `ORIGINAL_ARTICLE`, `ARTICLE_DESCR`, " + \
        "`VENDOR_ARTICLE`, `SALES_LINE`, `CATEGORY_MARKETING_LINE`" + \
        ") " + \
        "AS SELECT " + \
        "gender, group_article, brand, group_model, " + \
        "rmh_product_division, rmh_gender, rmh_category, rmh_product_type, " + \
        "business_segment, business_unit, colorway_name, season_active, " + \
        "season_create, size_page, key_category, sub_brand, " + \
        "corporate_marketing_line, product_division, order_locked, product_group, " + \
        "product_type, sports_category, sourcing_size_scale, rmh_retail_class, " + \
        "rmh_retail_department, rmh_retail_sub_class, rmh_retail_sub_dept, rmh_retail_section, " + \
        "age_group, alternate_article, article_type, colorway_long_descr, " + \
        "colorway_short_descr, lifecycle_status_date, original_article, article_descr, " + \
        "vendor_article, SALES_LINE, category_marketing_line " + \
        "FROM " + \
        "lake_out.bi_test_view"

    assert hive_view_ddl == expected_hive_ddl

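# The transformation checked above can be summarised by a naive sketch; this is not the
# project's Util.oracle_view_to_hive_view implementation, only an illustration of the rules
# visible in the fixture: drop the Oracle-specific "OR REPLACE FORCE EDITIONABLE" clause,
# turn double-quoted identifiers into backticked ones, and collapse the statement onto a
# single line.
def naive_oracle_view_to_hive_view(oracle_ddl):
    hive_ddl = oracle_ddl.replace("OR REPLACE FORCE EDITIONABLE ", "")
    hive_ddl = hive_ddl.replace('"', "`")
    return " ".join(hive_ddl.split())


print(naive_oracle_view_to_hive_view(
    '\n CREATE OR REPLACE FORCE EDITIONABLE VIEW "M"."V" ("A") AS \n SELECT \na\nFROM \nx.y'
))
# CREATE VIEW `M`.`V` (`A`) AS SELECT a FROM x.y
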
def _report_success(self, name):
    success_subject = "Success for " + name

    logging.info(success_subject)
    Util.send_email(self._execution_system.config_service.emails, success_subject, success_subject)

def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
    # responses.add_passthru(self.default_server_url)

    destination_system = "bdp"
    destination_database = "emr_test"
    destination_environment = "dev"
    destination_active_table = "bi_test101"
    destination_changelog_table = "bi_test101_cl"

    load_type = "DeltaLoad"

    src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
    src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

    spark_external_parameters = '''{
        "spark.driver.memory": "99G",
        "spark.executor.instances": "99",
        "spark.executor.memory": "90G"
    }
    '''

    # Pass desired content of tconx files for active and changelog tables to self.env_setup().
    src_tconx_content = py.path.local(src_tconx_path).read()
    src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

    m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, m3d_config_dict, scon_emr_dict = \
        self.env_setup(
            self.local_run_dir,
            destination_system,
            destination_database,
            destination_environment,
            destination_active_table,
            src_tconx_content,
            src_tconx_cl_content
        )

    emr_system = EMRSystem(m3d_config_file, destination_system, destination_database, destination_environment)
    s3_table_active = S3Table(emr_system, destination_active_table)
    s3_table_changelog = S3Table(emr_system, destination_changelog_table)

    # Extract bucket names
    bucket_application = scon_emr_dict["environments"][destination_environment]["s3_buckets"]["application"]

    # Put lake data for changelog table, this should be archived
    self.dump_data_to_s3(
        os.path.join(s3_table_changelog.dir_lake_final, "changelog.parquet"),
        "t|e|s|t|a|d|i|d|a|s|m|3|d|",
    )

    M3D.load_table(
        m3d_config_file,
        destination_system,
        destination_database,
        destination_environment,
        destination_active_table,
        load_type,
        self.emr_cluster_id,
        spark_params=spark_external_parameters
    )

    filename_json = "delta_load-{environment}-{table}.json".format(
        environment=destination_environment,
        table=destination_active_table
    )

    # Checking configuration file for m3d-engine
    app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

    assert len(app_files) == 1
    assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

    delta_load_config_s3 = app_files[0]
    delta_load_config_content = self.get_object_content_from_s3(delta_load_config_s3)

    load_table_parameters = json.loads(delta_load_config_content)

    assert load_table_parameters["active_records_table_lake"] == s3_table_active.db_table_lake
    assert load_table_parameters["active_records_dir_lake"] == s3_table_active.dir_lake_final
    assert load_table_parameters["delta_records_file_path"] == s3_table_active.dir_landing_data
    assert load_table_parameters["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
    assert load_table_parameters["business_key"] == s3_table_active.business_key

    if s3_table_active.partitioned_by in Util.defined_partitions:
        target_partitions = Util.get_target_partitions_list(s3_table_active.partitioned_by)
    else:
        target_partitions = s3_table_active.partitioned_by

    assert load_table_parameters["target_partitions"] == target_partitions
    assert load_table_parameters["partition_column"] == s3_table_active.partition_column
    assert load_table_parameters["partition_column_format"] == s3_table_active.partition_column_format

    # Check EMR steps.
    fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

    assert 1 == len(fake_cluster.steps)

    expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
        scon_emr_dict["environments"][destination_environment]["s3_deployment_dir_base"],
        destination_environment,
        scon_emr_dict["subdir"]["m3d"],
        m3d_config_dict["subdir_projects"]["m3d_api"],
        scon_emr_dict["spark"]["jar_name"]
    )

    delta_load_step = fake_cluster.steps[0]

    assert delta_load_step.jar == "command-runner.jar"
    assert delta_load_step.args[0] == "spark-submit"

    assert delta_load_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
    assert delta_load_step.args[-4] == expected_algorithms_jar_path
    assert delta_load_step.args[-3] == "DeltaLoad"
    assert delta_load_step.args[-2] == delta_load_config_s3
    assert delta_load_step.args[-1] == "s3"

    add_tags_patch_call_args_list = add_tags_patch.call_args_list
    assert len(add_tags_patch_call_args_list) == 1
    assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted([
        {"Key": "ApiMethod", "Value": "load_table"},
        {"Key": "LoadType", "Value": "DeltaLoad"},
        {"Key": "TargetTable", "Value": "bi_test101"}
    ], key=lambda x: x["Key"])

    remove_json_patch.assert_called_once()
    assert remove_json_patch.call_args_list[0][0][0] == app_files[0]

def setup_oracle_scon(config_dir, source_system, db_cd, base_scon_path, database_type=None):
    # Making sure that we can accept both strings as well as py.path.local objects.
    config_dir = py.path.local(str(config_dir))

    config_system_dir = config_dir.join("system")
    config_credentials_dir = config_dir.join("credentials")

    if not config_system_dir.check():
        config_system_dir.mkdir()

    if not config_credentials_dir.check():
        config_credentials_dir.mkdir()

    oracle_docker_ip = os.getenv("ORACLE_DOCKER_IP", "")

    credentials_data = {
        "oracle_conn_string": {
            "lake": "LAKE/test_lake_password@%s:1521/XE" % oracle_docker_ip,
            "lake_out": "LAKE_OUT/test_lake_out_password@%s:1521/XE" % oracle_docker_ip,
            "m3d": "M3D/test_m3d_password@%s:1521/XE" % oracle_docker_ip,
            "mart_mod": "MART_MOD/test_mart_mod_password@%s:1521/XE" % oracle_docker_ip,
            "mart_cal": "MART_CAL/test_mart_cal_password@%s:1521/XE" % oracle_docker_ip,
            "mart_out": "MART_OUT/test_mart_out_password@%s:1521/XE" % oracle_docker_ip,
            "test_lake": "TEST_LAKE/test_lake_password@%s:1521/XE" % oracle_docker_ip,
            "test_lake_out": "TEST_LAKE_OUT/test_lake_out_password@%s:1521/XE" % oracle_docker_ip,
            "test_mart_mod": "TEST_MART_MOD/test_mart_mod_password@%s:1521/XE" % oracle_docker_ip,
            "test_mart_cal": "TEST_MART_CAL/test_mart_cal_password@%s:1521/XE" % oracle_docker_ip,
            "test_mart_out": "TEST_MART_OUT/test_mart_out_password@%s:1521/XE" % oracle_docker_ip,
            "dev_mart_mod": "DEV_MART_MOD/dev_mart_mod_password@%s:1521/XE" % oracle_docker_ip,
            "dev_mart_cal": "DEV_MART_CAL/dev_mart_cal_password@%s:1521/XE" % oracle_docker_ip,
            "dve_mart_out": "DEV_MART_OUT/dev_mart_out_password@%s:1521/XE" % oracle_docker_ip
        }
    }

    credentials_filename = "credentials-{}-{}.json".format(source_system, db_cd)
    credentials_file = config_credentials_dir.join(credentials_filename)
    credentials_file.write(json.dumps(credentials_data, indent=4))

    scon_dict = Util.load_dict(base_scon_path)
    scon_dict["credentials"] = str(credentials_file)

    if database_type:
        scon_dict["database_type"] = database_type

    scon_filename = "scon-{}-{}.json".format(source_system, db_cd)
    scon_file = config_system_dir.join(scon_filename)
    scon_file.write(json.dumps(scon_dict, indent=4))

    return str(scon_file), scon_dict