Example #1
    def delete_emr_cluster(config, destination_system, destination_database,
                           destination_environment, emr_cluster_id):
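        # build an EMRSystem wrapper for the target environment, then delete the given cluster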
        from m3d.hadoop.emr.emr_system import EMRSystem
        emr = EMRSystem(config, destination_system, destination_database,
                        destination_environment)

        emr.delete_emr_cluster(emr_cluster_id)
Example #2
    def drop_table(config,
                   destination_system,
                   destination_database,
                   destination_environment,
                   destination_table,
                   emr_cluster_id=None):
        # create an abstract table object to retrieve the destination system technology
        abstract_table = Table(config, destination_system,
                               destination_database, destination_environment,
                               destination_table)
        destination_system_technology = abstract_table.get_destination_technology()

        # hadoop
        if destination_system_technology == DataSystem.SystemTechnology.HIVE:
            if abstract_table.storage_type == DataSystem.StorageType.S3:
                from m3d.hadoop.emr.emr_system import EMRSystem
                emr_system = EMRSystem(config, destination_system,
                                       destination_database,
                                       destination_environment, emr_cluster_id)
                emr_system.add_cluster_tag(EMRSystem.EMRClusterTag.API_METHOD,
                                           M3D.drop_table.__name__)
                emr_system.drop_table(destination_table)
            else:
                raise m3d_exceptions.M3DUnsupportedStorageException(
                    abstract_table.storage_type)
        else:
            raise m3d_exceptions.M3DUnsupportedDestinationSystemException(
                destination_system_technology)
Example #3
    def create(
            config_path,
            cluster_mode,
            destination_system,
            destination_database,
            destination_environment,
            algorithm_instance,
            ext_params_str
    ):
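        # determine the destination's database type from the configuration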
        data_system = DataSystem(
            config_path,
            cluster_mode,
            destination_system,
            destination_database,
            destination_environment
        )
        if data_system.database_type == DataSystem.DatabaseType.EMR:
            config = AlgorithmConfigurationHadoop.create_with_ext_params(
                config_path,
                cluster_mode,
                destination_database,
                destination_environment,
                algorithm_instance,
                ext_params_str
            )

            execution_system = EMRSystem.from_data_system(data_system, config.get_emr_cluster_id())
            return AlgorithmExecutorHadoop(execution_system, config)
        else:
            raise M3DUnsupportedDatabaseTypeException(data_system.database_type)
Example #4
 def add_emr_cluster_tags(config, destination_system, destination_database,
                          destination_environment, emr_cluster_id,
                          cluster_tags):
     from m3d.hadoop.emr.emr_system import EMRSystem
     emr_system = EMRSystem(config, destination_system,
                            destination_database, destination_environment,
                            emr_cluster_id)
     emr_system.add_cluster_tags(cluster_tags)
Example #5
    def test_add_emr_cluster_tags(self):
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

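        # grab the mocked EMR cluster backend so its tags can be inspected after the call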
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
        m3d_config_file, _, _, _, _ = self.env_setup(self.local_run_dir,
                                                     destination_system,
                                                     destination_database,
                                                     destination_environment,
                                                     destination_table)

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment,
                               self.emr_cluster_id)

        emr_system.add_cluster_tag("DataFormat", "csv")

        assert fake_cluster.tags == {"DataFormat": "csv"}
Example #6
    def _create_emr_system(self):
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "prod"

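        # set up a minimal test environment and return an EMRSystem bound to it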
        m3d_config_file, _, _, _ = self.env_setup(self.local_run_dir,
                                                  destination_system,
                                                  destination_database,
                                                  destination_environment)
        return EMRSystem(m3d_config_file, destination_system,
                         destination_database, destination_environment,
                         self.emr_cluster_id)
Example #7
    def test_add_emr_cluster_tags_multiple_calls(self):
        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, _, _, _ = self.env_setup(self.local_run_dir,
                                                     destination_system,
                                                     destination_database,
                                                     destination_environment,
                                                     destination_table)
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment,
                               self.emr_cluster_id)

        tags1 = {"DataFormat": "csv"}
        emr_system.add_cluster_tag("DataFormat", tags1["DataFormat"])

        tags2 = {"Database": "test_lake", "Table": destination_table}
        emr_system.add_cluster_tags(tags2)

        all_tags = tags1.copy()
        all_tags.update(tags2)
        assert fake_cluster.tags == all_tags
Example #8
    def drop_dataset(config,
                     destination_system,
                     destination_database,
                     destination_environment,
                     destination_dataset,
                     emr_cluster_id=None):
        from m3d.hadoop.emr.emr_system import EMRSystem
        emr_system = EMRSystem(config, destination_system,
                               destination_database, destination_environment,
                               emr_cluster_id)

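        # tag the cluster with the calling API method, then drop the dataset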
        emr_system.add_cluster_tag(EMRSystem.EMRClusterTag.API_METHOD,
                                   M3D.drop_dataset.__name__)
        emr_system.drop_dataset(destination_dataset)
Example #9
 def create(config_path, cluster_mode, destination_system,
            destination_database, destination_environment,
            destination_table, load_type, emr_cluster_id, spark_params_str):
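     # resolve the destination's database type and build the matching load executor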
     data_system = DataSystem(config_path, cluster_mode, destination_system,
                              destination_database, destination_environment)
     if data_system.database_type == DataSystem.DatabaseType.EMR:
         execution_system = EMRSystem.from_data_system(
             data_system, emr_cluster_id)
         spark_params_dict = json.loads(spark_params_str)
         return LoadExecutorHadoop(execution_system, load_type,
                                   destination_table, spark_params_dict)
     else:
         raise M3DUnsupportedDatabaseTypeException(
             data_system.database_type)
Example #10
    def __init__(self,
                 test_run_dir,
                 setup_function,
                 partition_columns,
                 regex_filename,
                 file_format=None,
                 null_value=None,
                 quote_character=None,
                 compute_table_statistics=None):
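        # run the shared environment setup and keep the generated config and tconx artifacts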
        self.config_file, _, self.tconx_file, self.config_dict, self.scon_emr_dict = setup_function(
            *([test_run_dir] + self.destination_params))

        self._write_acon(partition_columns, regex_filename, file_format,
                         null_value, quote_character, compute_table_statistics)
        self._write_tconx()

        self.table_config = [self.config_file, self.cluster_mode] + self.destination_params
        emr_system = EMRSystem(self.config_file, self.cluster_mode,
                               self.destination_system,
                               self.destination_database,
                               self.destination_environment)
        self.s3_table = S3Table(emr_system, self.destination_table)

        config_filename = "append_load-{}-{}.json".format(
            self.destination_environment, self.destination_table)
        self.config_filepath = os.path.join(self.s3_table.dir_apps_append_load,
                                            config_filename)
        self.db_name_lake = self.scon_emr_dict["environments"][
            self.destination_environment]["schemas"]["lake"]

        self.expected_algorithms_jar_path = "s3://" + os.path.join(
            (self.scon_emr_dict["environments"][self.destination_environment]
             ["s3_buckets"]["application"]).strip("/"),
            (self.scon_emr_dict["environments"][self.destination_environment]
             ["s3_deployment_dir_base"]).strip("/"),
            self.destination_environment, self.scon_emr_dict["subdir"]["m3d"],
            self.config_dict["subdir_projects"]["m3d_api"],
            self.scon_emr_dict["spark"]["jar_name"])
Example #11
    def test_parses_basic_attributes_from_system_config_file(self, _):
        """
        Test case checks that all relevant key-value pairs are extracted from the scon file and assigned to the
        correct member variables of the EMRSystem object.
        """
        aws_api_credentials = AWSCredentials("fake_aws_api_access_key",
                                             "fake_aws_api_secret_key")
        aws_api_credentials_file = self.local_run_dir.join(
            "aws-credentials-emr-api.json")
        self.dump_aws_credentials(aws_api_credentials,
                                  str(aws_api_credentials_file))

        aws_s3_put_credentials = AWSCredentials("fake_aws_s3_put_access_key",
                                                "fake_aws_s3_put_secret_key")
        aws_s3_put_credentials_file = self.local_run_dir.join(
            "aws-credentials-emr-s3_put.json")
        self.dump_aws_credentials(aws_s3_put_credentials,
                                  str(aws_s3_put_credentials_file))

        aws_s3_del_credentials = AWSCredentials("fake_aws_s3_del_access_key",
                                                "fake_aws_s3_del_secret_key")
        aws_s3_del_credentials_file = self.local_run_dir.join(
            "aws-credentials-emr-s3_del.json")
        self.dump_aws_credentials(aws_s3_del_credentials,
                                  str(aws_s3_del_credentials_file))

        test_scon_json = TestEMRSystem.test_scon_json_template.format(
            aws_api_credentials=str(aws_api_credentials_file),
            aws_s3_put_credentials=str(aws_s3_put_credentials_file),
            aws_s3_del_credentials=str(aws_s3_del_credentials_file))

        s3_scon_file = self.local_run_dir.join("scon-emr-emr-test.json")
        s3_scon_file.write(test_scon_json)

        MockConfigService.scon_path = str(s3_scon_file)

        emr_system = EMRSystem(*self.test_emr_system_arguments)

        expected_system_params = {
            "bucket_landing": "m3d-da-bdp-test-landing",
            "bucket_lake": "m3d-da-bdp-test-lake",
            "bucket_mart_cal": "m3d-da-bdp-test-mart-cal",
            "bucket_log": "io.3stripes.factory.test.ireland.infrastructure-logs",
            "default_ebs_size": "128",
            "default_emr_version": "emr-5.17.0",
            "aws_api_credentials": aws_api_credentials,
            "aws_s3_put_credentials": aws_s3_put_credentials,
            "aws_s3_del_credentials": aws_s3_del_credentials,
            "api_action_timeout_seconds": 120,
            "api_action_polling_interval_seconds": 3,
            "api_long_timeout_seconds": 300,
            "aws_region": "eu-west-1",
            "packages_to_deploy": ["hadoop"],
            "configs_to_deploy": ["test_config_1", "test_config_2"],
            "subdir_archive": "test_archive/",
            "subdir_header": "test_header/",
            "subdir_config": "test_config/",
            "subdir_data": "test_data/",
            "subdir_delta_table": "delta_table/",
            "subdir_data_backup": "data_backup/",
            "subdir_error": "test_error/",
            "subdir_work": "test_work/",
            "subdir_log": "test_log/",
            "subdir_apps": "test_apps/",
            "subdir_m3d_engine": "test_m3d_engine/",
            "subdir_loading": "test_loading/",
            "subdir_full_load": "test_full_load/",
            "subdir_delta_load": "test_delta_load/",
            "subdir_delta_lake_load": "test_delta_lake_load/",
            "subdir_append_load": "test_append_load/",
            "subdir_black_whole": "test_black_whole/",
            "subdir_credentials": "test_credentials/",
            "subdir_keytab": "test_keytab/",
            "subdir_tmp": "test_tmp/",
            "subdir_code": "m3d",
            "subdir_metadata": "metadata",
            "spark_jar_name": "test_jar.jar",
            "dir_apps": "s3://m3d-da-landing-application/m3d-test/test_environment/test_apps/",
            "dir_apps_algorithm": "s3://m3d-da-landing-application/m3d-test/test_environment/test_apps/test_m3d_engine/",
            "dir_apps_loading": "s3://m3d-da-landing-application/m3d-test/test_environment/test_apps/test_loading/",
            "dir_tmp_s3": "s3://m3d-da-landing-application/m3d-test/test_environment/test_tmp/",
            "dir_tmp_local": "/test_tmp/",
            "spark_jar_path": "s3://m3d-da-landing-application/m3d-test/test_environment/m3d/test_subdir_projects_m3d_api/test_jar.jar",
            "dir_m3d_api_deployment": "s3://m3d-da-landing-application/m3d-test/test_environment/m3d/test_subdir_projects_m3d_api",
            "dir_metadata_deployment": "s3://m3d-da-landing-application/m3d-test/test_environment/metadata/test_subdir_projects_m3d_api"
        }

        for param, expected_value in expected_system_params.items():
            assert getattr(emr_system, param) == expected_value
Example #12
    def test_lakeout_view_hql(self, add_tags_patch):
        tconx_src_path = "test/resources/test_create_out_view_hive/test_lakeout_view_structure/config/tconx.json"

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        m3d_config_file, _, tconx_file, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        # Use test case specific tconx
        py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

        table_config = [
            m3d_config_file,
            destination_system,
            destination_database,
            destination_environment,
            destination_table
        ]

        table_config_kwargs = {
            "emr_cluster_id": self.emr_cluster_id
        }

        emr_steps_completer = self.create_emr_steps_completer(expected_steps_count=1, timeout_seconds=3)

        with ConcurrentExecutor(emr_steps_completer, delay_sec=0.4):
            logging.info("Calling M3D.create_out_view().")
            M3D.create_out_view(*table_config, **table_config_kwargs)

        emr_system = EMRSystem(*table_config[:5])
        s3_table = S3Table(emr_system, destination_table)

        mock_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]
        assert 1 == len(mock_cluster.steps)

        hive_step = mock_cluster.steps[0]

        assert hive_step.args[0] == "hive"
        assert hive_step.args[1] == "--silent"
        assert hive_step.args[2] == "-f"

        actual_hql_content_in_bucket = self.get_object_content_from_s3(hive_step.args[3])

        column_name_pairs = [
            ("record_date", "v_record_date"),
            ("p_string", "v_string"),
            ("p_int", "v_int"),
            ("p_bigint", "v_bigint"),
            ("p_float", "v_float"),
            ("p_varchar_1", "v_varchar_10"),
            ("p_varchar_2", "v_varchar_100"),
            ("p_char_1", "v_char"),
            ("p_boolean", "v_boolean"),
            ("year", "year"),
            ("month", "month")
        ]
        columns_str = ", ".join(map(lambda x: "{} AS {}".format(x[0], x[1]), column_name_pairs))

        drop_view = "DROP VIEW IF EXISTS {};".format(s3_table.db_view_lake_out)

        # S3Table is partitioned by year and month
        create_view = "\n".join([
            "CREATE VIEW {}".format(s3_table.db_view_lake_out),
            "AS",
            "SELECT {}".format(columns_str),
            "FROM {};".format(s3_table.db_table_lake)
        ])

        expected_hql = "\n".join([drop_view, create_view])

        assert actual_hql_content_in_bucket == expected_hql

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 2
        assert add_tags_patch_call_args_list[0][0][0] == [{
            "Key": "ApiMethod",
            "Value": "create_out_view"
        }]
        assert add_tags_patch_call_args_list[1][0][0] == [{
            "Key": "TargetView",
            "Value": "dev_lake_out.bi_test101"
        }]
Example #13
    def test_full_load_emr(self, _0, _1):

        tconx_src_path = \
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        load_type = "FullLoad"
        landing_dataset = "landing-dataset.psv"

        spark_external_parameters = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
            super(TestLoadTableFullS3, self).env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )

        py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

        table_config = [
            m3d_config_file, destination_system, destination_database,
            destination_environment, destination_table, load_type,
            self.emr_cluster_id, spark_external_parameters
        ]

        # Extract the application bucket name
        bucket_application = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["application"]
        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        test_s3_table = S3Table(emr_system, destination_table)

        # Put landing data
        self.dump_data_to_s3(
            os.path.join(test_s3_table.dir_landing_final, landing_dataset),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|")

        M3D.load_table(*table_config)

        # Since data move operations are offloaded to EMR Steps, dir_landing_final will still have
        # old files in it and dir_landing_archive will not have new files yet
        landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
        assert len(landing_files) == 1
        assert landing_files[0] == os.path.join(
            test_s3_table.dir_landing_final, landing_dataset)

        landing_archive_files = self.get_child_objects(
            test_s3_table.dir_landing_archive)
        assert len(landing_archive_files) == 0

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]
            ["s3_deployment_dir_base"], destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])

        # Check args of spark-submit EMR step
        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == expected_algorithms_jar_path
        assert spark_step.args[-3] == "FullLoad"
        assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                      "full_load/full_load-dev-bi_test101.json"
        assert spark_step.args[-1] == "s3"
Example #14
    def test_full_load_emr_external_spark_parameters(self, _0):

        tconx_src_path = \
            "test/resources/test_create_out_view_hive/test_empty_table_lakeout/config/empty_tabl_cd_lakeout.json"
        acon_src_path = "test/resources/test_load_table_full_s3/acon-emr_test-bi_test101.json"

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_table = "bi_test101"

        spark_external_parameters = {
            "spark.driver.memory": "99G",
            "spark.executor.instances": "99",
            "spark.executor.memory": "90G"
        }

        load_type = "FullLoad"
        landing_dataset = "landing-dataset.psv"

        m3d_config_file, scon_emr_file, tconx_file, m3d_config_dict, scon_emr_dict = \
            super(TestLoadTableFullS3, self).env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_table
            )
        AconHelper.setup_acon_from_file(m3d_config_dict["tags"]["config"],
                                        destination_database,
                                        destination_environment,
                                        destination_table, acon_src_path)

        py.path.local(tconx_file).write(py.path.local(tconx_src_path).read())

        table_config = [
            m3d_config_file, destination_system, destination_database,
            destination_environment, destination_table, load_type,
            self.emr_cluster_id
        ]

        # Extract the application bucket name
        bucket_application = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["application"]

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        test_s3_table = S3Table(emr_system, destination_table)

        # Put landing data
        self.dump_data_to_s3(
            os.path.join(test_s3_table.dir_landing_final, landing_dataset),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|")

        M3D.load_table(*table_config,
                       spark_params=json.dumps(spark_external_parameters))

        # The psv file will still be in landing since the move operation is performed by an EMR Step,
        # which is mocked here. Accordingly, the archive will still be empty.
        landing_files = self.get_child_objects(test_s3_table.dir_landing_final)
        assert len(landing_files) == 1
        assert landing_files[0] == os.path.join(
            test_s3_table.dir_landing_final, landing_dataset)

        landing_archive_files = self.get_child_objects(
            test_s3_table.dir_landing_archive)
        assert len(landing_archive_files) == 0

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]
            ["s3_deployment_dir_base"], destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])

        spark_step = fake_cluster.steps[0]

        assert spark_step.jar == "command-runner.jar"
        assert spark_step.args[0] == "spark-submit"
        assert spark_step.args[5] == "--conf"
        assert spark_step.args[7] == "--conf"
        assert spark_step.args[9] == "--conf"

        expected_spark_conf_options = set(
            map(lambda p: "{}={}".format(p[0], p[1]),
                spark_external_parameters.items()))
        actual_spark_conf_options = set(
            map(lambda x: spark_step.args[x], [6, 8, 10]))
        assert expected_spark_conf_options == actual_spark_conf_options

        assert spark_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert spark_step.args[-4] == expected_algorithms_jar_path
        assert spark_step.args[-3] == "FullLoad"
        assert spark_step.args[-2] == "s3://m3d-dev-application/m3d/dev/apps/loading/bdp/test101/" \
                                      "full_load/full_load-dev-bi_test101.json"
        assert spark_step.args[-1] == "s3"
Example #15
    def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
        # responses.add_passthru(self.default_server_url)

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_active_table = "bi_test101"
        destination_changelog_table = "bi_test101_cl"

        load_type = "DeltaLoad"

        src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
        src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

        spark_external_parameters = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        # pass desired content of tconx files for active and changelog tables to self.env_setup()
        src_tconx_content = py.path.local(src_tconx_path).read()
        src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

        m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_active_table,
                src_tconx_content,
                src_tconx_cl_content
            )

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        s3_table_active = S3Table(emr_system, destination_active_table)
        s3_table_changelog = S3Table(emr_system, destination_changelog_table)

        # Extract the application bucket name
        bucket_application = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["application"]

        # Put lake data for the changelog table; this should be archived
        self.dump_data_to_s3(
            os.path.join(s3_table_changelog.dir_lake_final,
                         "changelog.parquet"),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|",
        )

        M3D.load_table(m3d_config_file,
                       destination_system,
                       destination_database,
                       destination_environment,
                       destination_active_table,
                       load_type,
                       self.emr_cluster_id,
                       spark_params=spark_external_parameters)

        filename_json = "delta_load-{environment}-{table}.json".format(
            environment=destination_environment,
            table=destination_active_table)

        # Checking configuration file for m3d-engine
        app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

        assert len(app_files) == 1

        assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

        delta_load_config_s3 = app_files[0]
        delta_load_config_content = self.get_object_content_from_s3(
            delta_load_config_s3)

        load_table_parameters = json.loads(delta_load_config_content)

        assert load_table_parameters["active_records_table_lake"] == s3_table_active.db_table_lake
        assert load_table_parameters["active_records_dir_lake"] == s3_table_active.dir_lake_final
        assert load_table_parameters["delta_records_file_path"] == s3_table_active.dir_landing_data
        assert load_table_parameters["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
        assert load_table_parameters["business_key"] == s3_table_active.business_key

        if s3_table_active.partitioned_by in Util.defined_partitions:
            target_partitions = Util.get_target_partitions_list(
                s3_table_active.partitioned_by)
        else:
            target_partitions = s3_table_active.partitioned_by

        assert load_table_parameters["target_partitions"] == target_partitions
        assert load_table_parameters["partition_column"] == s3_table_active.partition_column
        assert load_table_parameters["partition_column_format"] == s3_table_active.partition_column_format

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]
            ["s3_deployment_dir_base"], destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])

        delta_load_step = fake_cluster.steps[0]

        assert delta_load_step.jar == "command-runner.jar"
        assert delta_load_step.args[0] == "spark-submit"

        assert delta_load_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert delta_load_step.args[-4] == expected_algorithms_jar_path
        assert delta_load_step.args[-3] == "DeltaLoad"
        assert delta_load_step.args[-2] == delta_load_config_s3
        assert delta_load_step.args[-1] == "s3"

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 1
        assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted([
            {"Key": "ApiMethod", "Value": "load_table"},
            {"Key": "LoadType", "Value": "DeltaLoad"},
            {"Key": "TargetTable", "Value": "bi_test101"}
        ], key=lambda x: x["Key"])

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]
Example #16
    def __init__(self,
                 test_run_dir,
                 setup_function,
                 target_partitions,
                 regex_filename,
                 file_format=None,
                 null_value=None,
                 quote_character=None,
                 compute_table_statistics=None,
                 schema=None,
                 verify_schema=None,
                 data_type=None,
                 reader_mode=None,
                 metadata_update_strategy=None):

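        # bundle the destination identifiers expected by the setup function and the helpers below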
        destination_params = [
            self.destination_system, self.destination_database,
            self.destination_environment, self.destination_table
        ]
        self.config_file, _, self.tconx_file, self.config_dict, self.scon_emr_dict = setup_function(
            *([test_run_dir] + destination_params))

        self._write_acon(target_partitions,
                         regex_filename,
                         file_format=file_format,
                         null_value=null_value,
                         quote_character=quote_character,
                         compute_table_statistics=compute_table_statistics,
                         schema=schema,
                         verify_schema=verify_schema,
                         data_type=data_type,
                         reader_mode=reader_mode,
                         metadata_update_strategy=metadata_update_strategy)
        self._write_tconx()

        self.table_config = [self.config_file] + destination_params
        emr_system = EMRSystem(self.config_file, self.destination_system,
                               self.destination_database,
                               self.destination_environment)

        # self.s3_table = S3Table(emr_system, self.destination_table)
        if data_type is None:
            data_type = DataType.STRUCTURED

        self.dataset = DataSetFactory.create_dataset(
            emr_system, HiveTable.TableLoadType.APPEND, data_type,
            self.destination_table)

        config_filename = "append_load-{}-{}.json".format(
            self.destination_environment, self.dataset.table_lake)
        self.config_filepath = os.path.join(self.dataset.dir_apps_append_load,
                                            config_filename)
        self.db_name_lake = self.scon_emr_dict["environments"][
            self.destination_environment]["schemas"]["lake"]

        self.expected_algorithms_jar_path = "s3://" + os.path.join(
            (self.scon_emr_dict["environments"][self.destination_environment]
             ["s3_buckets"]["application"]).strip("/"),
            (self.scon_emr_dict["environments"][self.destination_environment]
             ["s3_deployment_dir_base"]).strip("/"),
            self.destination_environment, self.scon_emr_dict["subdir"]["m3d"],
            self.config_dict["subdir_projects"]["m3d_api"],
            self.scon_emr_dict["spark"]["jar_name"])