Example #1
def test_environment():
    toolkit = Toolkit(software="spark",
                      version="2.2.0",
                      environment="miniconda")
    assert toolkit.software == "spark"
    assert toolkit.version == "2.2.0"
    assert toolkit.environment == "miniconda"
Example #2
    def _merge_dict(self, config):
        config = config.get('job')

        if config.get('id') is not None:
            self.id = config['id']

        cluster_configuration = config.get('cluster_configuration')
        if cluster_configuration:
            self.vm_size = cluster_configuration.get('vm_size')
            self.toolkit = Toolkit.from_dict(
                cluster_configuration.get('toolkit'))
            if cluster_configuration.get('size') is not None:
                self.max_dedicated_nodes = cluster_configuration.get('size')
            if cluster_configuration.get('size_low_priority') is not None:
                self.max_low_pri_nodes = cluster_configuration.get(
                    'size_low_priority')
            self.custom_scripts = cluster_configuration.get('custom_scripts')
            self.subnet_id = cluster_configuration.get('subnet_id')
            self.worker_on_master = cluster_configuration.get(
                "worker_on_master")
            scheduling_target = cluster_configuration.get("scheduling_target")
            if scheduling_target:
                self.scheduling_target = SchedulingTarget(scheduling_target)

        applications = config.get('applications')
        if applications:
            self.applications = []
            for application in applications:
                self.applications.append(
                    aztk.spark.models.ApplicationConfiguration(
                        name=application.get('name'),
                        application=application.get('application'),
                        application_args=application.get('application_args'),
                        main_class=application.get('main_class'),
                        jars=application.get('jars'),
                        py_files=application.get('py_files'),
                        files=application.get('files'),
                        driver_java_options=application.get(
                            'driver_java_options'),
                        driver_library_path=application.get(
                            'driver_library_path'),
                        driver_class_path=application.get('driver_class_path'),
                        driver_memory=application.get('driver_memory'),
                        executor_memory=application.get('executor_memory'),
                        driver_cores=application.get('driver_cores'),
                        executor_cores=application.get('executor_cores')))

        spark_configuration = config.get('spark_configuration')
        if spark_configuration:
            self.spark_defaults_conf = self.__convert_to_path(
                spark_configuration.get('spark_defaults_conf'))
            self.spark_env_sh = self.__convert_to_path(
                spark_configuration.get('spark_env_sh'))
            self.core_site_xml = self.__convert_to_path(
                spark_configuration.get('core_site_xml'))
            self.jars = [
                self.__convert_to_path(jar)
                for jar in spark_configuration.get('jars') or []
            ]
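Read together, the .get() lookups above imply one nested layout for the job configuration. Below is a minimal sketch of such a dict: the key names are taken from the code above, every value is an illustrative placeholder, and the toolkit sub-dict assumes Toolkit.from_dict accepts the same fields as the Toolkit constructor used in the other examples.

# Hypothetical input for _merge_dict; only keys read above are shown,
# and all values are placeholders.
job_config = {
    'job': {
        'id': 'example-job',
        'cluster_configuration': {
            'vm_size': 'standard_a2',
            'toolkit': {'software': 'spark', 'version': '2.2.0'},
            'size': 2,
            'size_low_priority': 0,
            'custom_scripts': None,
            'subnet_id': None,
            'worker_on_master': True,
            'scheduling_target': None,
        },
        'applications': [{
            # keys not listed here are simply absent; .get() returns None
            'name': 'wordcount',
            'application': 'wordcount.py',
            'application_args': ['input.txt'],
            'main_class': None,
            'jars': [],
            'py_files': [],
            'files': [],
        }],
        'spark_configuration': {
            'spark_defaults_conf': 'spark-defaults.conf',
            'spark_env_sh': 'spark-env.sh',
            'core_site_xml': 'core-site.xml',
            'jars': [],
        },
    }
}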
Example #3
def cluster_config_from_dict(config: dict):
    output = ClusterConfiguration()
    wait = False
    if config.get('id') is not None:
        output.cluster_id = config['id']

    if config.get('vm_size') is not None:
        output.vm_size = config['vm_size']

    if config.get('size'):
        output.vm_count = config['size']

    if config.get('size_low_pri'):
        output.vm_low_pri_count = config['size_low_pri']

    if config.get('subnet_id') is not None:
        output.subnet_id = config['subnet_id']

    if config.get('username') is not None:
        output.user_configuration = UserConfiguration(
            username=config['username'])

        if config.get('password') is not None:
            output.user_configuration.password = config['password']

    if config.get('custom_scripts') not in [[None], None]:
        output.custom_scripts = []
        for custom_script in config['custom_scripts']:
            output.custom_scripts.append(
                aztk.spark.models.CustomScript(script=custom_script['script'],
                                               run_on=custom_script['runOn']))

    if config.get('azure_files') not in [[None], None]:
        output.file_shares = []
        for file_share in config['azure_files']:
            output.file_shares.append(
                aztk.spark.models.FileShare(
                    storage_account_name=file_share['storage_account_name'],
                    storage_account_key=file_share['storage_account_key'],
                    file_share_path=file_share['file_share_path'],
                    mount_path=file_share['mount_path'],
                ))

    if config.get('toolkit') is not None:
        output.toolkit = Toolkit.from_dict(config['toolkit'])

    if config.get('plugins') not in [[None], None]:
        output.plugins = []
        for plugin in config['plugins']:
            ref = PluginReference.from_dict(plugin)
            output.plugins.append(ref.get_plugin())

    if config.get('worker_on_master') is not None:
        output.worker_on_master = config['worker_on_master']

    if config.get('wait') is not None:
        wait = config['wait']

    return output, wait
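For reference, a sketch of how this parser might be invoked, assuming cluster_config_from_dict and the aztk models it uses are importable in the current scope; every key and value below is a placeholder chosen to exercise the branches shown above, and the toolkit sub-dict again assumes Toolkit.from_dict accepts the constructor's field names.

# Hypothetical call to the parser above; all values are placeholders.
config = {
    'id': 'example-cluster',
    'vm_size': 'standard_a2',
    'size': 2,
    'size_low_pri': 0,          # falsy, so vm_low_pri_count keeps its default
    'username': 'spark-user',   # no password: user_configuration gets username only
    'toolkit': {'software': 'spark', 'version': '2.2.0'},
    'wait': True,
}
cluster_conf, wait = cluster_config_from_dict(config)
print(cluster_conf.cluster_id, cluster_conf.vm_count, wait)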
Example #4
    def _merge_dict(self, config):
        config = config.get("job")

        if config.get("id") is not None:
            self.id = config["id"]

        cluster_configuration = config.get("cluster_configuration")
        if cluster_configuration:
            self.vm_size = cluster_configuration.get("vm_size")
            self.toolkit = Toolkit.from_dict(
                cluster_configuration.get("toolkit"))
            if cluster_configuration.get("size") is not None:
                self.max_dedicated_nodes = cluster_configuration.get("size")
            if cluster_configuration.get("size_low_priority") is not None:
                self.max_low_pri_nodes = cluster_configuration.get(
                    "size_low_priority")
            self.subnet_id = cluster_configuration.get("subnet_id")
            self.worker_on_master = cluster_configuration.get(
                "worker_on_master")
            scheduling_target = cluster_configuration.get("scheduling_target")
            if scheduling_target:
                self.scheduling_target = SchedulingTarget(scheduling_target)

        applications = config.get("applications")
        if applications:
            self.applications = []
            for application in applications:
                self.applications.append(
                    aztk.spark.models.ApplicationConfiguration(
                        name=application.get("name"),
                        application=application.get("application"),
                        application_args=application.get("application_args"),
                        main_class=application.get("main_class"),
                        jars=application.get("jars"),
                        py_files=application.get("py_files"),
                        files=application.get("files"),
                        driver_java_options=application.get(
                            "driver_java_options"),
                        driver_library_path=application.get(
                            "driver_library_path"),
                        driver_class_path=application.get("driver_class_path"),
                        driver_memory=application.get("driver_memory"),
                        executor_memory=application.get("executor_memory"),
                        driver_cores=application.get("driver_cores"),
                        executor_cores=application.get("executor_cores"),
                    ))

        spark_configuration = config.get("spark_configuration")
        if spark_configuration:
            self.spark_defaults_conf = _convert_to_path(
                spark_configuration.get("spark_defaults_conf"))
            self.spark_env_sh = _convert_to_path(
                spark_configuration.get("spark_env_sh"))
            self.core_site_xml = _convert_to_path(
                spark_configuration.get("core_site_xml"))
            self.jars = [
                _convert_to_path(jar)
                for jar in spark_configuration.get("jars") or []
            ]
Example #5
def execute(args: typing.NamedTuple):
    if not args.toolkit_software:
        return print_available_softwares()

    if not validate_software(args.toolkit_software):
        return None

    if not args.version:
        return print_available_software_version(args.toolkit_software)
    if not args.environment:
        print_available_environments(args.toolkit_software)

    toolkit = Toolkit(software=args.toolkit_software,
                      version=args.version,
                      environment=args.environment)

    toolkit.validate()
    log.info("Docker image picked for this toolkit: %s",
             toolkit.get_docker_repo(args.gpu))
    return None
Example #6
def test_scheduling_target_dedicated_with_no_dedicated_nodes_raise_error():
    with pytest.raises(InvalidModelError, match="Scheduling target cannot be Dedicated if dedicated vm size is 0"):
        conf = ClusterConfiguration(
            cluster_id="abc",
            scheduling_target=SchedulingTarget.Dedicated,
            vm_size="standard_a2",
            size=0,
            size_low_priority=2,
            toolkit=Toolkit(software="spark", version="1.6.3"),
        )

        conf.validate()
Example #7
def test_valid_software_and_version():
    Toolkit(software="spark", version="2.2.0").validate()
Example #8
def test_missing_version_raise_error():
    with pytest.raises(InvalidModelError):
        Toolkit(software="spark", version=None).validate()
Example #9
def test_basic_toolkit():
    toolkit = Toolkit(software="spark", version="2.2.0")
    assert toolkit.software == "spark"
    assert toolkit.version == "2.2.0"
Example #10
def test_get_right_docker_repo_with_env_for_gpu():
    repo = Toolkit(software="spark", version="2.2.0",
                   environment="miniconda").get_docker_repo(True)

    assert repo == "aztk/spark:v{0}-spark2.2.0-miniconda-gpu".format(
        docker_image_version)
Example #11
def test_get_right_docker_repo():
    repo = Toolkit(software="spark", version="2.2.0").get_docker_repo(False)

    assert repo == "aztk/spark:v{0}-spark2.2.0-base".format(
        docker_image_version)
Example #12
def test_get_right_docker_repo_for_gpu():
    repo = Toolkit(software="spark", version="2.1.0").get_docker_repo(True)

    assert repo == "aztk/spark:v{0}-spark2.1.0-gpu".format(
        docker_image_version)
Example #13
def test_invalid_environment_version_raise_error():
    with pytest.raises(InvalidModelError):
        Toolkit(software="spark",
                version="2.2.0",
                environment="miniconda",
                environment_version="7.1.9").validate()
Example #14
def test_invalid_version_raise_error():
    with pytest.raises(InvalidModelError):
        Toolkit(software="spark", version="780.0").validate()
Example #15
def test_valid_software_version_and_environment():
    Toolkit(software="spark", version="2.2.0",
            environment="miniconda").validate()
Example #16
def test_invalid_software_raise_error():
    with pytest.raises(InvalidModelError):
        Toolkit(software="non-supported", version="2.2.0").validate()
Example #17
def test_missing_software_raise_error():
    with pytest.raises(InvalidModelError):
        Toolkit(software=None, version="2.2.0").validate()