Example #1
def test_framework_invalid(descriptor_as_adict, descriptor_config):
    descriptor_config.valid_frameworks = ["foo"]
    descriptor_as_adict.ml.framework = "bar"

    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict(),
                                      descriptor_config)
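
Most of the tests in these examples mutate a descriptor_as_adict fixture and convert it back to a plain dict with .to_dict(). A minimal sketch of what such a fixture could look like, assuming the addict library provides the attribute-style dict (an assumption; the project's real fixture carries many more fields):

import pytest
from addict import Dict as Adict

@pytest.fixture
def descriptor_as_adict():
    # Hypothetical minimal descriptor; attribute access creates nested keys,
    # and Adict.to_dict() converts back to a plain dict for from_dict().
    d = Adict()
    d.ml.framework = ""
    d.hardware.instance_type = "t2.small"
    d.hardware.strategy = "single_node"
    return d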
Example #2
def test_framework_version(descriptor_as_adict):
    descriptor_as_adict.ml.framework = "mxnet"
    descriptor_as_adict.ml.framework_version = "1.0"

    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

    assert descriptor.ml.framework_version == "1.0"
Example #3
def create_job_yaml_spec(
    descriptor_contents: Dict[str, str],
    executor_config: ExecutorConfig,
    fetched_data_sources: List[DownloadableContent],
    fetched_models: List[DownloadableContent],
    scripts: List[FileSystemObject],
    job_id: str,
    *,
    event: BenchmarkEvent,
    extra_bai_config_args=None,
) -> str:
    """
    Creates the YAML spec file corresponding to a descriptor passed as parameter
    :param event: event that triggered this execution
    :param descriptor_contents: dict containing the parsed descriptor
    :param executor_config: configuration for the transpiler
    :param fetched_data_sources: list of fetched data sources, as generated by the fetcher
    :param fetched_models: list of fetched models, as generated by the fetcher
    :param scripts: list of supplied scripts
    :param job_id: str
    :param extra_bai_config_args: An optional Dict which will be forwarded to the `BaiConfig` object created
    :return: Tuple with (yaml string for the given descriptor, job_id)
    """
    descriptor = BenchmarkDescriptor.from_dict(
        descriptor_contents, executor_config.descriptor_config)

    bai_k8s_benchmark_job_builder = create_single_run_benchmark_bai_k8s_builder(
        descriptor,
        executor_config.bai_config,
        fetched_data_sources,
        scripts,
        job_id,
        event=event,
        environment_info=executor_config.environment_info,
        extra_bai_config_args=extra_bai_config_args,
    )

    if descriptor.hardware.strategy != DistributedStrategy.INFERENCE:
        return bai_k8s_benchmark_job_builder.dump_yaml_string()

    bai_k8s_inference_server_job_builder = create_inference_server_bai_k8s_builder(
        descriptor,
        executor_config.bai_config,
        job_id,
        fetched_models,
        event=event,
        environment_info=executor_config.environment_info,
        extra_bai_config_args=extra_bai_config_args,
    )

    return (f"{bai_k8s_benchmark_job_builder.dump_yaml_string()}"
            f"---\n"
            f"{bai_k8s_inference_server_job_builder.dump_yaml_string()}")
Example #4
def create_scheduled_job_yaml_spec(descriptor_contents: Dict,
                                   executor_config: ExecutorConfig,
                                   job_id: str, event: BenchmarkEvent) -> str:
    """
    Creates the YAML spec file corresponding to a descriptor passed as parameter
    :param event: event that triggered this execution
    :param descriptor_contents: dict containing the parsed descriptor
    :param executor_config: configuration for the transpiler
    :param job_id: str
    :return: Tuple with (yaml string for the given descriptor, job_id)
    """
    descriptor = BenchmarkDescriptor.from_dict(
        descriptor_contents, executor_config.descriptor_config)
    bai_k8s_builder = create_scheduled_benchmark_bai_k8s_builder(
        descriptor, executor_config.bai_config, job_id, event=event)
    return bai_k8s_builder.dump_yaml_string()
Example #5
    def run(self, event: FetcherBenchmarkEvent) -> BenchmarkJob:
        logger.info(f"Processing SageMaker benchmark {event.action_id}")
        try:
            descriptor = BenchmarkDescriptor.from_dict(event.payload.toml.contents, CONFIG)
        except DescriptorError as e:
            logger.exception("Could not parse descriptor %s", e)
            raise ExecutionEngineException("Cannot process the request") from e

        with tempfile.TemporaryDirectory(prefix=self.config.tmp_sources_dir) as tmpdirname:
            ScriptSourceDirectory.create(descriptor, tmpdirname, event.payload.scripts)

            session = self.session_factory()
            try:
                estimator = self.estimator_factory(session, descriptor, tmpdirname, self.config)
            except Exception as e:
                logger.exception("Could not create estimator %s", e)
                raise ExecutionEngineException("Cannot create estimator") from e

            # Estimate the total size
            total_size_gb = self._estimate_total_gb(event)
            estimator.train_volume_size = max(estimator.train_volume_size, total_size_gb)

            data = self._get_estimator_data(event)

            try:
                job_name = SageMakerExecutionEngine._get_job_name(event.action_id)
                merge = False
                if descriptor.custom_params and descriptor.custom_params.sagemaker_job_name:
                    job_name = descriptor.custom_params.sagemaker_job_name
                if descriptor.custom_params and descriptor.custom_params.merge:
                    merge = descriptor.custom_params.merge
                logger.info(f"Attempting to start training job {job_name}")
                if merge:
                    estimator.fit(data, wait=True, logs=False, job_name=job_name)
                    self.merge_metrics(descriptor)
                else:
                    estimator.fit(data, wait=False, logs=False, job_name=job_name)
            except botocore.exceptions.ClientError as err:
                error_message = get_client_error_message(err, default="Unknown")
                raise ExecutionEngineException(
                    f"Benchmark creation failed. SageMaker returned error: {error_message}"
                ) from err
            except Exception as err:
                logger.exception("Caught unexpected exception: %s", err)
                raise
            return BenchmarkJob(id=estimator.latest_training_job.name)
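
get_client_error_message is a project helper; a plausible self-contained equivalent is shown below, only to illustrate where the SageMaker error text lives inside a botocore ClientError (the real helper may differ):

from botocore.exceptions import ClientError

def get_client_error_message(err: ClientError, default: str = "Unknown") -> str:
    # botocore stores the parsed service response under
    # err.response["Error"]["Message"]; fall back to a default if absent.
    return err.response.get("Error", {}).get("Message", default)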
Example #6
    def handle_event(self, event: ExecutorBenchmarkEvent,
                     kafka_service: KafkaService):
        job_id = event.payload.job.id
        if job_id in self.watchers:
            # This shouldn't happen, so it is here more as a protection mechanism
            logger.warning("There is already a watcher for job '%s'", job_id)
            return

        descriptor = BenchmarkDescriptor.from_dict(event.payload.toml.contents)
        if descriptor.hardware.strategy not in [
                DistributedStrategy.SINGLE_NODE, DistributedStrategy.INFERENCE
        ]:
            logger.info(f"Unsupported strategy {descriptor.hardware.strategy}")
            kafka_service.send_status_message_event(
                event, Status.PENDING,
                f"'{descriptor.hardware.strategy.value}' strategy is not currently supported."
            )
            return

        logger.info("Starting to watch the job '%s'", job_id)

        watcher_callback = self._make_status_callback(
            event, kafka_service, not self._is_sage_maker_job(event))
        if self._is_sage_maker_job(event):
            watcher = SageMakerTrainingJobWatcher(
                job_id=job_id,
                callback=watcher_callback,
                sagemaker_client=boto3.client("sagemaker"))
            kafka_service.send_status_message_event(
                event, Status.PENDING, "Watching SageMaker benchmark")
        else:
            watcher = KubernetesJobWatcher(
                job_id,
                watcher_callback,
                kubernetes_client_jobs=kubernetes.client.BatchV1Api(),
                kubernetes_client_pods=kubernetes.client.CoreV1Api(),
                kubernetes_namespace=self.config.kubernetes_namespace_of_running_jobs,
            )
            kafka_service.send_status_message_event(
                event, Status.PENDING, "Watching Kubernetes benchmark")
        self.watchers[job_id] = watcher
        watcher.start()
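
Both watcher classes are project-specific, but the dispatch above follows a common pattern: one background watcher per job id, keyed in a dict so a duplicate event is ignored. A generic, self-contained sketch of that pattern (all names here are illustrative, not the project's API):

import threading
import time
from typing import Callable, Dict

class PollingJobWatcher(threading.Thread):
    """Polls a status function until the job reaches a terminal state."""

    def __init__(self, job_id: str, poll_status: Callable[[str], str], interval_s: float = 10.0):
        super().__init__(daemon=True)
        self.job_id = job_id
        self.poll_status = poll_status
        self.interval_s = interval_s

    def run(self):
        while self.poll_status(self.job_id) not in ("SUCCEEDED", "FAILED"):
            time.sleep(self.interval_s)

watchers: Dict[str, PollingJobWatcher] = {}

def watch(job_id: str, poll_status: Callable[[str], str]) -> None:
    if job_id in watchers:  # mirrors the duplicate-event guard above
        return
    watchers[job_id] = PollingJobWatcher(job_id, poll_status)
    watchers[job_id].start()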
Example #7
def descriptor(descriptor_as_dict):
    return BenchmarkDescriptor.from_dict(descriptor_as_dict)
Example #8
def test_distributed_default(descriptor_as_adict):
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

    assert descriptor.hardware.processes_per_instance == 1
Example #9
def test_invalid_args_type(descriptor_as_adict):
    descriptor_as_adict.ml.args = 4
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
Example #10
def test_distributed_gpus(descriptor_as_adict):
    descriptor_as_adict.hardware.instance_type = "p3.8xlarge"
    descriptor_as_adict.hardware.distributed.processes_per_instance = ONE_PER_GPU
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

    assert descriptor.hardware.processes_per_instance == 4
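
ONE_PER_GPU asks the parser to resolve processes_per_instance to the GPU count of the instance type (a p3.8xlarge has 4 GPUs, hence the assertion; compare Example #17, where a CPU-only instance raises). A minimal sketch of that resolution, with a hypothetical sentinel value and lookup table:

ONE_PER_GPU = "gpus"  # hypothetical sentinel value
GPUS_PER_INSTANCE = {"p3.8xlarge": 4, "p3.16xlarge": 8}  # illustrative subset

def resolve_processes_per_instance(value, instance_type: str) -> int:
    if value == ONE_PER_GPU:
        if instance_type not in GPUS_PER_INSTANCE:
            # CPU-only instances have no GPU count to expand to.
            raise ValueError(f"{instance_type} has no GPUs")
        return GPUS_PER_INSTANCE[instance_type]
    return int(value)  # strings such as "4" are accepted, cf. Example #20

assert resolve_processes_per_instance(ONE_PER_GPU, "p3.8xlarge") == 4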
Example #11
def test_invalid_scheduling(descriptor_as_adict, scheduling):
    descriptor_as_adict.info.scheduling = scheduling
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
Example #12
def test_invalid_custom_labels(descriptor_as_adict, labels):
    descriptor_as_adict.info.labels = labels
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
Example #13
def test_invalid_strategy(descriptor_as_adict):
    descriptor_as_adict.hardware.strategy = "foo"
    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
Example #14
def test_framework_explicit(descriptor_as_adict):
    descriptor_as_adict.ml.framework = "mxnet"

    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

    assert descriptor.ml.framework == MLFramework.MXNET
Example #15
def customparams_descriptor(descriptor_config, descriptor_customparams_as_adict):
    return BenchmarkDescriptor.from_dict(descriptor_customparams_as_adict.to_dict(), descriptor_config)
Example #16
def test_framework_version_no_framework(descriptor_as_adict):
    descriptor_as_adict.ml.framework = ""
    descriptor_as_adict.ml.framework_version = "1.0"

    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
Example #17
def test_distributed_gpus_on_cpu(descriptor_as_adict):
    descriptor_as_adict.hardware.instance_type = "t2.small"
    descriptor_as_adict.hardware.distributed.processes_per_instance = ONE_PER_GPU

    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
Example #18
def test_script_file_required(descriptor_as_adict, script_value):
    descriptor_as_adict.ml.script.script = script_value

    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
Example #19
def test_framework_required(descriptor_as_adict, descriptor_config):
    descriptor_config.valid_frameworks = ["foo"]

    with pytest.raises(DescriptorError):
        BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict(),
                                      descriptor_config)
Example #20
def test_distributed_explicit(descriptor_as_adict):
    descriptor_as_adict.hardware.distributed.processes_per_instance = "4"
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

    assert descriptor.hardware.processes_per_instance == 4
Example #21
def test_valid_strategy(descriptor_as_adict):
    descriptor_as_adict.hardware.strategy = "horovod"

    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())
    assert descriptor.hardware.strategy == DistributedStrategy.HOROVOD
Example #22
def descriptor(descriptor_config, descriptor_as_adict):
    return BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict(), descriptor_config)
Example #23
def test_distributed_num_instances_default(descriptor_as_adict):
    descriptor = BenchmarkDescriptor.from_dict(descriptor_as_adict.to_dict())

    assert descriptor.hardware.distributed.num_instances == 2