def test_set_yarn_spark_resource_config_fallback(
    patched_virtual_memory,
    patched_cpu_count,
    patched_yarn_config,
    patched_spark_config,
    default_bootstrapper: Bootstrapper,
) -> None:
    mocked_virtual_memory_total = PropertyMock(return_value=123 * 1024 * 1024)
    type(patched_virtual_memory.return_value).total = mocked_virtual_memory_total
    patched_cpu_count.return_value = 456
    default_bootstrapper.load_processing_job_config = MagicMock(return_value=None)
    default_bootstrapper.load_instance_type_info = MagicMock(return_value=None)
    default_bootstrapper.get_yarn_spark_resource_config = MagicMock(
        return_value=(patched_yarn_config, patched_spark_config)
    )

    default_bootstrapper.set_yarn_spark_resource_config()

    patched_virtual_memory.assert_called_once()
    mocked_virtual_memory_total.assert_called_once()
    patched_cpu_count.assert_called_once()
    default_bootstrapper.load_processing_job_config.assert_called_once()
    default_bootstrapper.load_instance_type_info.assert_called_once()
    default_bootstrapper.get_yarn_spark_resource_config.assert_called_once_with(1, 123, 456)
    patched_yarn_config.write_config.assert_called_once()
    patched_spark_config.write_config.assert_called_once()

def test_set_regional_configs(patched_config, default_bootstrapper: Bootstrapper) -> None:
    default_bootstrapper.get_regional_configs = MagicMock(return_value=[patched_config])

    default_bootstrapper.set_regional_configs()

    default_bootstrapper.get_regional_configs.assert_called_once()
    patched_config.write_config.assert_called_once()

def test_start_hadoop_daemons_on_worker(patched_popen, patched_call) -> None:
    worker_bootstrapper = Bootstrapper(
        resource_config={"current_host": "algo-2", "hosts": ["algo-1", "algo-2"]}
    )

    worker_bootstrapper.start_hadoop_daemons()

    expected_subprocess_calls = [
        call(
            "rm -rf /opt/amazon/hadoop/hdfs/datanode && mkdir -p /opt/amazon/hadoop/hdfs/datanode",
            shell=True,
        ),
    ]
    # Assert the recorded calls rather than assigning to (or discarding) them,
    # so the test actually verifies the expected subprocess invocations.
    assert patched_call.call_args_list == expected_subprocess_calls

    expected_subprocess_popens = [
        call("hdfs datanode", shell=True),
        call("yarn nodemanager", shell=True),
    ]
    assert patched_popen.call_args_list == expected_subprocess_popens

def test_get_regional_configs_gov(patched_getenv, default_bootstrapper: Bootstrapper) -> None:
    patched_getenv.return_value = "us-gov-west-1"

    regional_configs_list = default_bootstrapper.get_regional_configs()

    assert len(regional_configs_list) == 1
    assert regional_configs_list[0] == Configuration(
        Classification="core-site",
        Properties={"fs.s3a.endpoint": "s3.us-gov-west-1.amazonaws.com"},
    )
    patched_getenv.assert_called_once_with("AWS_REGION")

def test_load_processing_job_config(patched_exists, default_bootstrapper: Bootstrapper) -> None:
    exp_config = {"ProcessingResources": {"ClusterConfig": {"InstanceType": "foo.xbar", "InstanceCount": 123}}}
    patched_exists.return_value = True

    with patch("smspark.bootstrapper.open", mock_open(read_data=json.dumps(exp_config))) as m:
        actual_config = default_bootstrapper.load_processing_job_config()

    assert actual_config == exp_config
    patched_exists.assert_called_once_with(Bootstrapper.PROCESSING_JOB_CONFIG_PATH)
    m.assert_called_once_with(Bootstrapper.PROCESSING_JOB_CONFIG_PATH, "r")

def test_set_yarn_spark_resource_config(
    patched_yarn_config, patched_spark_config, default_bootstrapper: Bootstrapper
) -> None:
    processing_job_config = {
        "ProcessingResources": {"ClusterConfig": {"InstanceType": "foo.xbar", "InstanceCount": 123}}
    }
    instance_type_info = {"foo.xbar": {"MemoryInfo": {"SizeInMiB": 456}, "VCpuInfo": {"DefaultVCpus": 789}}}
    default_bootstrapper.load_processing_job_config = MagicMock(return_value=processing_job_config)
    default_bootstrapper.load_instance_type_info = MagicMock(return_value=instance_type_info)
    default_bootstrapper.get_yarn_spark_resource_config = MagicMock(
        return_value=(patched_yarn_config, patched_spark_config)
    )

    default_bootstrapper.set_yarn_spark_resource_config()

    default_bootstrapper.load_processing_job_config.assert_called_once()
    default_bootstrapper.load_instance_type_info.assert_called_once()
    default_bootstrapper.get_yarn_spark_resource_config.assert_called_once_with(123, 456, 789)
    patched_yarn_config.write_config.assert_called_once()
    patched_spark_config.write_config.assert_called_once()

def test_load_instance_type_info(patched_exists, default_bootstrapper: Bootstrapper) -> None:
    raw_config = [
        {"InstanceType": "foo.xlarge", "foo": "bar"},
        {"InstanceType": "bar.xlarge", "bar": "foo"},
    ]
    exp_config = {"foo.xlarge": {"foo": "bar"}, "bar.xlarge": {"bar": "foo"}}
    patched_exists.return_value = True

    with patch("smspark.bootstrapper.open", mock_open(read_data=json.dumps(raw_config))) as m:
        actual_config = default_bootstrapper.load_instance_type_info()

    assert actual_config == exp_config
    patched_exists.assert_called_once_with(Bootstrapper.INSTANCE_TYPE_INFO_PATH)
    m.assert_called_once_with(Bootstrapper.INSTANCE_TYPE_INFO_PATH, "r")

def start_history_server(event_logs_s3_uri: str) -> None:
    """Bootstrap the instance and start the Spark history server."""
    bootstrapper = Bootstrapper()
    log.info("copying aws jars")
    bootstrapper.copy_aws_jars()
    log.info("copying cluster config")
    bootstrapper.copy_cluster_config()
    log.info("setting regional configs")
    bootstrapper.set_regional_configs()
    log.info("copying history server config")
    config_history_server(event_logs_s3_uri)
    log.info("bootstrapping master node")
    bootstrapper.start_spark_standalone_primary()

    try:
        # Capture stderr so the CalledProcessError handler below has output to decode;
        # without it, e.stderr would be None.
        subprocess.run("sbin/start-history-server.sh", check=True, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise AlgorithmError(message=e.stderr.decode(sys.getfilesystemencoding()), caused_by=e, exit_code=e.returncode)
    except Exception as e:
        log.error("Exception during processing: " + str(e) + "\n" + traceback.format_exc())
        raise AlgorithmError(
            message="error occurred during start-history-server execution. Please see logs for details.",
            caused_by=e,
        )

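# A minimal invocation sketch for the function above. The S3 URI is an
# illustrative placeholder, not a value taken from the real entrypoint, which
# may pass the event-log location differently (e.g. via an environment variable).
if __name__ == "__main__":
    start_history_server("s3://example-bucket/spark-event-logs")
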
def test_get_yarn_spark_resource_config(default_bootstrapper: Bootstrapper) -> None:
    # Using a cluster with a single m5.xlarge instance, calculate the Yarn and Spark
    # configs and double-check the math.
    instance_mem_mb = 16384
    instance_cores = 4
    yarn_config, spark_config = default_bootstrapper.get_yarn_spark_resource_config(1, instance_mem_mb, instance_cores)

    exp_yarn_max_mem_mb = 15892  # = int(instance_mem_mb * .97) = int(16384 * .97) = int(15892.48)

    exp_yarn_config_props = {
        "yarn.scheduler.minimum-allocation-mb": "1",
        "yarn.scheduler.maximum-allocation-mb": str(exp_yarn_max_mem_mb),
        "yarn.scheduler.minimum-allocation-vcores": "1",
        "yarn.scheduler.maximum-allocation-vcores": str(instance_cores),
        "yarn.nodemanager.resource.memory-mb": str(exp_yarn_max_mem_mb),
        "yarn.nodemanager.resource.cpu-vcores": str(instance_cores),
    }

    assert yarn_config.Classification == "yarn-site"
    assert yarn_config.Properties == exp_yarn_config_props

    exp_executor_cores = 4  # = instance_cores = 4
    exp_executor_count_total = 1  # = instance_count * executor_count_per_instance = 1 * 1
    exp_default_parallelism = 8  # = instance_count * instance_cores * 2 = 1 * 4 * 2

    exp_driver_mem_mb = 2048  # = 2 * 1024
    exp_driver_mem_ovr_mb = 204  # = int(driver_mem_mb * driver_mem_ovr_pct) = int(2048 * 0.1) = int(204.8)
    # = int((instance_mem_mb - driver_mem_mb - driver_mem_ovr_mb) /
    #       (executor_count_per_instance + executor_count_per_instance * executor_mem_ovr_pct))
    # = int((15892 - 2048 - 204) / (1 + 1 * 0.1))
    # = int(13640 / 1.1), which floating point evaluates to int(12399.999...) = 12399
    exp_executor_mem_mb = 12399
    exp_executor_mem_ovr_mb = 1239  # = int(executor_mem_mb * executor_mem_ovr_pct) = int(12399 * 0.1) = int(1239.9)

    exp_driver_gc_config = (
        "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 "
        "-XX:+CMSClassUnloadingEnabled"
    )
    exp_driver_java_opts = "-XX:OnOutOfMemoryError='kill -9 %p' " f"{exp_driver_gc_config}"

    # ConcGCThreads = max(int(executor_cores / 4), 1) = max(int(4 / 4), 1) = max(1, 1) = 1
    # ParallelGCThreads = max(int(3 * executor_cores / 4), 1) = max(int(3 * 4 / 4), 1) = max(3, 1) = 3
    exp_executor_gc_config = (
        "-XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70 "
        "-XX:ConcGCThreads=1 "
        "-XX:ParallelGCThreads=3 "
    )
    exp_executor_java_opts = (
        "-verbose:gc -XX:OnOutOfMemoryError='kill -9 %p' "
        "-XX:+PrintGCDetails -XX:+PrintGCDateStamps "
        f"{exp_executor_gc_config}"
    )

    exp_spark_config_props = {
        "spark.driver.memory": f"{exp_driver_mem_mb}m",
        "spark.driver.memoryOverhead": f"{exp_driver_mem_ovr_mb}m",
        "spark.driver.defaultJavaOptions": f"{exp_driver_java_opts}",
        "spark.executor.memory": f"{exp_executor_mem_mb}m",
        "spark.executor.memoryOverhead": f"{exp_executor_mem_ovr_mb}m",
        "spark.executor.cores": f"{exp_executor_cores}",
        "spark.executor.defaultJavaOptions": f"{exp_executor_java_opts}",
        "spark.executor.instances": f"{exp_executor_count_total}",
        "spark.default.parallelism": f"{exp_default_parallelism}",
    }

    assert spark_config.Classification == "spark-defaults"
    assert spark_config.Properties == exp_spark_config_props

    # Using the same instance type, increase the instance count by 10x
    yarn_config, spark_config = default_bootstrapper.get_yarn_spark_resource_config(
        10, instance_mem_mb, instance_cores
    )

    # Yarn config should be the same
    assert yarn_config.Properties == exp_yarn_config_props

    # Spark config should be the same, with 10x the executors and parallelism
    exp_spark_config_props["spark.executor.instances"] = f"{exp_executor_count_total * 10}"
    exp_spark_config_props["spark.default.parallelism"] = f"{exp_default_parallelism * 10}"

    assert spark_config.Properties == exp_spark_config_props

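# For reference, a minimal sketch of the allocation math that the assertions
# above encode. This is NOT the Bootstrapper implementation: the helper name
# and the constants (97% usable node memory, 10% driver/executor overhead,
# fixed 2 GB driver, one executor per instance) are assumptions read off the
# expected values in the test.
def sketch_yarn_spark_math(instance_count: int, instance_mem_mb: int, instance_cores: int):
    yarn_max_mem_mb = int(instance_mem_mb * 0.97)  # memory Yarn may allocate per node
    driver_mem_mb = 2 * 1024  # fixed 2 GB driver heap
    driver_mem_ovr_mb = int(driver_mem_mb * 0.1)  # 10% driver overhead
    executor_count_per_instance = 1  # one executor per instance
    # Split what remains after the driver between executor heap and its 10% overhead.
    executor_mem_mb = int(
        (yarn_max_mem_mb - driver_mem_mb - driver_mem_ovr_mb)
        / (executor_count_per_instance + executor_count_per_instance * 0.1)
    )
    executor_mem_ovr_mb = int(executor_mem_mb * 0.1)
    executor_instances = instance_count * executor_count_per_instance
    default_parallelism = instance_count * instance_cores * 2
    return executor_mem_mb, executor_mem_ovr_mb, executor_instances, default_parallelism


# sketch_yarn_spark_math(1, 16384, 4) reproduces the expected values above:
# (12399, 1239, 1, 8).
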
def test_load_instance_type_info_fallback(patched_exists, default_bootstrapper: Bootstrapper) -> None:
    patched_exists.return_value = False

    assert default_bootstrapper.load_instance_type_info() == {}

    patched_exists.assert_called_once_with(Bootstrapper.INSTANCE_TYPE_INFO_PATH)

def test_load_processing_job_config_fallback(patched_exists, default_bootstrapper: Bootstrapper) -> None:
    patched_exists.return_value = False

    assert default_bootstrapper.load_processing_job_config() == {}

    patched_exists.assert_called_once_with(Bootstrapper.PROCESSING_JOB_CONFIG_PATH)

def test_get_regional_configs_missing_region(patched_getenv, default_bootstrapper: Bootstrapper) -> None:
    patched_getenv.return_value = None

    regional_configs_list = default_bootstrapper.get_regional_configs()

    assert len(regional_configs_list) == 0
    patched_getenv.assert_called_once_with("AWS_REGION")

@pytest.fixture
def default_bootstrapper() -> Bootstrapper:
    return Bootstrapper(default_resource_config)

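# For illustration only: a minimal resource config of the shape Bootstrapper
# accepts (matching test_start_hadoop_daemons_on_worker above). The actual
# default_resource_config used by the fixture may carry additional fields.
example_resource_config = {"current_host": "algo-1", "hosts": ["algo-1", "algo-2"]}
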
class ProcessingJobManager(object):
    """Manages the lifecycle of a Spark job."""

    def __init__(
        self,
        resource_config: Dict[str, Any] = None,  # type: ignore
        processing_job_config: Dict[str, Any] = None,  # type: ignore
    ) -> None:
        """Initialize a ProcessingJobManager, loading configs if not provided."""
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger("smspark-submit")

        try:
            resource_config_path = "/opt/ml/config/resourceconfig.json"
            with open(resource_config_path, "r") as f:
                self._resource_config = json.load(f)
        except Exception:
            self.logger.warning(
                "Could not read resource config file at {}. Using default resourceconfig.".format(resource_config_path)
            )
            self._resource_config = default_resource_config
        self.logger.info(self._resource_config)

        try:
            processing_job_config_path = "/opt/ml/config/processingjobconfig.json"
            with open(processing_job_config_path, "r") as f:
                self._processing_job_config = json.load(f)
        except Exception:
            self.logger.warning(
                "Could not read processing job config file at {}. Using default processingjobconfig.".format(
                    processing_job_config_path
                )
            )
            self._processing_job_config = default_processing_job_config
        self.logger.info(self._processing_job_config)

        self.bootstrapper = Bootstrapper(self._resource_config)
        self.waiter = Waiter()
        self.status_app = StatusApp()
        self.status_client = StatusClient()

    @property
    def hostname(self) -> str:
        """Return the current host's hostname."""
        return self._resource_config["current_host"]

    @property
    def hosts(self) -> Sequence[str]:
        """Return a sequence of all the hostnames in the cluster."""
        return self._resource_config["hosts"]

    @property
    def _is_primary_host(self) -> bool:
        current_host = self.hostname
        return current_host == self._cluster_primary_host

    @property
    def _cluster_primary_host(self) -> str:
        return sorted(self._resource_config["hosts"])[0]

    def _wait_for_hostname_resolution(self) -> None:
        for host in self._resource_config["hosts"]:
            self._dns_lookup(host)

    @retry(stop=stop_after_delay(60))
    def _dns_lookup(self, host: str) -> None:
        socket.gethostbyname(host)

    def run(self, spark_submit_cmd: str, spark_event_logs_s3_uri: str, local_spark_event_logs_dir: str) -> None:
        """Run a Spark job.

        First, wait for workers to come up and bootstrap the cluster. Then run
        spark-submit and wait until the job succeeds or fails. Worker nodes are
        shut down gracefully.

        Args:
            spark_submit_cmd (str): Command submitted to run spark-submit
        """
        self.logger.info("waiting for hosts")
        self._wait_for_hostname_resolution()
        self.logger.info("starting status server")
        self._start_status_server()
        self.logger.info("bootstrapping cluster")
        self._bootstrap_yarn()
        self.logger.info("starting executor logs watcher")
        self._start_executor_logs_watcher()

        if self._is_primary_host:
            self.logger.info("starting spark event log publisher")
            spark_log_publisher = self._start_spark_event_log_publisher(
                spark_event_logs_s3_uri, local_spark_event_logs_dir
            )

            self.logger.info(f"Waiting for hosts to bootstrap: {self.hosts}")

            def all_hosts_have_bootstrapped() -> bool:
                try:
                    host_statuses: Mapping[str, StatusMessage] = self.status_client.get_status(self.hosts)
                except ConnectionError as e:
                    self.logger.info(
                        f"Got ConnectionError when polling hosts for status. "
                        f"Host may not have come up: {str(e)}.\nTraceback: {traceback.format_exc()}"
                    )
                    return False
                self.logger.info(f"Received host statuses: {host_statuses.items()}")
                has_bootstrapped = [message.status == Status.WAITING for message in host_statuses.values()]
                return all(has_bootstrapped)

            self.waiter.wait_for(predicate_fn=all_hosts_have_bootstrapped, timeout=180.0, period=5.0)

            try:
                subprocess.run(spark_submit_cmd, check=True, shell=True)
                self.logger.info("spark submit was successful. primary node exiting.")
            except subprocess.CalledProcessError as e:
                self.logger.error(
                    f"spark-submit command failed with exit code {e.returncode}: {str(e)}\n{traceback.format_exc()}"
                )
                raise AlgorithmError("spark failed with a non-zero exit code", caused_by=e, exit_code=e.returncode)
            except Exception as e:
                self.logger.error("Exception during processing: " + str(e) + "\n" + traceback.format_exc())
                raise AlgorithmError(
                    message="error occurred during spark-submit execution. Please see logs for details.",
                    caused_by=e,
                )
            finally:
                spark_log_publisher.down()
                spark_log_publisher.join(timeout=20)
        else:
            # Workers wait until the primary is up, then wait until it goes down.
            def primary_is_up() -> bool:
                try:
                    self.status_client.get_status([self._cluster_primary_host])
                    return True
                except Exception:
                    return False

            def primary_is_down() -> bool:
                return not primary_is_up()

            self.logger.info("waiting for the primary to come up")
            self.waiter.wait_for(primary_is_up, timeout=60.0, period=1.0)
            self.logger.info("waiting for the primary to go down")
            self.waiter.wait_for(primary_is_down, timeout=float("inf"), period=5.0)
            self.logger.info("primary is down, worker now exiting")

    def _bootstrap_yarn(self) -> None:
        self.status_app.status = Status.BOOTSTRAPPING
        self.bootstrapper.bootstrap_smspark_submit()
        self.status_app.status = Status.WAITING

    def _start_executor_logs_watcher(self, log_dir: str = "/var/log/yarn") -> None:
        # TODO: check Yarn configs for yarn.log.dir/YARN_LOG_DIR, in case of overrides
        spark_executor_logs_watcher = SparkExecutorLogsWatcher(log_dir)
        spark_executor_logs_watcher.daemon = True
        spark_executor_logs_watcher.start()

    def _start_status_server(self) -> None:
        server = StatusServer(self.status_app, self.hostname)
        server.daemon = True
        server.start()

    def _start_spark_event_log_publisher(
        self, spark_event_logs_s3_uri: str, local_spark_event_logs_dir: str
    ) -> SparkEventLogPublisher:
        spark_log_publisher = SparkEventLogPublisher(spark_event_logs_s3_uri, local_spark_event_logs_dir)
        spark_log_publisher.daemon = True
        spark_log_publisher.start()
        return spark_log_publisher

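# A minimal usage sketch for ProcessingJobManager, run on every host in the
# cluster; run() decides whether the host acts as primary or worker. The
# spark-submit command line and the event-log paths are illustrative
# placeholders, not values taken from the real smspark-submit entrypoint.
if __name__ == "__main__":
    manager = ProcessingJobManager()
    manager.run(
        spark_submit_cmd="spark-submit --master yarn --deploy-mode client /opt/ml/processing/input/code/app.py",
        spark_event_logs_s3_uri="s3://example-bucket/spark-events",
        local_spark_event_logs_dir="/tmp/spark-events",
    )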