def write_results(self, filename):
    """Write the results to filename in the output directory."""
    # Assemble the results document with a deterministic key order.
    data = OrderedDict()
    data["jade_version"] = jade.version.__version__
    data["timestamp"] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
    data["base_directory"] = os.getcwd()
    results = self._build_results()
    summary = results["summary"]
    data["results_summary"] = summary
    data["results"] = results["results"]
    jobs_output_dir = os.path.join(self._output, JOBS_OUTPUT_DIR)
    data["job_outputs"] = self._config.job_execution_class().collect_results(jobs_output_dir)

    output_file = os.path.join(self._output, filename)
    dump_data(data, output_file)
    logger.info("Wrote results to %s.", output_file)

    num_failed = summary["num_failed"]
    # Escalate to a warning when any job failed.
    log = logger.info if num_failed == 0 else logger.warning
    log(
        "Successful=%s Failed=%s Total=%s",
        summary["num_successful"],
        num_failed,
        summary["total"],
    )
    return output_file
def finalize(self, output_dir):
    """Finalize the stat summaries and record the results.

    Parameters
    ----------
    output_dir : str
        Directory in which to record the results.
    """
    # Convert accumulated sums into averages, then drop the raw sums.
    averages = self._summaries["average"]
    for rtype, stats in self._summaries["sum"].items():
        for stat_name, total in stats.items():
            averages[rtype][stat_name] = total / self._count
    self._summaries.pop("sum")

    metrics = (
        CpuStatsViewer.metric(),
        DiskStatsViewer.metric(),
        MemoryStatsViewer.metric(),
        NetworkStatsViewer.metric(),
    )
    # Shape each entry like what the stat viewers produce.
    stat_summaries = []
    for rtype in metrics:
        entry = {"batch": self.name, "type": rtype}
        entry.update({st: self._summaries[st][rtype] for st in self._summaries})
        stat_summaries.append(entry)

    out_file = Path(output_dir) / STATS_DIR / f"{self.name}_resource_stats.json"
    dump_data(stat_summaries, out_file)
def test_resubmit_successful(cleanup):
    """Verify that already-successful jobs can be resubmitted with modified
    submission groups and that the new group parameters take effect."""
    # Submit the config and wait for all jobs to finish successfully.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01 -t2")
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS
    # Export the submission groups, shrink the batch size, and save the file.
    check_run_command(
        f"jade config save-submission-groups {OUTPUT} -c {SG_FILE}")
    groups = load_data(SG_FILE)
    assert groups[0]["submitter_params"]["per_node_batch_size"] > NUM_COMMANDS
    groups[0]["submitter_params"]["per_node_batch_size"] = NUM_COMMANDS
    dump_data(groups, SG_FILE)
    # Resubmit the successful jobs with the modified groups and wait again.
    check_run_command(f"{RESUBMIT_JOBS} {OUTPUT} -s {SG_FILE} --successful")
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS
    # Confirm the resubmission actually used the shrunken batch size.
    check_run_command(
        f"jade config save-submission-groups {OUTPUT} --force -c {SG_FILE}")
    groups = load_data(SG_FILE)
    assert groups[0]["submitter_params"]["per_node_batch_size"] == NUM_COMMANDS
def test_resubmit_failed(cleanup):
    """Verify that a job recorded as failed is rerun by jade resubmit-jobs."""
    # Submit and wait; all jobs should succeed on the first pass.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    # Rewrite the first result as a failure (return_code=1) in both the
    # aggregator's results and the serialized results summary file.
    x = results[0]
    results[0] = Result(x.name, 1, x.status, x.exec_time_s, x.completion_time)
    agg._write_results(results)
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)
    summary = ResultsSummary(OUTPUT)
    assert summary.get_failed_results()[0].name == "1"
    # Resubmit; the "failed" job should rerun and succeed this time.
    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
def serialize_for_execution(self, scratch_dir, are_inputs_local=True):
    """Serialize config data for efficient execution.

    Parameters
    ----------
    scratch_dir : str
        Temporary storage space on the local system.
    are_inputs_local : bool
        Whether the existing input data is local to this system. Many
        concurrent workers reading input data across the network can create
        a bottleneck, so implementations may wish to copy the data locally
        before execution starts. Irrelevant when storage access is fast.

    Returns
    -------
    str
        Name of serialized config file in scratch directory.
    """
    self._transform_for_local_execution(scratch_dir, are_inputs_local)

    # One file per job lets each worker read only its own info.
    self.serialize_jobs(scratch_dir)

    config_file = os.path.join(scratch_dir, CONFIG_FILE)
    serialized = self.serialize(ConfigSerializeOptions.JOB_NAMES)
    dump_data(serialized, config_file, cls=ExtendedJSONEncoder)
    logger.info("Dumped config file locally to %s", config_file)
    return config_file
def test_resubmit_missing(cleanup):
    """Verify that a job recorded as missing is rerun by jade resubmit-jobs."""
    # Submit and wait; all jobs should succeed on the first pass.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    # Drop the last result from the aggregator and rewrite the summary file
    # so that the job appears to have never reported a result.
    results.pop()
    agg._write_results(results)
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1
    # Resubmit; the missing job should be rerun and succeed.
    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
def write_results_summary(self, filename, missing_jobs):
    """Write the results to filename in the output directory."""
    doc = OrderedDict()
    doc["jade_version"] = jade.version.__version__
    doc["timestamp"] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
    doc["base_directory"] = os.getcwd()
    results = self._build_results(missing_jobs)
    summary = results["summary"]
    doc["results_summary"] = summary
    doc["missing_jobs"] = missing_jobs
    doc["results"] = results["results"]

    output_file = os.path.join(self._output, filename)
    dump_data(doc, output_file)
    logger.info("Wrote results to %s.", output_file)

    num_successful = summary["num_successful"]
    num_canceled = summary["num_canceled"]
    num_failed = summary["num_failed"]
    num_missing = len(missing_jobs)
    # NOTE(review): canceled jobs are reported in the log line but excluded
    # from the total — confirm that is intentional.
    total = num_successful + num_failed + num_missing
    log = logger.info if num_successful == total else logger.warning
    log(
        "Successful=%s Failed=%s Canceled=%s Missing=%s Total=%s",
        num_successful,
        num_failed,
        num_canceled,
        num_missing,
        total,
    )
    return output_file
def generate_metrics(self, output_dir: Path):
    """Generate metrics from a Spark cluster into files at the given path.

    Parameters
    ----------
    output_dir : Path
        Output directory in which to write files
    """
    output_dir.mkdir(exist_ok=True, parents=True)
    results = {"metrics": []}
    # Collect executor and job metrics for every known application.
    for app in self.list_applications():
        app_id = app["id"]
        results["metrics"].append(
            {
                "application": app,
                "executors": self._submit_request(self._endpoint, app_id, "executors"),
                "jobs": self._submit_request(self._endpoint, app_id, "jobs"),
            }
        )
    filename = output_dir / self.METRICS_FILE
    dump_data(results, filename, indent=2)
    logger.info("Recorded metrics in %s", filename)
def __init__(
    self,
    config_file,
    hpc_config=DEFAULTS["hpc_config_file"],
    output=DEFAULTS["output"],
):
    """Constructs JobSubmitter.

    Parameters
    ----------
    config_file : str
        Path to the configuration file for the simulation; it is passed to
        shutil.copyfile, so it must be a filesystem path.
    hpc_config : str | dict, optional
        Path to an HPC config file, or a dict of HPC configuration
        parameters to be serialized to one.
    output : str
        Output directory
    """
    super(JobSubmitter, self).__init__(config_file, output)
    self._hpc = None
    # Keep a master copy of the config inside the output directory so the
    # submitter works from a stable snapshot of the user's file.
    master_file = os.path.join(output, CONFIG_FILE)
    shutil.copyfile(config_file, master_file)
    self._config_file = master_file
    logger.debug("Copied %s to %s", config_file, master_file)
    if isinstance(hpc_config, str):
        # A string is treated as a path to an existing HPC config file.
        self._hpc_config_file = hpc_config
    else:
        assert isinstance(hpc_config, dict)
        # Serialize dict parameters so downstream code always has a filename.
        self._hpc_config_file = os.path.join(self._output, "hpc_config.toml")
        dump_data(hpc_config, self._hpc_config_file)
def serialize_submission_groups(self, directory):
    """Serialize the submission groups so that they can be read without
    acquiring a lock.

    Parameters
    ----------
    directory : Path
    """
    serialized = [group.dict() for group in self._config.submission_groups]
    dump_data(serialized, directory / self.SUBMITTER_GROUP_FILE, cls=ExtendedJSONEncoder)
def _serialize_registry(self):
    """Write the in-memory registry (extensions and logging) to the registry file."""
    data = {"extensions": [], "logging": list(self._loggers)}
    # Strip loaded class objects; only plain metadata is serializable.
    for _, extension in sorted(self._extensions.items()):
        serializable = {
            key: value
            for key, value in extension.items()
            if not isinstance(key, ExtensionClassType)
        }
        data["extensions"].append(serializable)
    filename = self.registry_filename
    dump_data(data, filename, indent=4)
    logger.debug("Serialized data to %s", filename)
def _remove_demo_extension(self):
    """Delete the 'demo' extension entry from the registry file, if present."""
    registry_file = pathlib.Path.home() / self._REGISTRY_FILENAME
    if not registry_file.exists():
        return
    data = load_data(registry_file)
    extensions = data["extensions"]
    # Only the first matching entry is removed, mirroring a break-on-find scan.
    index = next(
        (i for i, ext in enumerate(extensions) if ext["name"] == "demo"),
        None,
    )
    if index is not None:
        extensions.pop(index)
        dump_data(data, registry_file, indent=2)
def submitter_params(
    config_file=None,
    dry_run=None,
    per_node_batch_size=None,
    hpc_config=None,
    local=None,
    max_nodes=None,
    poll_interval=None,
    resource_monitor_interval=None,
    resource_monitor_type=None,
    num_processes=None,
    verbose=None,
    reports=None,
    enable_singularity=None,
    container=None,
    try_add_blocked_jobs=None,
    time_based_batching=None,
    node_setup_script=None,
    node_shutdown_script=None,
    no_distributed_submitter=None,
):
    """Create parameters for use in 'jade submit-jobs'."""
    params = make_submitter_params(
        dry_run=dry_run,
        per_node_batch_size=per_node_batch_size,
        hpc_config=hpc_config,
        local=local,
        max_nodes=max_nodes,
        poll_interval=poll_interval,
        resource_monitor_interval=resource_monitor_interval,
        resource_monitor_type=resource_monitor_type,
        num_processes=num_processes,
        verbose=verbose,
        reports=reports,
        enable_singularity=enable_singularity,
        container=container,
        try_add_blocked_jobs=try_add_blocked_jobs,
        time_based_batching=time_based_batching,
        node_setup_script=node_setup_script,
        node_shutdown_script=node_shutdown_script,
        no_distributed_submitter=no_distributed_submitter,
    )
    # Round-trip through JSON to convert enums to plain values.
    data = json.loads(params.json())
    if config_file.suffix != ".json":
        dump_data(data, config_file)
    else:
        # JSON output gets human-friendly indentation.
        dump_data(data, config_file, indent=2)
    print(f"Created submitter parameter file {config_file}")
def serialize_jobs(self, directory):
    """Serializes main job data to job-specific files.

    Parameters
    ----------
    directory : str
    """
    for job in self.iter_jobs():
        job_filename = os.path.join(directory, job.name + ".json")
        dump_data(job.serialize(), job_filename, cls=ExtendedJSONEncoder)
    # Needed later to deserialize from a filename that includes only job names.
    self._jobs_directory = directory
def _check_registry_config(self, filename):
    """Load the registry file, upgrading the legacy list format in place."""
    data = load_data(filename)
    if not isinstance(data, list):
        return data
    # Workaround to support the old registry format. 03/06/2020
    # It can be removed eventually.
    data = {
        "extensions": data,
        "logging": DEFAULT_REGISTRY["logging"],
    }
    dump_data(data, self.registry_filename, indent=4)
    print(
        "\nReformatted registry. Refer to `jade extensions --help` "
        "for instructions on adding logging for external packages.\n")
    return data
def save_submission_groups(output_dir, config_file, force):
    """Copy the submission groups from a JADE output directory into config_file."""
    # Refuse to clobber an existing file unless --force was given.
    if config_file.exists() and not force:
        print(
            f"{config_file} exists. Use a different name or pass --force to overwrite.",
            file=sys.stderr,
        )
        sys.exit(1)
    source = output_dir / Cluster.SUBMITTER_GROUP_FILE
    if not source.exists():
        print(f"{output_dir} is not a valid JADE output directory", file=sys.stderr)
        sys.exit(1)
    dump_data(load_data(source), config_file, indent=2)
    print(f"Copied submission groups to {config_file}")
def upgrade_config_file(data, filename):
    """Upgrades v0.1.0 format to the latest.

    Parameters
    ----------
    data : dict
        Deserialized config data in the old format; mutated in place.
    filename : str
        Path to which the upgraded config is written.

    Raises
    ------
    Exception
        Raised if the config class is not GenericCommandConfiguration;
        other classes cannot be upgraded automatically.
    """
    if data["class"] != "GenericCommandConfiguration":
        # Include the offending file in the message so the user knows which
        # config to regenerate (the placeholder previously dropped it).
        raise Exception(
            f"{filename} has an old format and must be regenerated")
    data[
        "configuration_module"] = "jade.extensions.generic_command.generic_command_configuration"
    data["configuration_class"] = "GenericCommandConfiguration"
    data["format_version"] = JobConfiguration.FORMAT_VERSION
    # The legacy keys are superseded by the fields set above.
    data.pop("class")
    data.pop("extension")
    for job in data["jobs"]:
        job["extension"] = "generic_command"
        job["append_output_dir"] = False
    dump_data(data, filename, indent=2)
    logger.info("Upgraded config file format: %s", filename)
def _make_async_submitter(self, jobs, num_processes, output, verbose):
    """Create an AsyncHpcSubmitter for one batch of jobs."""
    batch_config = copy.copy(self._base_config)
    batch_config["jobs"] = jobs
    suffix = f"_batch_{self._batch_index}"
    self._batch_index += 1

    # Write a per-batch config file derived from the master config filename.
    new_config_file = self._config_file.replace(".json", f"{suffix}.json")
    dump_data(batch_config, new_config_file, cls=ExtendedJSONEncoder)
    logger.info("Created split config file %s with %s jobs",
                new_config_file, len(batch_config["jobs"]))

    run_script = os.path.join(output, f"run{suffix}.sh")
    self._create_run_script(new_config_file, run_script, num_processes, output, verbose)

    hpc_mgr = HpcManager(self._hpc_config_file, output)
    return AsyncHpcSubmitter(hpc_mgr, run_script, self._name + suffix, output)
def create_hpc_manager(cluster, config):
    """Create an HpcManager for the given cluster and validate its HPC type.

    Parameters
    ----------
    cluster : str
        Cluster name; must be "eagle" or "peregrine".
    config : dict
        HPC configuration serialized to a temporary file for the manager.

    Returns
    -------
    HpcManager
    """
    os.environ["NREL_CLUSTER"] = cluster
    mgr = None
    hpc_file = "test-hpc-config.toml"
    try:
        dump_data(config, hpc_file)
        mgr = HpcManager(hpc_file, OUTPUT_DIR)
    finally:
        # dump_data may have failed before creating the file; don't let the
        # cleanup raise FileNotFoundError and mask the original exception.
        if os.path.exists(hpc_file):
            os.remove(hpc_file)

    if cluster == "eagle":
        assert mgr.hpc_type == HpcType.SLURM
    elif cluster == "peregrine":
        assert mgr.hpc_type == HpcType.PBS
    else:
        assert False, "unknown cluster={}".format(cluster)
    return mgr
def add_submission_group(params_file, name, config_file):
    """Add a submission group with parameters defined in params_file to config_file."""
    config = load_data(config_file)
    existing_names = {grp["name"] for grp in config["submission_groups"]}
    if name in existing_names:
        print(f"Error: {name} is already stored in {config_file}", file=sys.stderr)
        sys.exit(1)
    group = {
        "name": name,
        "submitter_params": load_data(params_file),
    }
    # Make sure it parses.
    SubmissionGroup(**group)
    config["submission_groups"].append(group)
    dump_data(config, config_file, indent=2)
    print(f"Updated {config_file} with submission group {name}.")
def hpc(account, config_file, mem, partition, qos, hpc_type, tmp, walltime):
    """Create an HPC config file."""
    # Build the type-specific config object.
    if hpc_type == "slurm":
        selected = SlurmConfig(
            account=account,
            mem=mem,
            partition=partition,
            qos=qos,
            tmp=tmp,
            walltime=walltime,
        )
    elif hpc_type == "fake":
        selected = FakeHpcConfig(walltime=walltime)
    else:
        assert hpc_type == "local"
        selected = LocalHpcConfig()

    # Round-trip through JSON to convert enums to plain values.
    data = json.loads(HpcConfig(hpc_type=hpc_type, hpc=selected).json())
    dump_data(data, config_file)
    print(f"Created HPC config file {config_file}")
def create(auto_config_cmds, config_file, submit_params=None):
    """Create a pipeline with multiple Jade configurations.

    Parameters
    ----------
    auto_config_cmds : list
        One auto-config command per pipeline stage.
    config_file : str
        Pipeline config file to create.
    submit_params : str, optional
        Space-separated "option=value" overrides applied to every stage's
        submit-params.
    """
    # Options whose values must be coerced from strings.
    int_params = {"-b", "--per-node-batch-size", "-n", "--max-nodes", "-q", "--num-processes"}
    float_params = {"-p", "--poll-interval"}

    data = {"stages": []}
    user_submit_params = {}
    if submit_params:
        for option in submit_params.split(" "):
            if "=" in option:
                # maxsplit=1 keeps values that themselves contain '='.
                param, value = option.split("=", 1)
            else:
                param, value = option, ""
            if param in int_params:
                user_submit_params[param] = int(value)
            elif param in float_params:
                user_submit_params[param] = float(value)
            else:
                user_submit_params[param] = value

    for i, cmd in enumerate(auto_config_cmds):
        stage_num = i + 1
        stage = {
            "auto_config_cmd": cmd,
            "config_file": PipelineManager.get_stage_config_file_name(stage_num),
            "submit-params": {
                "--max-nodes": DEFAULTS["max_nodes"],
                "--per-node-batch-size": DEFAULTS["per_node_batch_size"],
                "--num-processes": None,
            },
        }
        # User overrides take precedence over the defaults above.
        stage["submit-params"].update(user_submit_params)
        data["stages"].append(stage)

    dump_data(data, config_file)
    logger.info("Created pipeline config file %s", config_file)
def _check_registry_config(self, filename):
    """Load the registry file, upgrading legacy formats as needed."""
    data = load_data(filename)
    if isinstance(data, list):
        # Workaround to support the old registry format. 03/06/2020
        # It can be removed eventually.
        data = {
            "extensions": data,
            "logging": DEFAULT_REGISTRY["logging"],
        }
        dump_data(data, self.registry_filename, indent=4)
        print(
            "\nReformatted registry. Refer to `jade extensions --help` "
            "for instructions on adding logging for external packages.\n")
    # v0.1.0 (or missing version) means the registry predates the current
    # schema; reset to defaults and reload.
    if data.get("format_version", "v0.1.0") == "v0.1.0":
        self.reset_defaults()
        data = load_data(filename)
        print("\nWARNING: Reformatted registry. You will need to "
              "re-register any external extensions.\n")
    return data
def _make_async_submitter(self, jobs, submission_group, dry_run=False):
    """Create an AsyncHpcSubmitter for one batch of jobs in a submission group."""
    batch_config = copy.copy(self._base_config)
    batch_config["jobs"] = jobs
    suffix = f"_batch_{self._batch_index}"
    self._batch_index += 1

    # Write a per-batch config file derived from the master config filename.
    new_config_file = self._config_file.replace(".json", f"{suffix}.json")
    dump_data(batch_config, new_config_file, cls=ExtendedJSONEncoder)
    logger.info("Created split config file %s with %s jobs",
                new_config_file, len(batch_config["jobs"]))

    run_script = os.path.join(self._output, f"run{suffix}.sh")
    self._create_run_script(new_config_file, run_script, submission_group)

    submitter_name = submission_group.submitter_params.hpc_config.job_prefix + suffix
    return AsyncHpcSubmitter(
        self._hpc_mgr,
        self._status_collector,
        run_script,
        submitter_name,
        submission_group,
        self._output,
        dry_run=dry_run,
    )
def run(self):
    """Runs the autoregression, and return status code"""
    try:
        result_file, plot_file = autoregression_analysis(
            country=self._job.country,
            data=self._job.data,
            output=self._job_dir)
        summary_file = os.path.join(self._job_dir, "summary.toml")
        dump_data(
            {
                "name": self._job.name,
                "country": self._job.country,
                "output": self._output,
                "result": result_file,
                "plot": plot_file,
            },
            summary_file,
        )
        # Deliberate failure hook used to exercise the error-logging path.
        if self._job.country == "australia":
            raise Exception("test")
    except Exception:
        # Log event into file with structured message.
        event = StructuredErrorLogEvent(
            source=self._job.name,
            category=EVENT_CATEGORY_ERROR,
            name=EVENT_NAME_UNHANDLED_ERROR,
            message="Analysis failed!",
        )
        log_event(event)
        # Must raise the exception here, or job returncode is 0 even it fails.
        raise
    return 0
def _dump_status(self):
    """Persist the pipeline status, recording the current stage id."""
    self._status_info.update(current_stage_id=self._cur_stage_id)
    dump_data(self._status_info, self._status_file)
def _save_events_summary(self):
    """Save events to one file per event name."""
    for name, events in self._events.items():
        serialized = [evt.to_dict() for evt in events]
        dump_data(serialized, self._make_event_filename(name))
def _filter(config_file, output_file, indices, fields, show_config=False):
    """Filters jobs in CONFIG_FILE.

    Prints the new jobs to the console or optionally creates a new file.
    Note: This does not detect duplicate ranges.

    \b
    Examples:
    1. Select the first job. Output only.
       jade config filter c1.json 0
    2. Select indices 0-4, 10-14, 20, 25, create new file.
       jade config filter c1.json :5 10:15 20 25 -o c2.json
    3. Select the last 5 jobs. Note the use of '--' to prevent '-5' from
       being treated as an option.
       jade config filter c1.json -o c2.json -- -5:
    4. Select indices 5 through the end.
       jade config filter c1.json -o c2.json 5:
    5. Select jobs with parameters param1=green and param2=3.
       jade config filter c1.json -o c2.json -f param1 green -f param2 3
    """
    cfg = load_data(config_file)
    jobs = cfg["jobs"]
    if not jobs:
        print("The configuration has no jobs")
        sys.exit(1)
    if output_file is None:
        # No output file requested: filter into a temp file, show it, and
        # remove the temp file in the finally block below.
        handle, new_config_file = tempfile.mkstemp(suffix=".json")
        os.close(handle)
        show_config = True
    else:
        new_config_file = output_file
    try:
        if not new_config_file.endswith(".json"):
            print("new_config_file must have extension .json")
            sys.exit(1)
        orig_len = len(jobs)
        new_jobs = []
        # A bare integer selects a single job; "start:end" selects a slice.
        # Tokens matching neither pattern are silently ignored.
        regex_int = re.compile(r"^(?P<index>\d+)$")
        regex_range = re.compile(r"^(?P<start>[\d-]*):(?P<end>[\d-]*)$")
        for index in indices:
            match = regex_int.search(index)
            if match:
                i = int(match.groupdict()["index"])
                new_jobs.append(jobs[i])
                continue
            match = regex_range.search(index)
            if match:
                # Empty start/end mean "from the beginning" / "to the end",
                # exactly like Python slice syntax.
                start = match.groupdict()["start"]
                if start == "":
                    start = None
                else:
                    start = int(start)
                end = match.groupdict()["end"]
                if end == "":
                    end = None
                else:
                    end = int(end)
                new_jobs += jobs[start:end]
        # Note: when looking at just the JSON, there is no way to get the job name,
        # and so we can't check for duplicates.
        if not new_jobs:
            # No index selection means "all jobs" before field filtering.
            new_jobs = jobs
        if fields:
            # Keep only jobs whose every (field, value) pair matches; values
            # are compared as strings.
            final_jobs = []
            for job in new_jobs:
                matched = True
                for field in fields:
                    if str(job[field[0]]) != field[1]:
                        matched = False
                        break
                if matched:
                    final_jobs.append(job)
            new_jobs = final_jobs
        cfg["jobs"] = new_jobs
        new_len = len(cfg["jobs"])
        dump_data(cfg, new_config_file, indent=4)
        print(
            f"Filtered {config_file} ({orig_len} jobs) into ({new_len} jobs)\n"
        )
        if output_file is not None:
            print(f"Wrote new config to {output_file}")
        if show_config:
            _show(new_config_file, [])
    finally:
        # Only the temp file created above is ours to delete.
        if output_file is None:
            os.remove(new_config_file)
def test_resubmit_with_blocking_jobs(basic_setup):
    """Verify that resubmitting a missing job also resubmits the jobs that
    are (transitively) blocked on it."""
    num_commands = 7
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    for i, job_param in enumerate(jobs):
        if i == 3:
            job_param.blocked_by = set([5])
        elif i == 4:
            job_param.blocked_by = set([7])
        elif i == 6:
            job_param.blocked_by = set([6])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    # Submit and wait; all jobs should succeed on the first pass.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0

    # Remove job "7" from the aggregator's results so it appears missing.
    found = False
    for i, result in enumerate(results):
        if result.name == "7":
            results.pop(i)
            found = True
            break
    assert found
    agg._write_results(results)

    # Mirror the removal in the serialized results summary file.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = result
            final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    # Resubmit; everything should end up successful again.
    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands
    # The second batch must contain the missing job plus the jobs blocked
    # on it (transitively): 3 jobs in total.
    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch = load_data(second_batch_file)["jobs"]
    assert len(second_batch) == 3
def config(
    collect_worker_logs,
    container_path,
    dynamic_allocation,
    gpu,
    hpc_config,
    master_node_memory_overhead_gb,
    node_memory_overhead_gb,
    run_user_script_inside_container,
    spark_dir,
    shuffle_partition_multiplier,
    update_config_file,
    use_tmpfs_for_scratch,
    alt_scratch,
    verbose,
    worker_memory_gb,
    force,
):
    """Create a Spark configuration to use for running a job on a Spark cluster.

    Builds a Spark conf/bin directory tree under spark_dir, sizes executor
    memory from the HPC config, and either updates every job in
    update_config_file with the resulting SparkConfigModel or prints it.
    """
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("config_spark", None, console_level=level)

    spark_dir = Path(spark_dir)
    if spark_dir.exists():
        if force:
            shutil.rmtree(spark_dir)
        else:
            print(
                f"The directory '{spark_dir}' already exists. Use a different name or pass --force to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)
    spark_dir.mkdir(parents=True)

    # Bug fix: this check previously referenced an undefined name ("alt"),
    # which raised NameError whenever use_tmpfs_for_scratch was set.
    if use_tmpfs_for_scratch and alt_scratch is not None:
        print("use_tmpfs_for_scratch and alt_scratch cannot both be set", file=sys.stderr)
        sys.exit(1)

    hpc_config_data = HpcConfig.load(hpc_config)
    nodes = getattr(hpc_config_data.hpc, "nodes", None)
    if nodes is None:
        print(
            f"hpc_type={hpc_config_data.hpc_type} doesn't have a nodes field",
            file=sys.stderr)
        sys.exit(1)

    # Size per-executor memory from the per-node memory requirement.
    mem = getattr(hpc_config_data.hpc, "mem", None)
    if mem is None:
        executor_mem_gb = 11
        print(f"Use default per-executor memory of {executor_mem_gb}G")
    else:
        num_executors = 7
        if not mem.endswith("G"):
            raise Exception(
                "This feature only supports HPC memory requirements ending with 'G'"
            )
        per_node_mem_gb = int(mem[:-1])
        if use_tmpfs_for_scratch:
            # Half of the node's memory is consumed by the tmpfs scratch.
            per_node_mem_gb //= 2
        overhead = master_node_memory_overhead_gb - node_memory_overhead_gb
        executor_mem_gb = (per_node_mem_gb - overhead) // num_executors
        print(
            f"Use custom per-executor memory of {executor_mem_gb}G based on per-node {mem}"
        )

    # Copy the packaged Spark bin/conf templates into the new directory.
    for dirname in ("bin", "conf"):
        src_path = Path(os.path.dirname(__file__)).parent / "spark" / dirname
        dst_path = spark_dir / dirname
        if not dst_path.exists():
            dst_path.mkdir()
        for filename in src_path.iterdir():
            shutil.copyfile(filename, dst_path / filename.name)

    use_gpus = _should_use_gpus(hpc_config_data, gpu)

    with open(spark_dir / "conf" / "spark-defaults.conf", "a") as f_out:
        f_out.write("\n")
        f_out.write(f"spark.executor.memory {executor_mem_gb}G\n")
        # Online documentation says this value should correlate with the number of cores in the
        # cluster. Some sources say 1 per core, others say 2 or 4 per core. Depends on use case.
        # This should be a reasonable default for users, who can customize dynamically.
        params = ["spark.sql.shuffle.partitions"]
        # Some sources say that we should set spark.default.parallelism to the same value,
        # others say it doesn't work. Experiments showed harmful effects if dynamic allocation
        # was enabled with a custom value.
        for param in params:
            f_out.write(param)
            f_out.write(" ")
            f_out.write(str(nodes * 35 * shuffle_partition_multiplier))
            f_out.write("\n")
        if dynamic_allocation:
            f_out.write("\n")
            f_out.write(DYNAMIC_ALLOCATION_SETTINGS)
            f_out.write("\n")
        if use_gpus:
            src_path = (Path(os.path.dirname(__file__)).parent / "spark" /
                        "conf" / "resourcesFile.json")
            resources_file = spark_dir / "conf" / "resourcesFile.json"
            shutil.copyfile(src_path, resources_file)
            f_out.write(
                "spark.worker.resource.gpu.discoveryScript /opt/sparkRapidsPlugin/getGpusResources.sh\n"
            )
            f_out.write(f"spark.worker.resourcesFile {resources_file}\n")

    if use_gpus:
        filename = spark_dir / "conf" / "spark-env.sh"
        with open(filename, "a") as f_out:
            num_gpus = hpc_config_data.get_num_gpus() or 2
            f_out.write(
                f'SPARK_WORKER_OPTS="-Dspark.worker.resource.gpu.amount={num_gpus} '
                f'-Dspark.worker.resource.gpu.discoveryScript={GPU_DISCOVERY_SCRIPT}"\n'
            )

    # Fill in site-specific paths in the wrapper scripts and make them executable.
    replacement_values = [
        ("SPARK_DIR", str(spark_dir)),
        ("CONTAINER_PATH", container_path),
    ]
    for name in ("run_spark_script_wrapper.sh", "run_user_script_wrapper.sh"):
        filename = spark_dir / "bin" / name
        _replace_tag(replacement_values, filename)
        st = os.stat(filename)
        os.chmod(filename, st.st_mode | stat.S_IEXEC)
        # Restored the actual filename in this message; it previously printed
        # a "(unknown)" placeholder.
        print(f"Assigned paths in {filename}")

    scripts = [spark_dir / "conf" / "spark-env.sh"] + list(
        (spark_dir / "bin").glob("*.sh"))
    for script in scripts:
        st = os.stat(script)
        os.chmod(script, st.st_mode | stat.S_IEXEC)
    print(
        f"Created Spark configuration in {spark_dir.absolute()} for a {nodes}-node cluster. "
        f"GPUs={use_gpus}")

    spark_config = SparkConfigModel(
        collect_worker_logs=collect_worker_logs,
        conf_dir=str(spark_dir),
        container=SparkContainerModel(path=container_path),
        enabled=True,
        master_node_memory_overhead_gb=master_node_memory_overhead_gb,
        node_memory_overhead_gb=node_memory_overhead_gb,
        run_user_script_inside_container=run_user_script_inside_container,
        use_tmpfs_for_scratch=use_tmpfs_for_scratch,
        alt_scratch=alt_scratch,
        worker_memory_gb=worker_memory_gb,
    )

    if update_config_file is not None:
        if not Path(update_config_file).exists():
            print(f"'update_config_file={update_config_file} does not exist",
                  file=sys.stderr)
            sys.exit(1)
        config = load_data(update_config_file)
        for job in config["jobs"]:
            job["spark_config"] = spark_config.dict()
        dump_data(config, update_config_file, indent=2)
        print(
            f"Updated jobs in {update_config_file} with this Spark configuration."
        )
    else:
        print(
            "\nAdd and customize this JSON object to the 'spark_config' field for each Spark "
            "job in your config.json file:\n")
        print(spark_config.json(indent=2))