Example #1
0
    def write_results(self, filename):
        """Write the results to filename in the output directory."""
        results = self._build_results()
        summary = results["summary"]

        data = OrderedDict()
        data["jade_version"] = jade.version.__version__
        data["timestamp"] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        data["base_directory"] = os.getcwd()
        data["results_summary"] = summary
        data["results"] = results["results"]
        jobs_output = os.path.join(self._output, JOBS_OUTPUT_DIR)
        data["job_outputs"] = \
            self._config.job_execution_class().collect_results(jobs_output)

        output_file = os.path.join(self._output, filename)
        dump_data(data, output_file)
        logger.info("Wrote results to %s.", output_file)

        # Escalate to a warning when any job failed.
        num_failed = summary["num_failed"]
        log_func = logger.info if num_failed == 0 else logger.warning
        log_func("Successful=%s Failed=%s Total=%s",
                 summary["num_successful"], num_failed, summary["total"])

        return output_file
Example #2
0
    def finalize(self, output_dir):
        """Finalize the stat summaries and record the results.

        Parameters
        ----------
        output_dir : str
            Directory in which to record the results.

        """
        # Turn the accumulated sums into averages, then drop the raw sums.
        sums = self._summaries["sum"]
        averages = self._summaries["average"]
        for resource_type, stat_dict in sums.items():
            for stat_name, total in stat_dict.items():
                averages[resource_type][stat_name] = total / self._count
        self._summaries.pop("sum")

        resource_types = (
            CpuStatsViewer.metric(),
            DiskStatsViewer.metric(),
            MemoryStatsViewer.metric(),
            NetworkStatsViewer.metric(),
        )
        stat_summaries = []
        for resource_type in resource_types:
            # Make each entry look like what the stat viewers produce.
            summary = {"batch": self.name, "type": resource_type}
            for stat_type, stats in self._summaries.items():
                summary[stat_type] = stats[resource_type]
            stat_summaries.append(summary)

        filename = Path(output_dir) / STATS_DIR / f"{self.name}_resource_stats.json"
        dump_data(stat_summaries, filename)
Example #3
0
def test_resubmit_successful(cleanup):
    """Verify that --successful reruns jobs that already passed."""
    # Submit the jobs and wait for completion; everything should pass.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01 -t2")
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS

    # Export the submission groups, shrink the per-node batch size, and
    # write the modified groups back for use during resubmission.
    check_run_command(
        f"jade config save-submission-groups {OUTPUT} -c {SG_FILE}")
    groups = load_data(SG_FILE)
    assert groups[0]["submitter_params"]["per_node_batch_size"] > NUM_COMMANDS
    groups[0]["submitter_params"]["per_node_batch_size"] = NUM_COMMANDS
    dump_data(groups, SG_FILE)

    # Resubmit the already-successful jobs with the edited groups file.
    check_run_command(f"{RESUBMIT_JOBS} {OUTPUT} -s {SG_FILE} --successful")
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS

    # The stored submission groups must now reflect the modified batch size.
    check_run_command(
        f"jade config save-submission-groups {OUTPUT} --force -c {SG_FILE}")
    groups = load_data(SG_FILE)
    assert groups[0]["submitter_params"]["per_node_batch_size"] == NUM_COMMANDS
Example #4
0
def test_resubmit_failed(cleanup):
    """Verify that resubmission reruns a job marked as failed."""
    # Submit the jobs and wait; everything should pass initially.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    # Doctor the aggregated results so the first job looks failed
    # (return code 1), then write them back.
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    x = results[0]
    results[0] = Result(x.name, 1, x.status, x.exec_time_s, x.completion_time)
    agg._write_results(results)

    # Keep the results-summary file consistent with the doctored results.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert summary.get_failed_results()[0].name == "1"

    # Resubmit; the failed job should rerun and succeed this time.
    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
Example #5
0
    def serialize_for_execution(self, scratch_dir, are_inputs_local=True):
        """Serialize config data for efficient execution.

        Parameters
        ----------
        scratch_dir : str
            Temporary storage space on the local system.
        are_inputs_local : bool
            Whether the existing input data is local to this system. For many
            configurations accessing the input data across the network by many
            concurrent workers can cause a bottleneck and so implementations
            may wish to copy the data locally before execution starts. If the
            storage access time is very fast the question is irrelevant.

        Returns
        -------
        str
            Name of serialized config file in scratch directory.

        """
        self._transform_for_local_execution(scratch_dir, are_inputs_local)

        # Write each job to its own file so that a worker only needs to read
        # its own info; the main config then carries only the job names.
        self.serialize_jobs(scratch_dir)
        serialized = self.serialize(ConfigSerializeOptions.JOB_NAMES)
        config_file = os.path.join(scratch_dir, CONFIG_FILE)
        dump_data(serialized, config_file, cls=ExtendedJSONEncoder)
        logger.info("Dumped config file locally to %s", config_file)
        return config_file
Example #6
0
def test_resubmit_missing(cleanup):
    """Verify that resubmission runs a job recorded as missing."""
    # Submit the jobs and wait; everything should pass initially.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    # Drop the last result so that one job appears to have never run.
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    results.pop()
    agg._write_results(results)

    # Keep the results-summary file consistent: record the popped job
    # as missing rather than successful.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1

    # Resubmit; the missing job should be run and the totals restored.
    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
Example #7
0
    def write_results_summary(self, filename, missing_jobs):
        """Write the results to filename in the output directory."""
        results = self._build_results(missing_jobs)
        summary = results["summary"]

        data = OrderedDict()
        data["jade_version"] = jade.version.__version__
        data["timestamp"] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        data["base_directory"] = os.getcwd()
        data["results_summary"] = summary
        data["missing_jobs"] = missing_jobs
        data["results"] = results["results"]

        output_file = os.path.join(self._output, filename)
        dump_data(data, output_file)
        logger.info("Wrote results to %s.", output_file)

        num_successful = summary["num_successful"]
        num_canceled = summary["num_canceled"]
        num_failed = summary["num_failed"]
        num_missing = len(missing_jobs)
        # NOTE(review): canceled jobs are excluded from this total —
        # presumably deliberate; confirm against _build_results.
        total = num_successful + num_failed + num_missing
        log_func = logger.info if num_successful == total else logger.warning
        log_func(
            "Successful=%s Failed=%s Canceled=%s Missing=%s Total=%s",
            num_successful,
            num_failed,
            num_canceled,
            num_missing,
            total,
        )

        return output_file
Example #8
0
    def generate_metrics(self, output_dir: Path):
        """Generate metrics from a Spark cluster into files at the given path.

        Collects executor and job metrics for every application returned by
        ``list_applications`` and records them in ``METRICS_FILE`` under
        *output_dir*.

        Parameters
        ----------
        output_dir: Path
            Output directory in which to write files

        """
        output_dir.mkdir(exist_ok=True, parents=True)
        apps = self.list_applications()
        results = {"metrics": []}
        for app in apps:
            # One entry per application: the raw app record plus its
            # executor and job metrics from the REST endpoint.
            metrics = {
                "application": app,
                "executors": self._submit_request(self._endpoint, app["id"], "executors"),
                "jobs": self._submit_request(self._endpoint, app["id"], "jobs"),
            }
            results["metrics"].append(metrics)

        filename = output_dir / self.METRICS_FILE
        dump_data(results, filename, indent=2)
        logger.info("Recorded metrics in %s", filename)
Example #9
0
    def __init__(
        self,
        config_file,
        hpc_config=DEFAULTS["hpc_config_file"],
        output=DEFAULTS["output"],
    ):
        """Constructs JobSubmitter.

        Parameters
        ----------
        config_file : str
            Path to the job configuration file. It is copied into the
            output directory and the copy is used from then on.
        hpc_config : str | dict, optional
            Path to an HPC config file, or a dict of HPC configuration
            parameters (serialized to hpc_config.toml in the output
            directory).
        output : str
            Output directory

        """
        super(JobSubmitter, self).__init__(config_file, output)
        self._hpc = None
        # Copy the config into the output dir; the copy is the master file.
        master_file = os.path.join(output, CONFIG_FILE)
        shutil.copyfile(config_file, master_file)
        self._config_file = master_file
        logger.debug("Copied %s to %s", config_file, master_file)

        if isinstance(hpc_config, str):
            # Already a file path; use it directly.
            self._hpc_config_file = hpc_config
        else:
            # A dict of parameters; serialize it to a file first.
            assert isinstance(hpc_config, dict)
            self._hpc_config_file = os.path.join(self._output,
                                                 "hpc_config.toml")
            dump_data(hpc_config, self._hpc_config_file)
Example #10
0
File: cluster.py Project: jgu2/jade
    def serialize_submission_groups(self, directory):
        """Serialize the submission groups so that they can be read without
        acquiring a lock.

        Parameters
        ----------
        directory : Path

        """
        groups = [group.dict() for group in self._config.submission_groups]
        dump_data(groups, directory / self.SUBMITTER_GROUP_FILE,
                  cls=ExtendedJSONEncoder)
Example #11
0
    def _serialize_registry(self):
        """Write the registered extensions and loggers to the registry file."""
        data = {"extensions": [], "logging": list(self._loggers)}
        for _, extension in sorted(self._extensions.items()):
            # Drop entries keyed by ExtensionClassType; class objects
            # cannot be serialized.
            serializable = {
                key: value
                for key, value in extension.items()
                if not isinstance(key, ExtensionClassType)
            }
            data["extensions"].append(serializable)

        dump_data(data, self.registry_filename, indent=4)
        logger.debug("Serialized data to %s", self.registry_filename)
Example #12
0
    def _remove_demo_extension(self):
        """Remove the 'demo' extension from the registry file, if present."""
        registry_file = pathlib.Path.home() / self._REGISTRY_FILENAME
        if not registry_file.exists():
            return

        data = load_data(registry_file)
        extensions = data["extensions"]
        # Find the first (and only) entry named 'demo', if any.
        index = next(
            (i for i, ext in enumerate(extensions) if ext["name"] == "demo"),
            None,
        )
        if index is not None:
            extensions.pop(index)
            dump_data(data, registry_file, indent=2)
Example #13
0
File: config.py Project: jgu2/jade
def submitter_params(
    config_file=None,
    dry_run=None,
    per_node_batch_size=None,
    hpc_config=None,
    local=None,
    max_nodes=None,
    poll_interval=None,
    resource_monitor_interval=None,
    resource_monitor_type=None,
    num_processes=None,
    verbose=None,
    reports=None,
    enable_singularity=None,
    container=None,
    try_add_blocked_jobs=None,
    time_based_batching=None,
    node_setup_script=None,
    node_shutdown_script=None,
    no_distributed_submitter=None,
):
    """Create parameters for use in 'jade submit-jobs'."""
    params = make_submitter_params(
        per_node_batch_size=per_node_batch_size,
        dry_run=dry_run,
        hpc_config=hpc_config,
        local=local,
        max_nodes=max_nodes,
        poll_interval=poll_interval,
        resource_monitor_interval=resource_monitor_interval,
        resource_monitor_type=resource_monitor_type,
        num_processes=num_processes,
        verbose=verbose,
        reports=reports,
        enable_singularity=enable_singularity,
        container=container,
        try_add_blocked_jobs=try_add_blocked_jobs,
        time_based_batching=time_based_batching,
        node_setup_script=node_setup_script,
        node_shutdown_script=node_shutdown_script,
        no_distributed_submitter=no_distributed_submitter,
    )
    # Round-trip through JSON to convert enums to plain values.
    data = json.loads(params.json())
    # Only JSON output is pretty-printed; other formats use their defaults.
    kwargs = {"indent": 2} if config_file.suffix == ".json" else {}
    dump_data(data, config_file, **kwargs)
    print(f"Created submitter parameter file {config_file}")
Example #14
0
    def serialize_jobs(self, directory):
        """Serializes main job data to job-specific files.

        Parameters
        ----------
        directory : str

        """
        for job in self.iter_jobs():
            job_filename = os.path.join(directory, f"{job.name}.json")
            dump_data(job.serialize(), job_filename, cls=ExtendedJSONEncoder)

        # Remember where the per-job files live so a config that includes
        # only job names can be deserialized later.
        self._jobs_directory = directory
Example #15
0
    def _check_registry_config(self, filename):
        """Load the registry file, upgrading the old list format in place."""
        data = load_data(filename)
        if not isinstance(data, list):
            return data

        # Workaround to support the old registry format. 03/06/2020
        # It can be removed eventually.
        upgraded = {
            "extensions": data,
            "logging": DEFAULT_REGISTRY["logging"],
        }
        dump_data(upgraded, self.registry_filename, indent=4)
        print(
            "\nReformatted registry. Refer to `jade extensions --help` "
            "for instructions on adding logging for external packages.\n")
        return upgraded
Example #16
0
File: config.py Project: NREL/jade
def save_submission_groups(output_dir, config_file, force):
    """Copy the submission groups from a JADE output directory to config_file."""
    # Refuse to clobber an existing file unless --force was given.
    if config_file.exists() and not force:
        print(
            f"{config_file} exists. Use a different name or pass --force to overwrite.",
            file=sys.stderr,
        )
        sys.exit(1)

    source = output_dir / Cluster.SUBMITTER_GROUP_FILE
    if not source.exists():
        print(f"{output_dir} is not a valid JADE output directory", file=sys.stderr)
        sys.exit(1)

    dump_data(load_data(source), config_file, indent=2)
    print(f"Copied submission groups to {config_file}")
Example #17
0
def upgrade_config_file(data, filename):
    """Upgrades v0.1.0 format to the latest.

    Parameters
    ----------
    data : dict
        Deserialized config data in the old format; mutated in place.
    filename : str
        Config file to overwrite with the upgraded data.

    Raises
    ------
    Exception
        Raised if the config is not a GenericCommandConfiguration.

    """
    if data["class"] != "GenericCommandConfiguration":
        # Bug fix: the message was an f-string with no placeholder and never
        # said which file was at fault; interpolate the filename.
        raise Exception(
            f"{filename} has an old format and must be regenerated")

    data[
        "configuration_module"] = "jade.extensions.generic_command.generic_command_configuration"
    data["configuration_class"] = "GenericCommandConfiguration"
    data["format_version"] = JobConfiguration.FORMAT_VERSION
    # These keys are not part of the new schema.
    data.pop("class")
    data.pop("extension")
    for job in data["jobs"]:
        job["extension"] = "generic_command"
        job["append_output_dir"] = False
    dump_data(data, filename, indent=2)
    logger.info("Upgraded config file format: %s", filename)
Example #18
0
    def _make_async_submitter(self, jobs, num_processes, output, verbose):
        """Create an AsyncHpcSubmitter for one batch of jobs."""
        suffix = f"_batch_{self._batch_index}"
        self._batch_index += 1

        # Write a per-batch copy of the base config containing only these jobs.
        batch_config = copy.copy(self._base_config)
        batch_config["jobs"] = jobs
        new_config_file = self._config_file.replace(".json", f"{suffix}.json")
        dump_data(batch_config, new_config_file, cls=ExtendedJSONEncoder)
        logger.info("Created split config file %s with %s jobs",
                    new_config_file, len(batch_config["jobs"]))

        run_script = os.path.join(output, f"run{suffix}.sh")
        self._create_run_script(new_config_file, run_script, num_processes,
                                output, verbose)

        hpc_mgr = HpcManager(self._hpc_config_file, output)
        return AsyncHpcSubmitter(hpc_mgr, run_script, self._name + suffix, output)
Example #19
0
def create_hpc_manager(cluster, config):
    """Create an HpcManager from config and verify its type for cluster."""
    os.environ["NREL_CLUSTER"] = cluster
    mgr = None
    try:
        # Serialize the config to a temp file; remove it even on failure.
        hpc_file = "test-hpc-config.toml"
        dump_data(config, hpc_file)
        mgr = HpcManager(hpc_file, OUTPUT_DIR)
    finally:
        os.remove(hpc_file)

    expected_types = {"eagle": HpcType.SLURM, "peregrine": HpcType.PBS}
    assert cluster in expected_types, "unknown cluster={}".format(cluster)
    assert mgr.hpc_type == expected_types[cluster]
    return mgr
Example #20
0
File: config.py Project: NREL/jade
def add_submission_group(params_file, name, config_file):
    """Add a submission group with parameters defined in params_file to config_file."""
    config = load_data(config_file)
    # Reject duplicate group names.
    if any(name == existing["name"] for existing in config["submission_groups"]):
        print(f"Error: {name} is already stored in {config_file}", file=sys.stderr)
        sys.exit(1)

    group = {
        "name": name,
        "submitter_params": load_data(params_file),
    }
    # Make sure it parses.
    SubmissionGroup(**group)

    config["submission_groups"].append(group)
    dump_data(config, config_file, indent=2)
    print(f"Updated {config_file} with submission group {name}.")
Example #21
0
File: config.py Project: jgu2/jade
def hpc(account, config_file, mem, partition, qos, hpc_type, tmp, walltime):
    """Create an HPC config file."""
    # Build the type-specific config ('hpc_params' avoids shadowing this
    # function's own name).
    if hpc_type == "slurm":
        hpc_params = SlurmConfig(
            account=account,
            mem=mem,
            partition=partition,
            qos=qos,
            tmp=tmp,
            walltime=walltime,
        )
    elif hpc_type == "fake":
        hpc_params = FakeHpcConfig(walltime=walltime)
    else:
        assert hpc_type == "local"
        hpc_params = LocalHpcConfig()

    # This converts enums to values.
    config = HpcConfig(hpc_type=hpc_type, hpc=hpc_params)
    dump_data(json.loads(config.json()), config_file)
    print(f"Created HPC config file {config_file}")
Example #22
0
    def create(auto_config_cmds, config_file, submit_params=None):
        """Create a pipeline with multiple Jade configurations.

        Parameters
        ----------
        auto_config_cmds : list
            Commands that auto-generate each stage's config file.
        config_file : str
            File in which to store the pipeline config.
        submit_params : str, optional
            Space-separated 'jade submit-jobs' options, e.g. "-b=4 -n=2".

        """
        data = {"stages": []}

        # Parse user-specified submit options into {option: typed value}.
        user_submit_params = {}
        if submit_params:
            int_options = (
                "-b", "--per-node-batch-size",
                "-n", "--max-nodes",
                "-q", "--num-processes",
            )
            float_options = ("-p", "--poll-interval")
            for option in submit_params.split(" "):
                if "=" in option:
                    # Bug fix: split on the first '=' only, so values that
                    # themselves contain '=' no longer raise ValueError.
                    param, value = option.split("=", 1)
                else:
                    param, value = option, ""

                if param in int_options:
                    user_submit_params[param] = int(value)
                elif param in float_options:
                    user_submit_params[param] = float(value)
                else:
                    user_submit_params[param] = value

        for i, cmd in enumerate(auto_config_cmds):
            stage_num = i + 1
            # Start from defaults, then apply any user overrides.
            submit = {
                "--max-nodes": DEFAULTS["max_nodes"],
                "--per-node-batch-size": DEFAULTS["per_node_batch_size"],
                "--num-processes": None,
            }
            submit.update(user_submit_params)
            data["stages"].append({
                "auto_config_cmd": cmd,
                "config_file":
                    PipelineManager.get_stage_config_file_name(stage_num),
                "submit-params": submit,
            })

        dump_data(data, config_file)
        logger.info("Created pipeline config file %s", config_file)
Example #23
0
    def _check_registry_config(self, filename):
        """Load the registry file, upgrading old formats when detected."""
        data = load_data(filename)
        if isinstance(data, list):
            # Workaround to support the old registry format. 03/06/2020
            # It can be removed eventually.
            data = {
                "extensions": data,
                "logging": DEFAULT_REGISTRY["logging"],
            }
            dump_data(data, self.registry_filename, indent=4)
            print(
                "\nReformatted registry. Refer to `jade extensions --help` "
                "for instructions on adding logging for external packages.\n")

        # 'format_version' rather than 'format' to avoid shadowing the builtin.
        format_version = data.get("format_version", "v0.1.0")
        if format_version == "v0.1.0":
            # v0.1.0 registries are reset to defaults and reloaded from disk.
            self.reset_defaults()
            data = load_data(filename)
            print("\nWARNING: Reformatted registry. You will need to "
                  "re-register any external extensions.\n")
        return data
Example #24
0
    def _make_async_submitter(self, jobs, submission_group, dry_run=False):
        """Create an AsyncHpcSubmitter for one batch of jobs."""
        suffix = f"_batch_{self._batch_index}"
        self._batch_index += 1

        # Write a per-batch copy of the base config containing only these jobs.
        batch_config = copy.copy(self._base_config)
        batch_config["jobs"] = jobs
        new_config_file = self._config_file.replace(".json", f"{suffix}.json")
        dump_data(batch_config, new_config_file, cls=ExtendedJSONEncoder)
        logger.info("Created split config file %s with %s jobs",
                    new_config_file, len(batch_config["jobs"]))

        run_script = os.path.join(self._output, f"run{suffix}.sh")
        self._create_run_script(new_config_file, run_script, submission_group)

        name = submission_group.submitter_params.hpc_config.job_prefix + suffix
        return AsyncHpcSubmitter(
            self._hpc_mgr,
            self._status_collector,
            run_script,
            name,
            submission_group,
            self._output,
            dry_run=dry_run,
        )
    def run(self):
        """Runs the autoregression, and return status code"""
        try:
            # Run the analysis and record where its artifacts were written.
            result_file, plot_file = autoregression_analysis(
                country=self._job.country,
                data=self._job.data,
                output=self._job_dir)
            summary_data = {
                "name": self._job.name,
                "country": self._job.country,
                "output": self._output,
                "result": result_file,
                "plot": plot_file
            }
            summary_file = os.path.join(self._job_dir, "summary.toml")
            dump_data(summary_data, summary_file)
            # NOTE(review): deliberately raises for this country — appears to
            # be a hook for exercising the error-logging path below; confirm.
            if self._job.country == "australia":
                raise Exception("test")

        # Log event into file
        except Exception:
            # Create event instance
            event = StructuredErrorLogEvent(
                source=self._job.name,
                category=EVENT_CATEGORY_ERROR,
                name=EVENT_NAME_UNHANDLED_ERROR,
                message="Analysis failed!",
            )

            # Log event into file with structured message.
            log_event(event)

            # Must raise the exception here, or job returncode is 0 even it fails.
            raise

        return 0
Example #26
0
 def _dump_status(self):
     """Persist the status info, including the current stage id, to disk."""
     status = self._status_info
     status["current_stage_id"] = self._cur_stage_id
     dump_data(status, self._status_file)
Example #27
0
 def _save_events_summary(self):
     """Save events to one file per event name."""
     for name, events in self._events.items():
         serialized = [event.to_dict() for event in events]
         dump_data(serialized, self._make_event_filename(name))
Example #28
0
def _filter(config_file, output_file, indices, fields, show_config=False):
    """Filters jobs in CONFIG_FILE. Prints the new jobs to the console or
    optionally creates a new file.

    Note: This does not detect duplicate ranges.

    \b
    Examples:
    1. Select the first job. Output only.
       jade config filter c1.json 0
    2. Select indices 0-4, 10-14, 20, 25, create new file.
       jade config filter c1.json :5 10:15 20 25 -o c2.json
    3. Select the last 5 jobs. Note the use of '--' to prevent '-5' from being
       treated as an option.
       jade config filter c1.json -o c2.json -- -5:
    4. Select indices 5 through the end.
       jade config filter c1.json -o c2.json 5:
    5. Select jobs with parameters param1=green and param2=3.
       jade config filter c1.json -o c2.json -f param1 green -f param2 3

    """
    cfg = load_data(config_file)
    jobs = cfg["jobs"]
    if not jobs:
        print("The configuration has no jobs")
        sys.exit(1)

    # Without -o, write to a temp file, show it, and delete it at the end.
    if output_file is None:
        handle, new_config_file = tempfile.mkstemp(suffix=".json")
        os.close(handle)
        show_config = True
    else:
        new_config_file = output_file

    try:
        if not new_config_file.endswith(".json"):
            print("new_config_file must have extension .json")
            sys.exit(1)

        orig_len = len(jobs)
        new_jobs = []
        # Each index argument is either a bare integer or a slice-style
        # "start:end" range; either bound may be empty or negative.
        regex_int = re.compile(r"^(?P<index>\d+)$")
        regex_range = re.compile(r"^(?P<start>[\d-]*):(?P<end>[\d-]*)$")
        for index in indices:
            match = regex_int.search(index)
            if match:
                i = int(match.groupdict()["index"])
                new_jobs.append(jobs[i])
                continue
            match = regex_range.search(index)
            if match:
                # Empty bounds mean "from the beginning" / "to the end",
                # matching Python slice semantics.
                start = match.groupdict()["start"]
                if start == "":
                    start = None
                else:
                    start = int(start)
                end = match.groupdict()["end"]
                if end == "":
                    end = None
                else:
                    end = int(end)
                new_jobs += jobs[start:end]

        # Note: when looking at just the JSON, there is no way to get the job name,
        # and so we can't check for duplicates.

        # No index arguments means "keep all jobs" (fields may still filter).
        if not new_jobs:
            new_jobs = jobs

        # Keep only jobs whose every requested (field, value) pair matches;
        # values are compared as strings.
        if fields:
            final_jobs = []
            for job in new_jobs:
                matched = True
                for field in fields:
                    if str(job[field[0]]) != field[1]:
                        matched = False
                        break
                if matched:
                    final_jobs.append(job)

            new_jobs = final_jobs

        cfg["jobs"] = new_jobs
        new_len = len(cfg["jobs"])
        dump_data(cfg, new_config_file, indent=4)
        print(
            f"Filtered {config_file} ({orig_len} jobs) into ({new_len} jobs)\n"
        )
        if output_file is not None:
            print(f"Wrote new config to {output_file}")

        if show_config:
            _show(new_config_file, [])
    finally:
        # Clean up the temp file created when no -o option was given.
        if output_file is None:
            os.remove(new_config_file)
Example #29
0
def test_resubmit_with_blocking_jobs(basic_setup):
    """Verify resubmission of a missing job that others are blocked by."""
    num_commands = 7
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    for i, job_param in enumerate(jobs):
        if i == 3:
            job_param.blocked_by = set([5])
        elif i == 4:
            job_param.blocked_by = set([7])
        elif i == 6:
            job_param.blocked_by = set([6])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    # Doctor the aggregated results: drop job "7", which blocks job 5
    # (index 4), so it appears to have never run.
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    found = False
    for i, result in enumerate(results):
        if result.name == "7":
            results.pop(i)
            found = True
            break
    assert found
    agg._write_results(results)

    # Keep the results-summary file consistent: record job "7" as missing.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = result
            final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    # Resubmit; all jobs should now be successful.
    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands

    # The second batch must contain the missing job plus the jobs that
    # were (transitively) blocked by it.
    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch = load_data(second_batch_file)["jobs"]
    assert len(second_batch) == 3
Example #30
0
def config(
    collect_worker_logs,
    container_path,
    dynamic_allocation,
    gpu,
    hpc_config,
    master_node_memory_overhead_gb,
    node_memory_overhead_gb,
    run_user_script_inside_container,
    spark_dir,
    shuffle_partition_multiplier,
    update_config_file,
    use_tmpfs_for_scratch,
    alt_scratch,
    verbose,
    worker_memory_gb,
    force,
):
    """Create a Spark configuration to use for running a job on a Spark cluster.

    Populates ``spark_dir`` with Spark ``bin``/``conf`` files derived from the
    HPC configuration, then either writes the resulting Spark settings into
    every job of ``update_config_file`` or prints them as JSON for manual use.

    Exits the process with code 1 when ``spark_dir`` already exists (without
    ``force``), when ``use_tmpfs_for_scratch`` and ``alt_scratch`` are both
    set, when the HPC config has no ``nodes`` field, or when
    ``update_config_file`` does not exist.
    """
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("config_spark", None, console_level=level)
    spark_dir = Path(spark_dir)
    if spark_dir.exists():
        if force:
            shutil.rmtree(spark_dir)
        else:
            print(
                f"The directory '{spark_dir}' already exists. Use a different name or pass --force to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)
    spark_dir.mkdir(parents=True)

    # BUG FIX: the original checked the undefined name `alt`, which raised
    # NameError whenever this validation ran. The intended name is alt_scratch.
    if use_tmpfs_for_scratch and alt_scratch is not None:
        print("use_tmpfs_for_scratch and alt_scratch cannot both be set",
              file=sys.stderr)
        sys.exit(1)

    hpc_config_data = HpcConfig.load(hpc_config)
    nodes = getattr(hpc_config_data.hpc, "nodes", None)
    if nodes is None:
        print(
            f"hpc_type={hpc_config_data.hpc_type} doesn't have a nodes field",
            file=sys.stderr)
        sys.exit(1)
    mem = getattr(hpc_config_data.hpc, "mem", None)
    if mem is None:
        # No per-node memory requirement in the HPC config; fall back to a
        # conservative default per-executor allocation.
        executor_mem_gb = 11
        print(f"Use default per-executor memory of {executor_mem_gb}G")
    else:
        # Divide the node's memory (minus overhead) across a fixed number of
        # executors per node.
        num_executors = 7
        if not mem.endswith("G"):
            raise Exception(
                f"This feature only supports HPC memory requirements ending with 'G'"
            )
        per_node_mem_gb = int(mem[:-1])
        if use_tmpfs_for_scratch:
            # tmpfs scratch consumes RAM, so only half the node's memory is
            # available to Spark.
            per_node_mem_gb //= 2
        overhead = master_node_memory_overhead_gb - node_memory_overhead_gb
        executor_mem_gb = (per_node_mem_gb - overhead) // num_executors
        print(
            f"Use custom per-executor memory of {executor_mem_gb}G based on per-node {mem}"
        )

    # Copy the packaged Spark bin/conf templates into the new directory.
    for dirname in ("bin", "conf"):
        src_path = Path(os.path.dirname(__file__)).parent / "spark" / dirname
        dst_path = spark_dir / dirname
        if not dst_path.exists():
            dst_path.mkdir()
        for filename in src_path.iterdir():
            shutil.copyfile(filename, dst_path / filename.name)

    use_gpus = _should_use_gpus(hpc_config_data, gpu)

    with open(spark_dir / "conf" / "spark-defaults.conf", "a") as f_out:
        f_out.write("\n")
        f_out.write(f"spark.executor.memory {executor_mem_gb}G\n")
        # Online documentation says this value should correlate with the number of cores in the
        # cluster. Some sources say 1 per core, others say 2 or 4 per core. Depends on use case.
        # This should be a reasonable default for users, who can customize dynamically.
        params = ["spark.sql.shuffle.partitions"]
        # Some sources say that we should set spark.default.parallelism to the same value,
        # others say it doesn't work. Experiments showed harmful effects if dynamic allocation
        # was enabled with a custom value.
        for param in params:
            f_out.write(param)
            f_out.write(" ")
            f_out.write(str(nodes * 35 * shuffle_partition_multiplier))
            f_out.write("\n")

        if dynamic_allocation:
            f_out.write("\n")
            f_out.write(DYNAMIC_ALLOCATION_SETTINGS)
            f_out.write("\n")

        if use_gpus:
            src_path = (Path(os.path.dirname(__file__)).parent / "spark" /
                        "conf" / "resourcesFile.json")
            resources_file = spark_dir / "conf" / "resourcesFile.json"
            shutil.copyfile(src_path, resources_file)
            f_out.write(
                "spark.worker.resource.gpu.discoveryScript /opt/sparkRapidsPlugin/getGpusResources.sh\n"
            )
            f_out.write(f"spark.worker.resourcesFile {resources_file}\n")

    if use_gpus:
        filename = spark_dir / "conf" / "spark-env.sh"
        with open(filename, "a") as f_out:
            # Fall back to 2 GPUs when the HPC config doesn't report a count.
            num_gpus = hpc_config_data.get_num_gpus() or 2
            f_out.write(
                f'SPARK_WORKER_OPTS="-Dspark.worker.resource.gpu.amount={num_gpus} '
                f'-Dspark.worker.resource.gpu.discoveryScript={GPU_DISCOVERY_SCRIPT}"\n'
            )

    # Substitute the real paths into the wrapper scripts and make them
    # executable.
    replacement_values = [
        ("SPARK_DIR", str(spark_dir)),
        ("CONTAINER_PATH", container_path),
    ]
    for name in ("run_spark_script_wrapper.sh", "run_user_script_wrapper.sh"):
        filename = spark_dir / "bin" / name
        _replace_tag(replacement_values, filename)
        st = os.stat(filename)
        os.chmod(filename, st.st_mode | stat.S_IEXEC)
        # BUG FIX: the original f-string printed the literal text "(unknown)"
        # instead of interpolating the script path.
        print(f"Assigned paths in {filename}")

    scripts = [spark_dir / "conf" / "spark-env.sh"] + list(
        (spark_dir / "bin").glob("*.sh"))
    for script in scripts:
        st = os.stat(script)
        os.chmod(script, st.st_mode | stat.S_IEXEC)

    print(
        f"Created Spark configuration in {spark_dir.absolute()} for a {nodes}-node cluster. "
        f"GPUs={use_gpus}")

    spark_config = SparkConfigModel(
        collect_worker_logs=collect_worker_logs,
        conf_dir=str(spark_dir),
        container=SparkContainerModel(path=container_path),
        enabled=True,
        master_node_memory_overhead_gb=master_node_memory_overhead_gb,
        node_memory_overhead_gb=node_memory_overhead_gb,
        run_user_script_inside_container=run_user_script_inside_container,
        use_tmpfs_for_scratch=use_tmpfs_for_scratch,
        alt_scratch=alt_scratch,
        worker_memory_gb=worker_memory_gb,
    )

    if update_config_file is not None:
        if not Path(update_config_file).exists():
            print(f"'update_config_file={update_config_file} does not exist",
                  file=sys.stderr)
            sys.exit(1)
        # Apply this Spark configuration to every job in the config file.
        config = load_data(update_config_file)
        for job in config["jobs"]:
            job["spark_config"] = spark_config.dict()
        dump_data(config, update_config_file, indent=2)
        print(
            f"Updated jobs in {update_config_file} with this Spark configuration."
        )
    else:
        print(
            "\nAdd and customize this JSON object to the 'spark_config' field for each Spark "
            "job in your config.json file:\n")
        print(spark_config.json(indent=2))