Example #1
def cancel_jobs(output, verbose):
    """Cancels jobs."""
    filename = os.path.join(output, "cancel_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="a")
    logger.info(get_cli_string())

    for _ in range(60):
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if not promoted:
            logger.info("Did not get promoted. Sleep.")
            time.sleep(1)
            continue
        if cluster.is_complete():
            cluster.demote_from_submitter()
            logger.info("All jobs are already finished.")
            sys.exit(0)
        submitter = JobSubmitter.load(output)
        submitter.cancel_jobs(cluster)
        sys.exit(0)

    logger.error("Failed to get promoted to submitter.")
    sys.exit(1)
Example #2
def submit(config_file, output, force, verbose=False):
    """Submit the pipeline for execution."""
    if os.path.exists(output):
        if force:
            shutil.rmtree(output)
        else:
            print(
                f"{output} already exists. Delete it or use '--force' to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)
    os.makedirs(output, exist_ok=True)

    filename = os.path.join(output, "pipeline_submit.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    mgr = PipelineManager.create(config_file, output)
    try:
        mgr.submit_next_stage(1)
    except Exception:
        logger.exception("Pipeline execution failed")
        raise

    logging.shutdown()
    sys.exit(0)
Example #3
def auto_config(extension, inputs, job_post_process_config_file, config_file,
                verbose):
    """Automatically create a configuration."""
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("auto_config", None, console_level=level)

    if job_post_process_config_file is not None:
        module, class_name, data = JobPostProcess.load_config_from_file(
            job_post_process_config_file)
        JobPostProcess(module, class_name, data)  # ensure everything ok
        job_post_process_config = {
            "module": module,
            "class": class_name,
            "data": data
        }
    else:
        job_post_process_config = None

    # User extension
    registry = Registry()
    if not registry.is_registered(extension):
        raise InvalidExtension(f"Extension '{extension}' is not registered.")

    cli = registry.get_extension_class(extension, ExtensionClassType.CLI)
    config = cli.auto_config(*inputs,
                             job_post_process_config=job_post_process_config)
    print(f"Created configuration with {config.get_num_jobs()} jobs.")
    config.dump(config_file)
    print(f"Dumped configuration to {config_file}.\n")
Example #4
def show_events(output,
                names,
                categories=False,
                json_fmt=False,
                names_only=False,
                categories_only=False,
                verbose=False):
    """Shows the events after jobs run.

    \b
    Examples:
    jade show-events
    jade show-events unhandled_error
    jade show-events error -c
    jade show-events --names-only
    jade show-events --categories-only
    """
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("show_results", None, console_level=level)
    results = EventsSummary(output)
    if names_only:
        results.show_event_names()
    elif categories_only:
        results.show_event_categories()
    elif json_fmt:
        print(results.to_json())
    else:
        if not names:
            names = results.list_unique_names()
        for name in names:
            if categories:
                results.show_events_in_category(name)
            else:
                results.show_events(name)
Example #5
def create(
    filename,
    append_job_name,
    append_output_dir,
    config_file,
    cancel_on_blocking_job_failure,
    minutes_per_job,
    shuffle,
    strip_whitespace,
    verbose,
):
    """Create a config file from a filename with a list of executable commands."""
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("auto_config", None, console_level=level)

    config = GenericCommandConfiguration.auto_config(
        filename,
        cancel_on_blocking_job_failure=cancel_on_blocking_job_failure,
        minutes_per_job=minutes_per_job,
        append_job_name=append_job_name,
        append_output_dir=append_output_dir,
    )
    if shuffle:
        config.shuffle_jobs()
    print(f"Created configuration with {config.get_num_jobs()} jobs.")
    indent = None if strip_whitespace else 2
    config.dump(config_file, indent=indent)
    print(f"Dumped configuration to {config_file}.\n")
Example #6
def manager_node(output_dir, job_id, verbose):
    """Print the name of the manager node to the console. Requires a single job in the batch."""
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr)
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    groups = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output_dir)
    if job_id not in cluster.job_status.hpc_job_ids:
        print(f"job_id={job_id} is not active", file=sys.stderr)
        sys.exit(1)

    node = hpc_mgr.list_active_nodes(job_id)[0]
    print(node)
Example #7
def _run_manager(job_name, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir,
                            f"run_multi_node_job_manager__{job_name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run manager on %s: %s", socket.gethostname(),
                get_cli_string())

    # Note that the manager receives its own hostname.
    output = {}
    check_run_command(f"jade cluster hostnames {output_dir}", output)
    hostnames = [x for x in output["stdout"].split() if x != ""]
    logger.info("Manager found %s hostnames: %s", len(hostnames), hostnames)
    cmd = " ".join(manager_script_and_args)
    logger.info("Run manager script [%s]", cmd)

    os.environ["JADE_OUTPUT_DIR"] = output_dir
    os.environ["JADE_COMPUTE_NODE_NAMES"] = " ".join(hostnames)
    start = time.time()
    ret = run_command(cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)
    return ret
Example #8
def hostnames(output_dir, job_id, verbose):
    """Show the hostnames of active nodes participating in the batch."""
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr)
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    groups = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output_dir)
    nodes = []
    for _job_id in cluster.job_status.hpc_job_ids:
        if job_id is not None and _job_id != job_id:
            continue
        nodes += hpc_mgr.list_active_nodes(_job_id)

    if not nodes:
        print("No nodes were detected.", file=sys.stderr)
        sys.exit(1)

    print(" ".join(nodes))
Example #9
def create(filename, config_file, verbose):
    """Create a config file from a filename with a list of executable commands."""
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("auto_config", None, console_level=level)

    config = GenericCommandConfiguration.auto_config(filename)
    print(f"Created configuration with {config.get_num_jobs()} jobs.")
    config.dump(config_file)
    print(f"Dumped configuration to {config_file}.\n")
Example #10
def test_setup_logging(mock_get_logger, mock_dict_config):
    """Should called dictConfig and getLogger methods"""
    # Call
    name = "this_is_a_logger_name"
    filename = "this_is_a_file_name"
    setup_logging(name, filename)

    # Assertions
    mock_dict_config.assert_called_once()
    mock_get_logger.assert_called_with(name)
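A note on the test above: it presumes that dictConfig and getLogger are patched before the call. A minimal sketch of that wiring, assuming setup_logging calls logging.config.dictConfig and logging.getLogger directly (the patch targets are an assumption, not confirmed by this snippet):

from unittest import mock

# Stacked patch decorators apply bottom-up: the patch closest to the function
# (getLogger) supplies the first mock argument, dictConfig supplies the second.
@mock.patch("logging.config.dictConfig")
@mock.patch("logging.getLogger")
def test_setup_logging(mock_get_logger, mock_dict_config):
    ...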
Example #11
def show_times(output_dirs, verbose):
    """Show the run times of all allocated jobs."""
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)

    job_ids = []
    for output in output_dirs:
        path = Path(output)
        try:
            cluster, _ = Cluster.deserialize(path, deserialize_jobs=False)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr)
            sys.exit(1)

        job_ids += [
            x.name.split("_")[2].replace(".e", "") for x in path.glob("*.e")
        ]

    job_ids.sort(key=lambda x: int(x))
    groups = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output)

    total_duration = timedelta(seconds=0)
    table = PrettyTable()
    table.field_names = HpcJobStats._fields

    total_aus = 0
    if os.environ.get("NREL_CLUSTER") == "eagle":
        au_parser = get_nrel_eagle_aus
    else:
        au_parser = None

    for job_id in job_ids:
        stats = hpc_mgr.get_job_stats(job_id)
        if stats is None:
            continue
        duration = stats.end - stats.start
        if stats.state == HpcJobStatus.COMPLETE and isinstance(
                stats.end, datetime):
            total_duration += duration
        data = stats._asdict()
        data["state"] = data["state"].value
        if au_parser is not None:
            total_aus += au_parser(duration, stats.qos)
        table.add_row(data.values())

    print(table)
    print(f"\nTotal duration = {total_duration}")
    print("Total hours = {:.2f}".format(total_duration.total_seconds() / 3600))
    if au_parser is not None:
        print("Total AUs = {:.2f}".format(total_aus))
Example #12
def show_results(failed, output, successful, post_process,
                 job_name, verbose):
    """Shows the results of a batch of jobs."""
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("show_results", None, console_level=level)

    if post_process:
        JobPostProcess.show_results(output, job_name)
        sys.exit(0)

    results = ResultsSummary(output)
    results.show_results(only_failed=failed, only_successful=successful)
Example #13
def submit_jobs(config_file, per_node_batch_size, hpc_config, local, max_nodes,
                output, poll_interval, num_processes, rotate_logs, verbose,
                restart_failed, restart_missing, reports,
                try_add_blocked_jobs):
    """Submits jobs for execution, locally or on HPC."""
    os.makedirs(output, exist_ok=True)

    previous_results = []

    if restart_failed:
        failed_job_config = create_config_from_previous_run(
            config_file, output, result_type='failed')
        previous_results = ResultsSummary(output).get_successful_results()
        config_file = "failed_job_inputs.json"
        failed_job_config.dump(config_file)

    if restart_missing:
        missing_job_config = create_config_from_previous_run(
            config_file, output, result_type='missing')
        config_file = "missing_job_inputs.json"
        missing_job_config.dump(config_file)
        previous_results = ResultsSummary(output).list_results()

    if rotate_logs:
        rotate_filenames(output, ".log")

    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    event_file = os.path.join(output, "submit_jobs_events.log")
    # This effectively means no console logging.
    setup_logging("event",
                  event_file,
                  console_level=logging.ERROR,
                  file_level=logging.INFO)

    mgr = JobSubmitter(config_file, hpc_config=hpc_config, output=output)
    ret = mgr.submit_jobs(
        per_node_batch_size=per_node_batch_size,
        max_nodes=max_nodes,
        force_local=local,
        verbose=verbose,
        num_processes=num_processes,
        poll_interval=poll_interval,
        previous_results=previous_results,
        reports=reports,
        try_add_blocked_jobs=try_add_blocked_jobs,
    )

    sys.exit(ret.value)
Example #14
def run_worker(job_name, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    hostname = socket.gethostname()
    filename = os.path.join(output_dir, f"run_multi_node_job_worker__{job_name}__{hostname}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run worker: %s", get_cli_string())

    shutdown_file = _get_shutdown_file(job_name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    return 0
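The worker loop above polls for a shutdown file produced elsewhere by the manager. A hypothetical sketch of the helper it relies on (the helper name appears in the source, but this path layout is an assumption for illustration):

from pathlib import Path

def _get_shutdown_file(job_name, output_dir):
    # Hypothetical layout: a sentinel file whose existence tells the worker to stop polling.
    return Path(output_dir) / f"shutdown__{job_name}"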
Example #15
def resubmit_jobs(output, failed, missing, verbose):
    """Resubmit failed and missing jobs."""
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    if not cluster.is_complete():
        cluster.demote_from_submitter()
        print(
            "resubmit-jobs requires that the existing submission be complete",
            file=sys.stderr)
        sys.exit(1)
    assert promoted

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(
        jobs_to_resubmit, output)
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit,
                                     updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        cluster.demote_from_submitter()

    sys.exit(ret)
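setup_event_logging is not shown in these examples. Judging from the explicit "event" logger setup in Example #13, it is plausibly a thin wrapper such as the following sketch (signature and defaults are inferred from the call sites, not confirmed):

import logging

def setup_event_logging(event_file, mode="w"):
    # Inferred sketch: file-only structured event logging; the ERROR console level
    # effectively disables console output, as noted in Example #13.
    return setup_logging("event", event_file,
                         console_level=logging.ERROR,
                         file_level=logging.INFO,
                         mode=mode)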
Example #16
def run_worker(job, manager_node, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    logger.error("in worker manager_node=%s job=%s", manager_node, job.name)
    hostname = socket.gethostname()
    filename = os.path.join(
        output_dir, f"run_spark_job_worker__{hostname}__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run worker: %s", get_cli_string())

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())

    # Give the master a head start.
    time.sleep(10)
    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    logs_dir = job_output / "spark" / "logs"
    job_conf_dir = job_output / "spark" / "conf"
    workers_dir = job_output / "spark" / "workers"
    _set_env_variables(job, job_conf_dir, logs_dir)
    worker_memory = _get_worker_memory_str(job, is_master=False)
    cmd = _get_worker_command(job, manager_node, worker_memory)
    ret = 1
    output = {}
    for _ in range(5):
        output.clear()
        logger.info("Run spark worker: [%s]", cmd)
        ret = run_command(cmd, output=output)
        if ret == 0:
            break
    if ret != 0:
        logger.error("Failed to start spark worker: %s: %s", ret, output)

    shutdown_file = _get_shutdown_file(job.name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    check_run_command(job.model.spark_config.get_stop_worker())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]),
                        workers_dir / hostname)
    return 0
Example #17
def prune_files(output):
    """Deletes intermediate scripts, config files, and log files that are not needed if the job
    results were successful."""
    setup_logging(__name__, None, console_level=logging.INFO)
    base_path = Path(output)

    count = 0
    for path in itertools.chain(
        # Keep submit_jobs.log* files because there aren't many of them and they are useful.
        base_path.glob("submit_jobs_events*.log*"),
        base_path.glob("run_jobs_batch*.log*"),
        base_path.glob("config_batch*.json"),
        base_path.glob("*.sh"),
    ):
        path.unlink()
        count += 1
    print(f"Deleted {count} files from {output}.")
Example #18
def run_jobs(config_file, output, num_processes, verbose):
    """Starts jobs on HPC."""
    match = re.search(r"batch_(\d+)\.json", config_file)
    assert match
    batch_id = match.group(1)
    os.makedirs(output, exist_ok=True)
    filename = os.path.join(output, f"run_jobs_batch_{batch_id}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=logging.ERROR)
    logger.info(get_cli_string())

    mgr = JobRunner(config_file, output=output, batch_id=batch_id)
    ret = mgr.run_jobs(verbose=verbose, num_processes=num_processes)
    sys.exit(ret.value)
Example #19
def test_resource_stats():
    with tempfile.TemporaryDirectory() as tmpdir:
        event_file = os.path.join(tmpdir, "events.log")
        setup_logging("event",
                      event_file,
                      console_level=logging.ERROR,
                      file_level=logging.INFO)

        resource_monitor = ResourceMonitor("test")
        count = 2
        found_cpu = 0
        found_disk = 0
        found_mem = 0
        found_net = 0
        for _ in range(count):
            resource_monitor.log_resource_stats()

        summary = EventsSummary(tmpdir)
        assert len(summary.list_events(EVENT_NAME_CPU_STATS)) == count
        assert len(summary.list_events(EVENT_NAME_DISK_STATS)) == count
        assert len(summary.list_events(EVENT_NAME_MEMORY_STATS)) == count
        assert len(summary.list_events(EVENT_NAME_NETWORK_STATS)) == count

        viewers = [
            CpuStatsViewer(summary),
            DiskStatsViewer(summary),
            MemoryStatsViewer(summary),
            NetworkStatsViewer(summary),
        ]
        for viewer in viewers:
            df = viewer.get_dataframe("test")
            assert len(df) == 2
            if isinstance(viewer, MemoryStatsViewer):
                averages = viewer._calc_batch_averages("test")
                for field, val in averages.items():
                    assert val == df[field].mean()

        output = {}
        cmd = f"jade stats show -o {tmpdir} cpu disk mem net"
        ret = run_command(cmd, output=output)
        assert ret == 0
        for term in ("IOPS", "read_bytes", "bytes_recv", "idle"):
            assert term in output["stdout"]
Example #20
def submit_next_stage(output, stage_num, return_code, verbose=False):
    """Internal command to submit the next stage of the pipeline for execution."""
    filename = os.path.join(output, "pipeline_submit.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="a")
    logger.info(get_cli_string())

    mgr = PipelineManager.load(output)
    try:
        mgr.submit_next_stage(stage_num, return_code=return_code)
    except Exception:
        logger.exception("Pipeline execution failed")
        raise

    logging.shutdown()
    sys.exit(0)
Example #21
def am_i_manager(output_dir, verbose):
    """Print 'true' or 'false' depending on whether the current node is the manager node."""
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr)
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    groups = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output_dir)
    am_manager = hpc_mgr.am_i_manager()
    print(str(am_manager).lower(), end="")
Example #22
def list_active_ids(output_dirs, verbose):
    """List the HPC job IDs that are pending or running."""
    # TODO: add flag for only pending or only running
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)

    job_ids = []
    for output in output_dirs:
        path = Path(output)
        try:
            cluster, _ = Cluster.deserialize(path, deserialize_jobs=True)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr)
            sys.exit(1)

        if not cluster.is_complete():
            job_ids += list(cluster.iter_hpc_job_ids())

    job_ids.sort(key=lambda x: int(x))
    print(" ".join(job_ids))
Example #23
def run_jobs(config_file, distributed_submitter, output, num_processes,
             verbose):
    """Starts jobs on HPC."""
    match = re.search(r"batch_(\d+)\.json", config_file)
    assert match
    batch_id = match.group(1)
    os.makedirs(output, exist_ok=True)

    mgr = JobRunner(config_file, output=output, batch_id=batch_id)

    # Logging has to be enabled after the JobRunner is created because the node ID
    # is what makes the file unique.
    filename = os.path.join(output,
                            f"run_jobs_batch_{batch_id}_{mgr.node_id}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_event_logging(mgr.event_filename)
    logger = setup_logging(__name__,
                           filename,
                           file_level=level,
                           console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    group = mgr.config.get_default_submission_group()
    if group.submitter_params.node_setup_script:
        cmd = f"{group.submitter_params.node_setup_script} {config_file} {output}"
        ret = run_command(cmd)
        if ret != 0:
            logger.error("Failed to run node setup script %s: %s", cmd, ret)
            sys.exit(ret)

    status = mgr.run_jobs(distributed_submitter=distributed_submitter,
                          verbose=verbose,
                          num_processes=num_processes)
    ret = status.value

    if group.submitter_params.node_shutdown_script:
        cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {output}"
        ret2 = run_command(cmd)
        if ret2 != 0:
            logger.error("Failed to run node shutdown script %s: %s", cmd,
                         ret2)

    if status == Status.GOOD and distributed_submitter:
        start = time.time()
        _try_submit_jobs(output, verbose=verbose)
        logger.info("try-submit-jobs took %s seconds", time.time() - start)

    sys.exit(ret)
Example #24
def show_results(failed, output, successful, post_process, job_name, verbose):
    """Shows the results of a batch of jobs."""
    if not Path(output).exists():
        print(f"{output} does not exist", file=sys.stderr)
        sys.exit(1)

    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("show_results", None, console_level=level)

    if post_process:
        JobPostProcess.show_results(output, job_name)
        sys.exit(0)

    try:
        results = ResultsSummary(output)
    except InvalidConfiguration:
        print(
            f"No results are available in {output}. To check status of in-progress jobs run "
            f"'jade show-status -o {output}'",
            file=sys.stderr,
        )
        sys.exit(1)

    results.show_results(only_failed=failed, only_successful=successful)
Example #25
def submit(config_file, output, verbose=False):
    """Submit the pipeline for execution."""
    global logger
    os.makedirs(output, exist_ok=True)
    filename = os.path.join(output, "pipeline_submit.log")
    level = logging.DEBUG if verbose else logging.INFO
    logger = setup_logging(__name__, filename, file_level=level,
                           console_level=level)

    logger.info(get_cli_string())

    mgr = PipelineManager(config_file, output)
    try:
        mgr.submit(verbose=verbose)
    except Exception:
        logger.exception("Pipeline execution failed")
        raise

    sys.exit(0)
Example #26
    def __init__(
        self,
        config_file,
        output=OUTPUT_DIR,
        batch_id=0,
    ):
        super(JobRunner, self).__init__(config_file, output)

        self._intf, self._intf_type = self._create_node_interface()
        self._batch_id = batch_id
        self._event_file = os.path.join(
            output,
            f"run_jobs_batch_{batch_id}_events.log",
        )
        self._event_logger = setup_logging("event",
                                           self._event_file,
                                           console_level=logging.ERROR,
                                           file_level=logging.INFO)

        logger.debug("Constructed JobRunner output=%s batch=%s", output,
                     batch_id)
Example #27
def _run_cluster_master(job, manager_node, output_dir, verbose,
                        manager_script_and_args):
    filename = os.path.join(output_dir, f"run_spark_cluster__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run cluster master on %s job=%s: %s", socket.gethostname(),
                job.name, get_cli_string())

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    if job_output.exists():
        shutil.rmtree(job_output)
    job_output.mkdir(parents=True)
    events_dir = job_output / "spark" / "events"
    events_dir.mkdir(parents=True)
    logs_dir = job_output / "spark" / "logs"
    logs_dir.mkdir()
    workers_dir = job_output / "spark" / "workers"
    workers_dir.mkdir()

    # Make a job-specific conf directory because the log and event files need to be per-job.
    job_conf_dir = job_output / "spark" / "conf"
    shutil.copytree(
        Path(job.model.spark_config.conf_dir) / "conf", job_conf_dir)
    _fix_spark_conf_file(job_conf_dir, events_dir)
    _set_env_variables(job, job_conf_dir, logs_dir)

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())
    run_command(job.model.spark_config.get_stop_history_server())
    run_command(job.model.spark_config.get_stop_master())

    # It would be better to start all workers from the master. Doing so would require that
    # Spark processes on the master node be able to ssh into the worker nodes.
    # I haven't spent the time to figure out how to do that inside Singularity containers.
    master_cmd = job.model.spark_config.get_start_master()
    logger.info("Run spark master: [%s]", master_cmd)
    check_run_command(master_cmd)
    history_cmd = job.model.spark_config.get_start_history_server()
    logger.info("Run spark history server: [%s]", history_cmd)
    check_run_command(history_cmd)
    worker_memory = _get_worker_memory_str(job, is_master=True)

    worker_cmd = _get_worker_command(job, manager_node, memory=worker_memory)
    logger.info("Run spark worker: [%s]", worker_cmd)
    check_run_command(worker_cmd)

    # Wait for workers.
    # TODO: find a way to check programmatically with the REST API or by parsing the logs.
    time.sleep(15)
    args = list(manager_script_and_args) + [
        _get_cluster(manager_node),
        str(job_output)
    ]
    if job.model.spark_config.run_user_script_inside_container:
        user_cmd = str(job.model.spark_config.get_run_user_script())
        user_cmd += " " + " ".join(args)
    else:
        user_cmd = " ".join(args)
    logger.info("Run user script [%s]", user_cmd)

    start = time.time()
    ret = run_command(user_cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)

    # Delay to ensure the history is saved.
    time.sleep(10)
    metrics = SparkMetrics("localhost", history=True)
    try:
        metrics.generate_metrics(job_output / "spark_metrics")
    except Exception:
        logger.exception("Failed to generate metrics")

    check_run_command(job.model.spark_config.get_stop_worker())
    check_run_command(job.model.spark_config.get_stop_history_server())
    check_run_command(job.model.spark_config.get_stop_master())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]),
                        workers_dir / socket.gethostname())
    return ret
Example #28
def pipeline():
    """Manage JADE execution pipeline."""
    setup_logging("pipeline", None)
Example #29
def extensions():
    """Manage JADE extensions."""
    setup_logging("extensions", None)
Example #30
def config():
    """Manage a JADE configuration."""
    setup_logging("config", None)