def validate_hosts(ctx, param, host=None):
    """Callback to validate the hostname received as input.

    If we were not given a hostname, we first try to guess it via
    `utils.guess_host`. If this fails, we give up and throw an error.

    Otherwise we compare the provided/guessed host with the list of available
    templates. If the hostname matches the template name, we continue by
    returning the hostname.
    """
    if host is None:
        host = utils.guess_host()
        if host is None:
            raise click.BadParameter(
                "Could not guess host. Please provide a value explicitly.",
                param_hint='"--host"',
            )

    known_hosts = utils.get_possible_hosts()
    if host not in known_hosts:
        console.info("Could not find template for host '{}'.", host)
        utils.print_possible_hosts()
        # TODO: Raise some appropriate error here
        ctx.exit()
        return

    return host
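A minimal sketch of how this callback could be wired up; the command and option names below are hypothetical, only the (ctx, param, value) callback signature is fixed by click:

import click

# Hypothetical command using the callback above. click calls the
# callback with (ctx, param, value) and stores its return value,
# so `host` receives the validated or guessed hostname.
@click.command()
@click.option("--host", default=None, callback=validate_hosts)
def generate(host):
    click.echo("Using host template: {}".format(host))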
Example #2
def migrate_to_datreant(directory):
    """Perform the `mdsynthesis` to `datreant` migration."""
    bundles = search_mdsynthesis_sim_files(directory)
    if bundles:
        console.info("Converting old benchmark metadata to new format!")
        convert_to_datreant(bundles)
        console.info("Finished converting old benchmarks to new format!")

    return
Example #3
def do_submit(directory, force_restart, yes):
    """Submit the benchmarks."""
    # Migrate from MDBenchmark<2 to MDBenchmark>=2
    mds_to_dtr.migrate_to_datreant(directory)

    bundle = dtr.discover(directory)

    # Exit if no bundles were found in the current directory.
    if not bundle:
        console.error("No benchmarks found.")

    grouped_bundles = bundle.categories.groupby("started")
    try:
        bundles_not_yet_started = grouped_bundles[False]
    except KeyError:
        bundles_not_yet_started = None
    if not bundles_not_yet_started and not force_restart:
        console.error(
            "All generated benchmarks were already started once. "
            "You can force a restart with {}.",
            "--force",
        )

    # Start all benchmark simulations if a restart was requested. Otherwise
    # only start the ones that were not run yet.
    bundles_to_start = bundle
    if not force_restart:
        bundles_to_start = bundles_not_yet_started

    df = DataFrameFromBundle(bundles_to_start)

    # Reformat NaN values nicely into question marks.
    df_to_print = df.replace(np.nan, "?")
    df_to_print = df_to_print.drop(columns=["ns/day", "ncores"])
    console.info("{}", "Benchmark Summary:")
    df_short = ConsolidateDataFrame(df_to_print)
    PrintDataFrame(df_short)

    # Ask the user to confirm whether they want to submit the benchmarks
    if yes:
        console.info("The above benchmarks will be submitted.")
    elif not click.confirm("The above benchmarks will be submitted. Continue?"):
        console.error("Exiting. No benchmarks submitted.")

    batch_cmd = get_batch_command()
    console.info("Submitting a total of {} benchmarks.", len(bundles_to_start))
    for sim in bundles_to_start:
        # Remove files generated by previous mdbenchmark run
        if force_restart:
            engine = detect_md_engine(sim.categories["module"])
            cleanup_before_restart(engine=engine, sim=sim)
        sim.categories["started"] = True
        os.chdir(sim.abspath)
        subprocess.call([batch_cmd, "bench.job"])
    console.info(
        "Submitted all benchmarks. Run {} once they are finished to get the results.",
        "mdbenchmark analyze",
    )
Example #4
def _guess_version(self, categories):
    console.info("Setting up...")
    try:
        if "module" in categories and "version" in categories:
            # Versions >=3 have both a "module" and "version" key
            self.version = "3"
        elif "module" in categories:
            # Version 2 uses "module", but has no "version" key
            self.version = "2"
        else:
            # We found a version that is not enumerated above
            self.version = "next"
    except TypeError:
        # If we point datreant to an empty or non-existent directory, it
        # will throw an error. Catch it and set some default version.
        self.version = "3"
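The three branches can be exercised with plain dictionaries; a small standalone sketch (the categories object normally comes from datreant, here it is an ordinary dict):

# Mirror of the branching rules above, applied to plain dicts.
def guess_version(categories):
    try:
        if "module" in categories and "version" in categories:
            return "3"  # versions >=3 carry both keys
        elif "module" in categories:
            return "2"  # version 2 has "module" only
        return "next"   # unknown or future layout
    except TypeError:
        return "3"      # empty/non-existent directory: use the default

assert guess_version({"module": "gromacs", "version": "2020"}) == "3"
assert guess_version({"module": "gromacs"}) == "2"
assert guess_version({"host": "draco"}) == "next"
assert guess_version(None) == "3"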
Example #5
def do_plot(
    csv,
    output_name,
    output_format,
    template,
    module,
    gpu,
    cpu,
    plot_cores,
    fit,
    font_size,
    dpi,
    xtick_step,
    watermark,
):
    """Creates plots of benchmarks."""
    if not csv:
        raise click.BadParameter(
            "You must specify at least one CSV file.", param_hint='"--csv"'
        )

    df = pd.concat([pd.read_csv(c) for c in csv])
    performance_column = "performance" if "performance" in df.columns else "ns/day"

    df = filter_dataframe_for_plotting(df, template, module, gpu, cpu)

    # Exit if there is no performance data
    if df[performance_column].isnull().all():
        console.error("There is no performance data to plot.")

    rcParams["font.size"] = font_size
    fig = Figure()
    FigureCanvas(fig)
    ax = fig.add_subplot(111)
    ax = plot_over_group(
        df=df,
        plot_cores=plot_cores,
        fit=fit,
        performance_column=performance_column,
        ax=ax,
    )

    # Update xticks
    selection = "ncores" if plot_cores else "nodes"
    min_x = df[selection].min() if plot_cores else 1
    max_x = df[selection].max()
    xticks_steps = min_x
    xticks = np.arange(min_x, max_x + min_x, xticks_steps)
    step = get_xsteps(xticks.size, min_x, plot_cores, xtick_step)

    ax.set_xticks(xticks[::step])
    xdiff = min_x * 0.5 * step
    ax.set_xlim(min_x - xdiff, max_x + xdiff)

    # Update yticks
    max_y = df[performance_column].max() or 50
    yticks_steps = int(((max_y + 1) // 10))
    if yticks_steps == 0:
        yticks_steps = 1
    yticks = np.arange(0, max_y + (max_y * 0.25), yticks_steps)
    ax.set_yticks(yticks)
    ax.set_ylim(0, max_y + (max_y * 0.25))

    # Add watermark
    if watermark:
        ax.text(0.025, 0.925, "MDBenchmark", transform=ax.transAxes, alpha=0.3)

    legend = ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.175))
    fig.tight_layout()

    if output_name is None and len(csv) == 1:
        csv_string = csv[0].split(".")[0]
        output_name = "{}.{}".format(csv_string, output_format)
    elif output_name is None and len(csv) != 1:
        output_name = generate_output_name(output_format)
    elif not output_name.endswith(".{}".format(output_format)):
        output_name = "{}.{}".format(output_name, output_format)

    fig.savefig(
        output_name,
        format=output_format,
        bbox_extra_artists=(legend,),
        bbox_inches="tight",
        dpi=dpi,
    )
    console.info("The plot was saved as '{}'.", output_name)
Example #6
def filter_dataframe_for_plotting(df, host_name, module_name, gpu, cpu):
    if gpu and cpu:
        console.info("Plotting GPU and CPU data.")
    elif gpu and not cpu:
        df = df[df.gpu]
        console.info("Plotting GPU data only.")
    elif cpu and not gpu:
        df = df[~df.gpu]
        console.info("Plotting CPU data only.")
    elif not cpu and not gpu:
        console.error("CPU and GPU not set. Nothing to plot. Exiting.")

    if df.empty:
        console.error("Your filtering led to an empty dataset. Exiting.")

    df_filtered_hosts = df[df["host"].isin(host_name)]
    df_unique_hosts = np.unique(df_filtered_hosts["host"])

    if df_unique_hosts.size != len(host_name):
        console.error(
            "Could not find all provided hosts. Available hosts are: {}".format(
                ", ".join(np.unique(df["host"]))
            )
        )

    if not host_name:
        console.info("Plotting all hosts in input file.")
    else:
        df = df_filtered_hosts
        console.info(
            "Data for the following hosts will be plotted: {}".format(
                ", ".join(df_unique_hosts)
            )
        )

    for module in module_name:
        if module in SUPPORTED_ENGINES:
            console.info("Plotting all modules for engine '{}'.", module)
        elif module in df["module"].tolist():
            console.info("Plotting module '{}'.", module)
        else:
            console.error(
                "The module '{}' does not exist in your data. Exiting.", module
            )

    if not module_name:
        console.info("Plotting all modules in your input data.")

    if module_name:
        df = df[df["module"].str.contains("|".join(module_name))]

    if df.empty:
        console.error(
            "Your selections contained no benchmarking information. "
            "Are you sure all your selections are correct?"
        )

    return df
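The GPU/CPU and host filters above are plain boolean masks; a self-contained sketch with a toy DataFrame covering just the columns the filter touches:

import pandas as pd

df = pd.DataFrame({
    "host": ["draco", "draco", "cobra"],
    "module": ["gromacs/2020", "namd/2.13", "gromacs/2020"],
    "gpu": [True, False, False],
})

gpu_only = df[df.gpu]                     # rows where gpu is True
cpu_only = df[~df.gpu]                    # rows where gpu is False
on_host = df[df["host"].isin(["draco"])]  # host filter
print(len(gpu_only), len(cpu_only), len(on_host))  # 1 2 2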
Example #7
def do_plot(
    csv,
    output_name,
    output_format,
    template,
    module,
    gpu,
    cpu,
    plot_cores,
    fit,
    font_size,
    dpi,
    xtick_step,
    watermark,
):
    """Creates plots of benchmarks."""
    if not csv:
        raise click.BadParameter("You must specify at least one CSV file.",
                                 param_hint='"--csv"')

    df = pd.concat([pd.read_csv(c, index_col=0) for c in csv]).dropna()

    df = filter_dataframe_for_plotting(df, template, module, gpu, cpu)

    mpl_rcParams["font.size"] = font_size
    fig = Figure()
    FigureCanvas(fig)
    ax = fig.add_subplot(111)
    ax = plot_over_group(df=df, plot_cores=plot_cores, fit=fit, ax=ax)

    # Update xticks
    selection = "ncores" if plot_cores else "nodes"
    min_x = df[selection].min() if plot_cores else 1
    max_x = df[selection].max()
    xticks_steps = min_x
    xticks = np.arange(min_x, max_x + min_x, xticks_steps)
    step = get_xsteps(xticks.size, min_x, plot_cores, xtick_step)

    ax.set_xticks(xticks[::step])
    xdiff = min_x * 0.5 * step
    ax.set_xlim(min_x - xdiff, max_x + xdiff)

    # Update yticks
    max_y = df["ns/day"].max() or 50
    yticks_steps = int((max_y + 1) // 10)
    if yticks_steps == 0:
        yticks_steps = 1
    yticks = np.arange(0, max_y + (max_y * 0.25), yticks_steps)
    ax.set_yticks(yticks)
    ax.set_ylim(0, max_y + (max_y * 0.25))

    # Add watermark
    if watermark:
        ax.text(0.025, 0.925, "MDBenchmark", transform=ax.transAxes, alpha=0.3)

    lgd = ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.175))
    fig.tight_layout()

    if output_name is None and len(csv) == 1:
        csv_string = csv[0].split(".")[0]
        output_name = "{}.{}".format(csv_string, output_format)
    elif output_name is None and len(csv) != 1:
        output_name = generate_output_name(output_format)
    elif not output_name.endswith(".{}".format(output_format)):
        output_name = "{}.{}".format(output_name, output_format)
    # tight_layout alone does not account for the legend when it sits
    # outside the plot, so we pass it explicitly as an extra artist. This
    # avoids problems with the varying extents of the plotted lines.
    fig.savefig(
        output_name,
        format=output_format,
        bbox_extra_artists=(lgd, ),
        bbox_inches="tight",
        dpi=dpi,
    )
    console.info("Your file was saved as '{}' in the working directory.",
                 output_name)
Example #8
def filter_dataframe_for_plotting(df, host_name, module_name, gpu, cpu):
    # gpu/cpu can be plotted together or separately
    if gpu and cpu:
        # If the user passes no flags, or sets both, everything is plotted.
        console.info("Plotting GPU and CPU data.")
    elif gpu and not cpu:
        df = df[df.gpu]
        console.info("Plotting GPU data only.")
    elif cpu and not gpu:
        df = df[~df.gpu]
        console.info("Plotting CPU data only.")
    elif not cpu and not gpu:
        console.error("CPU and GPU not set. Nothing to plot. Exiting.")

    if df.empty:
        console.error("Your filtering led to an empty dataset. Exiting.")

    df_filtered_hosts = df[df["host"].isin(host_name)]
    df_unique_hosts = np.unique(df_filtered_hosts["host"])

    if df_unique_hosts.size != len(host_name):
        console.error(
            "Could not find all provided hosts. Available hosts are: {}".
            format(", ".join(np.unique(df["host"]))))

    if not host_name:
        console.info("Plotting all hosts in input file.")
    else:
        df = df_filtered_hosts
        console.info("Data for the following hosts will be plotted: {}".format(
            ", ".join(df_unique_hosts)))

    for module in module_name:
        if module in ["gromacs", "namd"]:
            console.info("Plotting all modules for engine '{}'.", module)
        elif module in df["module"].tolist():
            console.info("Plotting module '{}'.", module)
        else:
            console.error(
                "The module '{}' does not exist in your data. Exiting.",
                module)

    if not module_name:
        console.info("Plotting all modules in your input data.")
    # This filter is safe because the loop above has already errored out
    # on any entered module name that does not exist in the data.
    if module_name:
        df = df[df["module"].str.contains("|".join(module_name))]

    if df.empty:
        console.error("Your selections contained no benchmarking information. "
                      "Are you sure all your selections are correct?")

    return df
Example #9
def test_console_info():
    """Test the output of console.info()."""
    fh = StringIO()
    console.info("You have been informed.", filehandler=fh)
    assert fh.getvalue() == "You have been informed.\n"
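The test pins down the observable contract: the formatted message plus a trailing newline is written to the given filehandler. A minimal sketch of a helper satisfying that contract; this is an assumption about the implementation, not mdbenchmark's actual code:

import sys

def info(message, *args, filehandler=None, newlines=False):
    # Assumed behavior, reconstructed from the test above: format the
    # message and write it with a trailing newline to the filehandler
    # (stdout by default); `newlines` pads with extra blank lines.
    out = filehandler if filehandler is not None else sys.stdout
    text = message.format(*args)
    if newlines:
        text = "\n{}\n".format(text)
    out.write(text + "\n")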
Example #10
def print_possible_hosts():
    all_hosts = get_possible_hosts()
    console.info("Available host templates:")
    for host in all_hosts:
        console.info(host)
Example #11
def print_dataframe(df, columns):
    """Print a nicely formatted shortened DataFrame."""
    table = df.copy()
    table.columns = columns
    table = tabulate(table, headers="keys", tablefmt="psql", showindex=False)
    console.info(table, newlines=True)
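A short usage sketch; tabulate accepts a DataFrame directly and headers="keys" picks up the (renamed) column names. The data below is made up:

import pandas as pd

df = pd.DataFrame({"module": ["gromacs/2020"], "nodes": [4]})
print_dataframe(df, columns=["Module", "Nodes"])
# Prints a psql-style table roughly like:
# +--------------+---------+
# | Module       |   Nodes |
# |--------------+---------|
# | gromacs/2020 |       4 |
# +--------------+---------+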
Example #12
def do_submit(directory, force_restart, yes):
    """Submit the benchmarks."""
    bundle = dtr.discover(directory)

    # Exit if no bundles were found in the current directory.
    if not bundle:
        console.error("No benchmarks found.")

    grouped_bundles = bundle.categories.groupby("started")
    try:
        bundles_not_yet_started = grouped_bundles[False]
    except KeyError:
        bundles_not_yet_started = None
    if not bundles_not_yet_started and not force_restart:
        console.error(
            "All generated benchmarks were already started once. "
            "You can force a restart with {}.",
            "--force",
        )

    # Start all benchmark simulations if a restart was requested. Otherwise
    # only start the ones that were not run yet.
    bundles_to_start = bundle
    if not force_restart:
        bundles_to_start = bundles_not_yet_started

    benchmark_version = VersionFactory(
        categories=bundles_to_start.categories).version_class

    df = parse_bundle(
        bundles_to_start,
        columns=benchmark_version.submit_categories,
        sort_values_by=benchmark_version.analyze_sort,
        discard_performance=True,
    )

    # Reformat NaN values nicely into question marks.
    df_to_print = df.replace(np.nan, "?")

    columns_to_drop = ["ncores", "version"]
    df_to_print = df_to_print.drop(columns=columns_to_drop)

    # Consolidate the data by grouping on the number of nodes and print to the
    # user as an overview.
    consolidated_df = consolidate_dataframe(
        df_to_print, columns=benchmark_version.consolidate_categories)
    print_dataframe(
        consolidated_df,
        columns=map_columns(
            map_dict=benchmark_version.category_mapping,
            columns=benchmark_version.generate_printing[1:],
        ),
    )

    # Ask the user to confirm whether they want to submit the benchmarks
    if yes:
        console.info("The above benchmarks will be submitted.")
    elif not click.confirm(
            "The above benchmarks will be submitted. Continue?"):
        console.error("Exiting. No benchmarks submitted.")

    batch_cmd = get_batch_command()
    console.info("Submitting a total of {} benchmarks.", len(bundles_to_start))
    for sim in bundles_to_start:
        # Remove files generated by previous mdbenchmark run
        if force_restart:
            engine = detect_md_engine(sim.categories["module"])
            cleanup_before_restart(engine=engine, sim=sim)
        sim.categories["started"] = True
        os.chdir(sim.abspath)
        subprocess.call([batch_cmd, "bench.job"])
    console.info(
        "Submitted all benchmarks. Run {} once they are finished to get the results.",
        "mdbenchmark analyze",
    )
Example #13
def do_generate(
    name,
    cpu,
    gpu,
    module,
    host,
    min_nodes,
    max_nodes,
    time,
    skip_validation,
    job_name,
    yes,
    physical_cores,
    logical_cores,
    number_of_ranks,
    enable_hyperthreading,
    multidir,
):
    """Generate a bunch of benchmarks."""

    # Instantiate the version we are going to use
    benchmark_version = Version3Categories()

    # Validate the CPU and GPU flags
    validate_cpu_gpu_flags(cpu, gpu)

    # Validate the number of nodes
    validate_number_of_nodes(min_nodes=min_nodes, max_nodes=max_nodes)

    # Derive the logical-core default first; otherwise an unset value
    # would trip the sanity check below before the default is applied.
    if physical_cores and not logical_cores:
        console.warn("Assuming logical_cores = 2 * physical_cores")
        logical_cores = 2 * physical_cores

    if logical_cores < physical_cores:
        console.error(
            "The number of logical cores cannot be smaller than the number of physical cores."
        )

    if physical_cores and logical_cores:
        processor = Processor(physical_cores=physical_cores,
                              logical_cores=logical_cores)
    else:
        processor = Processor()

    # Hyperthreading check
    if enable_hyperthreading and not processor.supports_hyperthreading:
        console.error(
            "The processor of this machine does not support hyperthreading.")

    if not number_of_ranks:
        number_of_ranks = (processor.physical_cores, )

    # Validate number of simulations
    validate_number_of_simulations(multidir, min_nodes, max_nodes,
                                   number_of_ranks)

    # Grab the template name for the host. This should always work because
    # click does the validation for us
    template = utils.retrieve_host_template(host)

    # Warn the user that NAMD support is still experimental.
    if any(["namd" in m for m in module]):
        console.warn(NAMD_WARNING, "--gpu")

    # Stop if we cannot find any modules. If the user specified multiple
    # modules, we will continue with only the valid ones.
    modules = mdengines.normalize_modules(module, skip_validation)
    if not modules:
        console.error("No requested modules available!")

    # Check if all needed files exist. Throw an error if they do not.
    validate_required_files(name=name, modules=modules)

    # Validate that we can use the number of ranks and threads.
    # We can continue, if no ValueError is thrown
    for ranks in number_of_ranks:
        try:
            processor.get_ranks_and_threads(
                ranks, with_hyperthreading=enable_hyperthreading)
        except ValueError as e:
            console.error(e)

    # Create all benchmark combinations and put them into a DataFrame
    data = construct_generate_data(
        name,
        job_name,
        modules,
        host,
        template,
        cpu,
        gpu,
        time,
        min_nodes,
        max_nodes,
        processor,
        number_of_ranks,
        enable_hyperthreading,
        multidir,
    )
    df = pd.DataFrame(data, columns=benchmark_version.generate_categories)

    # Consolidate the data by grouping on the number of nodes and print to the
    # user as an overview.
    consolidated_df = consolidate_dataframe(
        df, columns=benchmark_version.consolidate_categories)
    print_dataframe(
        consolidated_df[benchmark_version.generate_printing],
        columns=map_columns(
            map_dict=benchmark_version.category_mapping,
            columns=benchmark_version.generate_printing,
        ),
    )

    # Save the number of benchmarks for later printing
    number_of_benchmarks = df.shape[0]
    # Ask the user for confirmation to generate files.
    # If the user defined `--yes`, we will skip the confirmation immediately.
    if yes:
        benchmark = "benchmark" if number_of_benchmarks == 1 else "benchmarks"
        console.info(
            "We will generate {} " + benchmark + ".",
            number_of_benchmarks,
        )
    elif not click.confirm("We will generate {} benchmarks. Continue?".format(
            number_of_benchmarks)):
        console.error("Exiting. No benchmarks were generated.")

    # Generate the benchmarks
    with click.progressbar(
            df.iterrows(),
            length=number_of_benchmarks,
            show_pos=True,
            label="Generating benchmarks",
    ) as bar:
        for _, row in bar:
            relative_path, file_basename = os.path.split(row["name"])
            mappings = benchmark_version.generate_mapping
            kwargs = {"name": file_basename, "relative_path": relative_path}
            for key, value in mappings.items():
                kwargs[value] = row[key]

            write_benchmark(**kwargs)

    # Finish up by telling the user how to submit the benchmarks
    console.info(
        "Finished! You can submit the jobs with {}.",
        "mdbenchmark submit",
    )
Example #14
def do_generate(
    name,
    cpu,
    gpu,
    module,
    host,
    min_nodes,
    max_nodes,
    time,
    skip_validation,
    job_name,
    yes,
):
    """Generate a bunch of benchmarks."""
    # Validate the CPU and GPU flags
    validate_cpu_gpu_flags(cpu, gpu)

    # Validate the number of nodes
    validate_number_of_nodes(min_nodes=min_nodes, max_nodes=max_nodes)

    # Grab the template name for the host. This should always work because
    # click does the validation for us
    template = utils.retrieve_host_template(host)

    # Warn the user that NAMD support is still experimental.
    if any(["namd" in m for m in module]):
        console.warn(NAMD_WARNING, "--gpu")

    module = mdengines.normalize_modules(module, skip_validation)

    # If several modules were given and only some of them cannot be found,
    # we continue with the valid ones; stop only if none are available.
    if not module:
        console.error("No requested modules available!")

    df_overview = pd.DataFrame(columns=[
        "name",
        "job_name",
        "base_directory",
        "template",
        "engine",
        "module",
        "nodes",
        "run time [min]",
        "gpu",
        "host",
    ])

    i = 1
    for m in module:
        # Here we detect the MD engine (supported: GROMACS and NAMD).
        engine = mdengines.detect_md_engine(m)

        # Check if all needed files exist. Throw an error if they do not.
        engine.check_input_file_exists(name)

        gpu_cpu = {"cpu": cpu, "gpu": gpu}
        for pu, state in sorted(gpu_cpu.items()):
            if not state:
                continue

            directory = "{}_{}".format(host, m)
            gpu = False
            gpu_string = ""
            if pu == "gpu":
                gpu = True
                directory += "_gpu"
                gpu_string = " with GPUs"

            console.info("Creating benchmark system for {}.", m + gpu_string)

            base_directory = dtr.Tree(directory)

            for nodes in range(min_nodes, max_nodes + 1):
                df_overview.loc[i] = [
                    name,
                    job_name,
                    base_directory,
                    template,
                    engine,
                    m,
                    nodes,
                    time,
                    gpu,
                    host,
                ]
                i += 1

    console.info("{}", "Benchmark Summary:")

    df_short = ConsolidateDataFrame(df_overview)
    PrintDataFrame(df_short)

    if yes:
        console.info("Generating the above benchmarks.")
    elif not click.confirm(
            "The above benchmarks will be generated. Continue?"):
        console.error("Exiting. No benchmarks generated.")

    for _, row in df_overview.iterrows():
        relative_path, file_basename = os.path.split(row["name"])
        write_benchmark(
            engine=row["engine"],
            base_directory=row["base_directory"],
            template=row["template"],
            nodes=row["nodes"],
            gpu=row["gpu"],
            module=row["module"],
            name=file_basename,
            relative_path=relative_path,
            job_name=row["job_name"],
            host=row["host"],
            time=row["run time [min]"],
        )

    # Provide some output for the user
    console.info(
        "Finished generating all benchmarks.\n"
        "You can now submit the jobs with {}.",
        "mdbenchmark submit",
    )