def validate_hosts(ctx, param, host=None):
    """Callback to validate the hostname received as input.

    If we were not given a hostname, we first try to guess it via
    `utils.guess_host`. If this fails, we give up and throw an error.

    Otherwise we compare the provided/guessed host with the list of available
    templates. If the hostname matches the template name, we continue by
    returning the hostname.
    """
    if host is None:
        host = utils.guess_host()
        if host is None:
            raise click.BadParameter(
                "Could not guess host. Please provide a value explicitly.",
                param_hint='"--host"',
            )

    known_hosts = utils.get_possible_hosts()
    if host not in known_hosts:
        console.info("Could not find template for host '{}'.", host)
        utils.print_possible_hosts()
        # TODO: Raise some appropriate error here
        ctx.exit()
        return

    return host
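# A minimal usage sketch (not part of the module): validate_hosts follows
# click's (ctx, param, value) callback signature, so it can be attached
# directly to an option. The command name "run" below is hypothetical.
import click


@click.command()
@click.option("--host", default=None, callback=validate_hosts)
def run(host):
    # At this point the host has either been validated or guessed.
    click.echo("Using host template: {}".format(host))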
def migrate_to_datreant(directory):
    """Perform the `mdsynthesis` to `datreant` migration."""
    bundles = search_mdsynthesis_sim_files(directory)
    if bundles:
        console.info("Converting old benchmark metadata to new format!")
        convert_to_datreant(bundles)
        console.info("Finished converting old benchmarks to new format!")
def do_submit(directory, force_restart, yes):
    """Submit the benchmarks."""
    # Migrate from MDBenchmark<2 to MDBenchmark>=2
    mds_to_dtr.migrate_to_datreant(directory)

    bundle = dtr.discover(directory)

    # Exit if no bundles were found in the current directory.
    if not bundle:
        console.error("No benchmarks found.")

    grouped_bundles = bundle.categories.groupby("started")
    try:
        bundles_not_yet_started = grouped_bundles[False]
    except KeyError:
        bundles_not_yet_started = None

    if not bundles_not_yet_started and not force_restart:
        console.error(
            "All generated benchmarks were already started once. "
            "You can force a restart with {}.",
            "--force",
        )

    # Start all benchmark simulations if a restart was requested. Otherwise
    # only start the ones that were not run yet.
    bundles_to_start = bundle
    if not force_restart:
        bundles_to_start = bundles_not_yet_started

    df = DataFrameFromBundle(bundles_to_start)

    # Reformat NaN values nicely into question marks and drop the columns
    # that are not relevant before submission.
    df_to_print = df.replace(np.nan, "?")
    df_to_print = df_to_print.drop(columns=["ns/day", "ncores"])

    console.info("{}", "Benchmark Summary:")
    df_short = ConsolidateDataFrame(df_to_print)
    PrintDataFrame(df_short)

    # Ask the user to confirm whether they want to submit the benchmarks.
    if yes:
        console.info("The above benchmarks will be submitted.")
    elif not click.confirm("The above benchmarks will be submitted. Continue?"):
        console.error("Exiting. No benchmarks submitted.")

    batch_cmd = get_batch_command()
    console.info("Submitting a total of {} benchmarks.", len(bundles_to_start))
    for sim in bundles_to_start:
        # Remove files generated by a previous mdbenchmark run.
        if force_restart:
            engine = detect_md_engine(sim.categories["module"])
            cleanup_before_restart(engine=engine, sim=sim)
        sim.categories["started"] = True
        os.chdir(sim.abspath)
        subprocess.call([batch_cmd, "bench.job"])
    console.info(
        "Submitted all benchmarks. Run {} once they are finished to get the results.",
        "mdbenchmark analyze",
    )
def _guess_version(self, categories):
    console.info("Setting up...")
    try:
        if "module" in categories and "version" in categories:
            # Versions >=3 have both a "module" and a "version" key.
            self.version = "3"
        elif "module" in categories:
            # Version 2 uses "module", but has no "version" key.
            self.version = "2"
        else:
            # We found a version that is not enumerated above.
            self.version = "next"
    except TypeError:
        # If we point datreant to an empty or non-existent directory, it
        # will throw an error. Catch it and set some default version.
        self.version = "3"
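# Hedged illustration of the branches in _guess_version; the category
# dictionaries below are made up for demonstration purposes only.
categories_v3 = {"module": "gromacs/2018.3", "version": "3"}  # -> version "3"
categories_v2 = {"module": "gromacs/2018.3"}                  # -> version "2"
categories_next = {"engine": "gromacs"}                       # -> version "next"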
def do_plot(
    csv,
    output_name,
    output_format,
    template,
    module,
    gpu,
    cpu,
    plot_cores,
    fit,
    font_size,
    dpi,
    xtick_step,
    watermark,
):
    """Create plots of benchmarks."""
    if not csv:
        raise click.BadParameter(
            "You must specify at least one CSV file.", param_hint='"--csv"'
        )

    df = pd.concat([pd.read_csv(c) for c in csv])
    performance_column = "performance" if "performance" in df.columns else "ns/day"
    df = filter_dataframe_for_plotting(df, template, module, gpu, cpu)

    # Exit if there is no performance data.
    if df[performance_column].isnull().all():
        console.error("There is no performance data to plot.")

    rcParams["font.size"] = font_size
    fig = Figure()
    FigureCanvas(fig)
    ax = fig.add_subplot(111)

    ax = plot_over_group(
        df=df,
        plot_cores=plot_cores,
        fit=fit,
        performance_column=performance_column,
        ax=ax,
    )

    # Update xticks.
    selection = "ncores" if plot_cores else "nodes"
    min_x = df[selection].min() if plot_cores else 1
    max_x = df[selection].max()
    xticks_steps = min_x
    xticks = np.arange(min_x, max_x + min_x, xticks_steps)
    step = get_xsteps(xticks.size, min_x, plot_cores, xtick_step)
    ax.set_xticks(xticks[::step])
    xdiff = min_x * 0.5 * step
    ax.set_xlim(min_x - xdiff, max_x + xdiff)

    # Update yticks.
    max_y = df[performance_column].max() or 50
    yticks_steps = int((max_y + 1) // 10)
    if yticks_steps == 0:
        yticks_steps = 1
    yticks = np.arange(0, max_y + (max_y * 0.25), yticks_steps)
    ax.set_yticks(yticks)
    ax.set_ylim(0, max_y + (max_y * 0.25))

    # Add the watermark.
    if watermark:
        ax.text(0.025, 0.925, "MDBenchmark", transform=ax.transAxes, alpha=0.3)

    legend = ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.175))
    fig.tight_layout()

    if output_name is None and len(csv) == 1:
        csv_string = csv[0].split(".")[0]
        output_name = "{}.{}".format(csv_string, output_format)
    elif output_name is None and len(csv) != 1:
        output_name = generate_output_name(output_format)
    elif not output_name.endswith(".{}".format(output_format)):
        output_name = "{}.{}".format(output_name, output_format)

    fig.savefig(
        output_name,
        format=output_format,
        bbox_extra_artists=(legend,),
        bbox_inches="tight",
        dpi=dpi,
    )
    console.info("The plot was saved as '{}'.", output_name)
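# A worked example of the tick arithmetic above, using made-up values and
# assuming get_xsteps() returns 1. When plotting nodes, min_x is fixed at 1.
import numpy as np

min_x, max_x, step = 1, 10, 1
xticks = np.arange(min_x, max_x + min_x, min_x)  # [1, 2, ..., 10]
xdiff = min_x * 0.5 * step                       # half a tick of padding
# The x-limits become (0.5, 10.5), leaving symmetric space around the data.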
def filter_dataframe_for_plotting(df, host_name, module_name, gpu, cpu):
    if gpu and cpu:
        console.info("Plotting GPU and CPU data.")
    elif gpu and not cpu:
        df = df[df.gpu]
        console.info("Plotting GPU data only.")
    elif cpu and not gpu:
        df = df[~df.gpu]
        console.info("Plotting CPU data only.")
    elif not cpu and not gpu:
        console.error("CPU and GPU not set. Nothing to plot. Exiting.")

    if df.empty:
        console.error("Your filtering led to an empty dataset. Exiting.")

    df_filtered_hosts = df[df["host"].isin(host_name)]
    df_unique_hosts = np.unique(df_filtered_hosts["host"])

    if df_unique_hosts.size != len(host_name):
        console.error(
            "Could not find all provided hosts. Available hosts are: {}".format(
                ", ".join(np.unique(df["host"]))
            )
        )

    if not host_name:
        console.info("Plotting all hosts in input file.")
    else:
        df = df_filtered_hosts
        console.info(
            "Data for the following hosts will be plotted: {}".format(
                ", ".join(df_unique_hosts)
            )
        )

    for module in module_name:
        if module in SUPPORTED_ENGINES.keys():
            console.info("Plotting all modules for engine '{}'.", module)
        elif module in df["module"].tolist():
            console.info("Plotting module '{}'.", module)
        else:
            console.error(
                "The module '{}' does not exist in your data. Exiting.", module
            )

    if not module_name:
        console.info("Plotting all modules in your input data.")

    if module_name:
        df = df[df["module"].str.contains("|".join(module_name))]

    if df.empty:
        console.error(
            "Your selections contained no benchmarking information. "
            "Are you sure all your selections are correct?"
        )

    return df
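# Usage sketch for filter_dataframe_for_plotting; the host and module names
# below are hypothetical and only serve to show the expected columns.
import pandas as pd

df = pd.DataFrame(
    {
        "host": ["draco", "draco", "hydra"],
        "module": ["gromacs/2018.3", "gromacs/2018.3", "namd/2.12"],
        "gpu": [True, False, False],
        "ns/day": [25.0, 10.0, 5.0],
    }
)

# Keep only CPU benchmarks that ran on "draco" with any GROMACS module.
filtered = filter_dataframe_for_plotting(
    df, host_name=["draco"], module_name=["gromacs"], gpu=False, cpu=True
)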
def do_plot(
    csv,
    output_name,
    output_format,
    template,
    module,
    gpu,
    cpu,
    plot_cores,
    fit,
    font_size,
    dpi,
    xtick_step,
    watermark,
):
    """Create plots of benchmarks."""
    if not csv:
        raise click.BadParameter(
            "You must specify at least one CSV file.", param_hint='"--csv"'
        )

    df = pd.concat([pd.read_csv(c, index_col=0) for c in csv]).dropna()
    df = filter_dataframe_for_plotting(df, template, module, gpu, cpu)

    mpl_rcParams["font.size"] = font_size
    fig = Figure()
    FigureCanvas(fig)
    ax = fig.add_subplot(111)

    ax = plot_over_group(df=df, plot_cores=plot_cores, fit=fit, ax=ax)

    # Update xticks.
    selection = "ncores" if plot_cores else "nodes"
    min_x = df[selection].min() if plot_cores else 1
    max_x = df[selection].max()
    xticks_steps = min_x
    xticks = np.arange(min_x, max_x + min_x, xticks_steps)
    step = get_xsteps(xticks.size, min_x, plot_cores, xtick_step)
    ax.set_xticks(xticks[::step])
    xdiff = min_x * 0.5 * step
    ax.set_xlim(min_x - xdiff, max_x + xdiff)

    # Update yticks. Guard against a zero step when max_y is small.
    max_y = df["ns/day"].max() or 50
    yticks_steps = int((max_y + 1) // 10) or 1
    yticks = np.arange(0, max_y + (max_y * 0.25), yticks_steps)
    ax.set_yticks(yticks)
    ax.set_ylim(0, max_y + (max_y * 0.25))

    # Add the watermark.
    if watermark:
        ax.text(0.025, 0.925, "MDBenchmark", transform=ax.transAxes, alpha=0.3)

    lgd = ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.175))
    fig.tight_layout()

    if output_name is None and len(csv) == 1:
        csv_string = csv[0].split(".")[0]
        output_name = "{}.{}".format(csv_string, output_format)
    elif output_name is None and len(csv) != 1:
        output_name = generate_output_name(output_format)
    elif not output_name.endswith(".{}".format(output_format)):
        output_name = "{}.{}".format(output_name, output_format)

    # `bbox_inches="tight"` alone does not consider the legend if it lies
    # outside the plot, so we pass it explicitly as an extra artist. This way
    # we avoid problems with the variability of the individual lines that are
    # to be plotted.
    fig.savefig(
        output_name,
        format=output_format,
        bbox_extra_artists=(lgd,),
        bbox_inches="tight",
        dpi=dpi,
    )
    console.info("Your file was saved as '{}' in the working directory.", output_name)
def filter_dataframe_for_plotting(df, host_name, module_name, gpu, cpu):
    # GPU and CPU data can be plotted together or separately.
    if gpu and cpu:
        # If the user gives no flags, or sets both, everything is plotted.
        console.info("Plotting GPU and CPU data.")
    elif gpu and not cpu:
        df = df[df.gpu]
        console.info("Plotting GPU data only.")
    elif cpu and not gpu:
        df = df[~df.gpu]
        console.info("Plotting CPU data only.")
    elif not cpu and not gpu:
        console.error("CPU and GPU not set. Nothing to plot. Exiting.")

    if df.empty:
        console.error("Your filtering led to an empty dataset. Exiting.")

    df_filtered_hosts = df[df["host"].isin(host_name)]
    df_unique_hosts = np.unique(df_filtered_hosts["host"])

    if df_unique_hosts.size != len(host_name):
        console.error(
            "Could not find all provided hosts. Available hosts are: {}".format(
                ", ".join(np.unique(df["host"]))
            )
        )

    if not host_name:
        console.info("Plotting all hosts in input file.")
    else:
        df = df_filtered_hosts
        console.info(
            "Data for the following hosts will be plotted: {}".format(
                ", ".join(df_unique_hosts)
            )
        )

    for module in module_name:
        if module in ["gromacs", "namd"]:
            console.info("Plotting all modules for engine '{}'.", module)
        elif module in df["module"].tolist():
            console.info("Plotting module '{}'.", module)
        else:
            console.error(
                "The module '{}' does not exist in your data. Exiting.", module
            )

    if not module_name:
        console.info("Plotting all modules in your input data.")

    # This should work, but we first need to check whether any of the entered
    # names are faulty or do not exist.
    if module_name:
        df = df[df["module"].str.contains("|".join(module_name))]

    if df.empty:
        console.error(
            "Your selections contained no benchmarking information. "
            "Are you sure all your selections are correct?"
        )

    return df
def test_console_info():
    """Test the output of console.info()."""
    fh = StringIO()
    console.info("You have been informed.", filehandler=fh)
    assert fh.getvalue() == "You have been informed.\n"
def print_possible_hosts():
    all_hosts = get_possible_hosts()
    console.info("Available host templates:")
    for host in all_hosts:
        console.info(host)
def print_dataframe(df, columns):
    """Print a nicely formatted, shortened DataFrame."""
    table = df.copy()
    table.columns = columns
    table = tabulate(table, headers="keys", tablefmt="psql", showindex=False)
    console.info(table, newlines=True)
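# Usage sketch for print_dataframe; the data is made up. The column list
# renames the DataFrame columns for display before tabulation.
import pandas as pd

df = pd.DataFrame({"nodes": [1, 2], "ns/day": [10.5, 19.8]})
print_dataframe(df, columns=["Nodes", "Performance (ns/day)"])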
def do_submit(directory, force_restart, yes):
    """Submit the benchmarks."""
    bundle = dtr.discover(directory)

    # Exit if no bundles were found in the current directory.
    if not bundle:
        console.error("No benchmarks found.")

    grouped_bundles = bundle.categories.groupby("started")
    try:
        bundles_not_yet_started = grouped_bundles[False]
    except KeyError:
        bundles_not_yet_started = None

    if not bundles_not_yet_started and not force_restart:
        console.error(
            "All generated benchmarks were already started once. "
            "You can force a restart with {}.",
            "--force",
        )

    # Start all benchmark simulations if a restart was requested. Otherwise
    # only start the ones that were not run yet.
    bundles_to_start = bundle
    if not force_restart:
        bundles_to_start = bundles_not_yet_started

    benchmark_version = VersionFactory(
        categories=bundles_to_start.categories
    ).version_class

    df = parse_bundle(
        bundles_to_start,
        columns=benchmark_version.submit_categories,
        sort_values_by=benchmark_version.analyze_sort,
        discard_performance=True,
    )

    # Reformat NaN values nicely into question marks and drop the columns
    # that are not relevant before submission.
    df_to_print = df.replace(np.nan, "?")
    columns_to_drop = ["ncores", "version"]
    df_to_print = df_to_print.drop(columns=columns_to_drop)

    # Consolidate the data by grouping on the number of nodes and print it
    # to the user as an overview.
    consolidated_df = consolidate_dataframe(
        df_to_print, columns=benchmark_version.consolidate_categories
    )
    print_dataframe(
        consolidated_df,
        columns=map_columns(
            map_dict=benchmark_version.category_mapping,
            columns=benchmark_version.generate_printing[1:],
        ),
    )

    # Ask the user to confirm whether they want to submit the benchmarks.
    if yes:
        console.info("The above benchmarks will be submitted.")
    elif not click.confirm("The above benchmarks will be submitted. Continue?"):
        console.error("Exiting. No benchmarks submitted.")

    batch_cmd = get_batch_command()
    console.info("Submitting a total of {} benchmarks.", len(bundles_to_start))
    for sim in bundles_to_start:
        # Remove files generated by a previous mdbenchmark run.
        if force_restart:
            engine = detect_md_engine(sim.categories["module"])
            cleanup_before_restart(engine=engine, sim=sim)
        sim.categories["started"] = True
        os.chdir(sim.abspath)
        subprocess.call([batch_cmd, "bench.job"])
    console.info(
        "Submitted all benchmarks. Run {} once they are finished to get the results.",
        "mdbenchmark analyze",
    )
def do_generate(
    name,
    cpu,
    gpu,
    module,
    host,
    min_nodes,
    max_nodes,
    time,
    skip_validation,
    job_name,
    yes,
    physical_cores,
    logical_cores,
    number_of_ranks,
    enable_hyperthreading,
    multidir,
):
    """Generate a bunch of benchmarks."""
    # Instantiate the version we are going to use.
    benchmark_version = Version3Categories()

    # Validate the CPU and GPU flags.
    validate_cpu_gpu_flags(cpu, gpu)

    # Validate the number of nodes.
    validate_number_of_nodes(min_nodes=min_nodes, max_nodes=max_nodes)

    if logical_cores < physical_cores:
        console.error(
            "The number of logical cores cannot be smaller than the number of physical cores."
        )

    if physical_cores and not logical_cores:
        console.warn("Assuming logical_cores = 2 * physical_cores")
        logical_cores = 2 * physical_cores

    if physical_cores and logical_cores:
        processor = Processor(
            physical_cores=physical_cores, logical_cores=logical_cores
        )
    else:
        processor = Processor()

    # Hyperthreading check.
    if enable_hyperthreading and not processor.supports_hyperthreading:
        console.error("The processor of this machine does not support hyperthreading.")

    if not number_of_ranks:
        number_of_ranks = (processor.physical_cores,)

    # Validate the number of simulations.
    validate_number_of_simulations(multidir, min_nodes, max_nodes, number_of_ranks)

    # Grab the template name for the host. This should always work because
    # click does the validation for us.
    template = utils.retrieve_host_template(host)

    # Warn the user that NAMD support is still experimental.
    if any(["namd" in m for m in module]):
        console.warn(NAMD_WARNING, "--gpu")

    # Stop if we cannot find any modules. If the user specified multiple
    # modules, we will continue with only the valid ones.
    modules = mdengines.normalize_modules(module, skip_validation)
    if not modules:
        console.error("No requested modules available!")

    # Check if all needed files exist. Throw an error if they do not.
    validate_required_files(name=name, modules=modules)

    # Validate that we can use the number of ranks and threads.
    # We can continue if no ValueError is thrown.
    for ranks in number_of_ranks:
        try:
            processor.get_ranks_and_threads(
                ranks, with_hyperthreading=enable_hyperthreading
            )
        except ValueError as e:
            console.error(e)

    # Create all benchmark combinations and put them into a DataFrame.
    data = construct_generate_data(
        name,
        job_name,
        modules,
        host,
        template,
        cpu,
        gpu,
        time,
        min_nodes,
        max_nodes,
        processor,
        number_of_ranks,
        enable_hyperthreading,
        multidir,
    )
    df = pd.DataFrame(data, columns=benchmark_version.generate_categories)

    # Consolidate the data by grouping on the number of nodes and print it to
    # the user as an overview.
    consolidated_df = consolidate_dataframe(
        df, columns=benchmark_version.consolidate_categories
    )
    print_dataframe(
        consolidated_df[benchmark_version.generate_printing],
        columns=map_columns(
            map_dict=benchmark_version.category_mapping,
            columns=benchmark_version.generate_printing,
        ),
    )

    # Save the number of benchmarks for later printing.
    number_of_benchmarks = df.shape[0]

    # Ask the user for confirmation to generate the files.
    # If the user passed `--yes`, we skip the confirmation immediately.
    if yes:
        console.info(
            "We will generate {} "
            + "{benchmark}.".format(
                benchmark="benchmark" if number_of_benchmarks == 1 else "benchmarks"
            ),
            number_of_benchmarks,
        )
    elif not click.confirm(
        "We will generate {} benchmarks. Continue?".format(number_of_benchmarks)
    ):
        console.error("Exiting. No benchmarks were generated.")

    # Generate the benchmarks.
    with click.progressbar(
        df.iterrows(),
        length=number_of_benchmarks,
        show_pos=True,
        label="Generating benchmarks",
    ) as bar:
        for _, row in bar:
            relative_path, file_basename = os.path.split(row["name"])
            mappings = benchmark_version.generate_mapping
            kwargs = {"name": file_basename, "relative_path": relative_path}
            for key, value in mappings.items():
                kwargs[value] = row[key]

            write_benchmark(**kwargs)

    # Finish up by telling the user how to submit the benchmarks.
    console.info(
        "Finished! You can submit the jobs with {}.",
        "mdbenchmark submit",
    )
def do_generate(
    name,
    cpu,
    gpu,
    module,
    host,
    min_nodes,
    max_nodes,
    time,
    skip_validation,
    job_name,
    yes,
):
    """Generate a bunch of benchmarks."""
    # Validate the CPU and GPU flags.
    validate_cpu_gpu_flags(cpu, gpu)

    # Validate the number of nodes.
    validate_number_of_nodes(min_nodes=min_nodes, max_nodes=max_nodes)

    # Grab the template name for the host. This should always work because
    # click does the validation for us.
    template = utils.retrieve_host_template(host)

    # Warn the user that NAMD support is still experimental.
    if any(["namd" in m for m in module]):
        console.warn(NAMD_WARNING, "--gpu")

    # If several modules were given and we cannot find just one of them, we
    # continue with the valid ones.
    module = mdengines.normalize_modules(module, skip_validation)
    if not module:
        console.error("No requested modules available!")

    df_overview = pd.DataFrame(
        columns=[
            "name",
            "job_name",
            "base_directory",
            "template",
            "engine",
            "module",
            "nodes",
            "run time [min]",
            "gpu",
            "host",
        ]
    )

    i = 1
    for m in module:
        # Here we detect the MD engine (supported: GROMACS and NAMD).
        engine = mdengines.detect_md_engine(m)

        # Check if all needed files exist. Throw an error if they do not.
        engine.check_input_file_exists(name)

        gpu_cpu = {"cpu": cpu, "gpu": gpu}
        for pu, state in sorted(gpu_cpu.items()):
            if not state:
                continue

            directory = "{}_{}".format(host, m)
            gpu = False
            gpu_string = ""
            if pu == "gpu":
                gpu = True
                directory += "_gpu"
                gpu_string = " with GPUs"

            console.info("Creating benchmark system for {}.", m + gpu_string)

            base_directory = dtr.Tree(directory)

            for nodes in range(min_nodes, max_nodes + 1):
                df_overview.loc[i] = [
                    name,
                    job_name,
                    base_directory,
                    template,
                    engine,
                    m,
                    nodes,
                    time,
                    gpu,
                    host,
                ]
                i += 1

    console.info("{}", "Benchmark Summary:")
    df_short = ConsolidateDataFrame(df_overview)
    PrintDataFrame(df_short)

    if yes:
        console.info("Generating the above benchmarks.")
    elif not click.confirm("The above benchmarks will be generated. Continue?"):
        console.error("Exiting. No benchmarks generated.")

    for _, row in df_overview.iterrows():
        relative_path, file_basename = os.path.split(row["name"])
        write_benchmark(
            engine=row["engine"],
            base_directory=row["base_directory"],
            template=row["template"],
            nodes=row["nodes"],
            gpu=row["gpu"],
            module=row["module"],
            name=file_basename,
            relative_path=relative_path,
            job_name=row["job_name"],
            host=row["host"],
            time=row["run time [min]"],
        )

    # Provide some output for the user.
    console.info(
        "Finished generating all benchmarks.\n"
        "You can now submit the jobs with {}.",
        "mdbenchmark submit",
    )