def get_shared_node_layout(n_writers, n_readers):
    """
    Create a shared node layout: writers on the first n_writers cores of a
    Summit node, immediately followed by the readers.
    """
    nc = SummitNode()
    for i in range(n_writers):
        nc.cpu[i] = "writer:{}".format(i)
    for i in range(n_readers):
        nc.cpu[i + n_writers] = "reader:{}".format(i)
    return [nc]
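# A layout list like the one returned above is consumed by a Sweep's
# per-machine node_layout argument, as the campaign classes later in this
# file do. A minimal usage sketch (assuming hypothetical 'writer'/'reader'
# codes and the usual import of Cheetah's parameters module as 'p'):
from codar.cheetah import parameters as p

sweep = p.Sweep(
    node_layout={'summit': get_shared_node_layout(4, 2)},  # 4 writers + 2 readers share each node
    parameters=[
        p.ParamRunner('writer', 'nprocs', [4]),
        p.ParamRunner('reader', 'nprocs', [2]),
    ])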
def share_nodes_21to21():
    """
    Create a shared node layout where the simulation and analysis ranks
    share compute nodes
    """
    shared_node = SummitNode()
    for i in range(21):
        shared_node.cpu[i] = "simulation:{}".format(i)
        shared_node.cpu[21 + i] = "rdf_calc:{}".format(i)
    return [shared_node]
def all_sim_nodes():
    """
    Create a node layout where the simulation (with the RDF calculation
    performed inline) occupies all compute nodes.
    """
    # Let 42 ranks consume all 42 cores on each Summit node
    sim_node = SummitNode()
    for i in range(42):
        sim_node.cpu[i] = "sim_inline_rdf_calc:{}".format(i)
    # Return a list object
    return [sim_node]
def separate(nw, wn):
    """
    Create separate nodes on Summit for the writer processes.
    Spawns nw writers on each node, nw/2 on each socket.
    Input args:
        nw = num writers per node
        wn = writer app name
    """
    node_w = SummitNode()
    for i in range(nw // 2):
        node_w.cpu[i] = "{}:{}".format(wn, i)
    for i in range(nw // 2):
        node_w.cpu[i + 21] = "{}:{}".format(wn, i + nw // 2)
    return [node_w]
def shared(nw, wn):
    """
    Create a shared node layout for Summit with nw writers per node,
    spread evenly across the 2 sockets.
    Input args:
        nw = num writers per node
        wn = writer app name
    """
    n = SummitNode()
    for i in range(nw // 2):
        n.cpu[i] = "{}:{}".format(wn, i)
    for i in range(nw // 2):
        n.cpu[i + 21] = "{}:{}".format(wn, i + nw // 2)
    return [n]
def get_separate_node_layout(n_writers, n_readers):
    nc_w = SummitNode()
    for i in range(n_writers):
        nc_w.cpu[i] = "writer:{}".format(i)
    nc_r = SummitNode()
    for i in range(n_readers):
        nc_r.cpu[i] = "reader:{}".format(i)
    return [nc_w, nc_r]
def separate_nodes():
    """
    Create a node layout where the simulation and the analysis ranks
    reside on separate nodes.
    """
    # Create a node layout for the simulation.
    # Let max 20 ranks be spread evenly between the 2 sockets.
    sim_node = SummitNode()
    for i in range(10):
        sim_node.cpu[i] = "simulation:{}".format(i)
        sim_node.cpu[21 + i] = "simulation:{}".format(10 + i)

    # Create a node layout for the analysis.
    # Let max 10 ranks be spread evenly between the 2 sockets.
    analysis_node = SummitNode()
    for i in range(5):
        analysis_node.cpu[i] = "pdf_calc:{}".format(i)
        analysis_node.cpu[21 + i] = "pdf_calc:{}".format(5 + i)

    # Return a list object
    return [sim_node, analysis_node]
def share_nodes_sockets():
    """
    Create a shared node layout where the simulation and analysis ranks
    share compute nodes. Furthermore, they share the sockets of the node.
    """
    shared_sockets = SummitNode()
    # 10 simulation ranks on each socket
    for i in range(10):
        shared_sockets.cpu[i] = "simulation:{}".format(i)
        shared_sockets.cpu[21 + i] = "simulation:{}".format(10 + i)
    # 10 pdf_calc ranks on each socket, placed after the simulation ranks
    for i in range(10):
        shared_sockets.cpu[10 + i] = "pdf_calc:{}".format(i)
        shared_sockets.cpu[21 + 10 + i] = "pdf_calc:{}".format(10 + i)
    return [shared_sockets]
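# In Cheetah's SummitNode abstraction, cpu indices 0-20 map to the first
# socket and 21-41 to the second, which is why the loops above offset the
# second socket by 21. A small inspection sketch (a hypothetical helper,
# assuming node.cpu is a plain 42-slot list as in codar.savanna.machines):
def print_layout(node):
    # Show which rank occupies each mapped core, and on which socket
    for core, rank in enumerate(node.cpu):
        if rank is not None:
            socket = 0 if core < 21 else 1
            print("core {:2d} (socket {}): {}".format(core, socket, rank))

print_layout(share_nodes_sockets()[0])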
def separate(nx, ng, nc, x, g, c):
    """
    Create separate Summit nodes for each of the three codes, spreading
    each code's ranks evenly across the 2 sockets.
    Input args:
        nx, ng, nc = ranks per node for each code
        x, g, c    = app names
    """
    node_x = SummitNode()
    for i in range(nx // 2):
        node_x.cpu[i] = "{}:{}".format(x, i)
    for i in range(nx // 2):
        node_x.cpu[i + 21] = "{}:{}".format(x, i + nx // 2)

    node_g = SummitNode()
    for i in range(ng // 2):
        node_g.cpu[i] = "{}:{}".format(g, i)
    for i in range(ng // 2):
        node_g.cpu[i + 21] = "{}:{}".format(g, i + ng // 2)

    node_c = SummitNode()
    for i in range(nc // 2):
        node_c.cpu[i] = "{}:{}".format(c, i)
    for i in range(nc // 2):
        node_c.cpu[i + 21] = "{}:{}".format(c, i + nc // 2)

    return [node_x, node_g, node_c]
class GrayScott(Campaign):
    name = "Gray-Scott"
    codes = [("gray-scott", dict(exe="gray-scott"))]
    app_config_scripts = {'summit': 'env_summit.sh'}
    supported_machines = ['local', 'theta', 'summit']
    scheduler_options = {
        "theta": {
            "queue": "debug-flat-quad",
            "project": "CSC249ADCD01",
        },
        "summit": {
            "project": "csc299",
        }
    }
    kill_on_partial_failure = True
    umask = '027'

    # 6 ranks per node, each rank bound to one core and one GPU
    nprocs = 6
    shared_node = SummitNode()
    for i in range(nprocs):
        shared_node.cpu[i] = "gray-scott:{}".format(i)
        shared_node.gpu[i] = ["gray-scott:{}".format(i)]
    shared_node_layout = [shared_node]

    L = [256]
    noise = [1.e-5]
    Du = [0.1, 0.2, 0.3]
    Dv = [0.05, 0.1, 0.15]
    F = [0.01, 0.02, 0.03]
    k = [0.048, 0.04, 0.06]

    sweep_parameters = [
        p.ParamCmdLineArg("gray-scott", "settings", 1, ["settings.json"]),
        p.ParamConfig("gray-scott", "L", "settings.json", "L", L),
        p.ParamConfig("gray-scott", "noise", "settings.json", "noise", noise),
        p.ParamConfig("gray-scott", "Du", "settings.json", "Du", Du),
        p.ParamConfig("gray-scott", "Dv", "settings.json", "Dv", Dv),
        p.ParamConfig("gray-scott", "F", "settings.json", "F", F),
        p.ParamConfig("gray-scott", "k", "settings.json", "k", k),
        p.ParamRunner('gray-scott', 'nprocs', [nprocs]),
    ]

    sweep = p.Sweep(parameters=sweep_parameters,
                    node_layout={'summit': shared_node_layout})

    # One node per experiment: request as many nodes as there are parameter
    # combinations (L and noise each have a single value)
    nodes = len(noise) * len(Du) * len(Dv) * len(F) * len(k)

    sweeps = [
        p.SweepGroup(
            name="gs",
            walltime=timedelta(minutes=60),
            nodes=nodes,
            component_subdirs=True,
            component_inputs={
                'gray-scott': ['settings.json', 'adios2.xml'],
            },
            parameter_groups=[sweep]
        )
    ]
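# With the value lists above, the sweep enumerates the full cross product:
# 1 (L) x 1 (noise) x 3 (Du) x 3 (Dv) x 3 (F) x 3 (k) = 81 experiments, and
# with 6 ranks per node each experiment occupies one node, so nodes = 81
# lets the whole group run concurrently. A quick sanity check (a standalone
# sketch, not part of the campaign spec):
from functools import reduce
import operator

def n_experiments(*value_lists):
    # Hypothetical helper: size of the cross product of the swept lists
    return reduce(operator.mul, (len(v) for v in value_lists), 1)

assert n_experiments([256], [1.e-5], [0.1, 0.2, 0.3], [0.05, 0.1, 0.15],
                     [0.01, 0.02, 0.03], [0.048, 0.04, 0.06]) == 81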
class ProducerConsumer(Campaign):
    # A name for the campaign
    name = "coupling-example"

    # WORKFLOW SETUP
    #---------------
    # A list of the codes that will be part of the workflow.
    # If there is an adios xml file associated with a code, list it here.
    # 'sleep_after' represents the time gap after which the next code is spawned.
    # Use runner_override to run a code without the default launcher
    # (mpirun/aprun/jsrun etc.); this runs the code as a serial application.
    codes = [("producer", dict(exe="producer.py", adios_xml_file='adios2.xml',
                               sleep_after=5)),
             ("mean_calc", dict(exe="mean_calculator.py",
                                adios_xml_file='adios2.xml',
                                runner_override=False))]

    # CAMPAIGN SETTINGS
    #------------------
    # A list of machines that this campaign is supported on
    supported_machines = ['local', 'titan', 'theta', 'summit']

    # Option to kill an experiment (just one experiment, not the full sweep
    # or campaign) if one of its codes fails
    kill_on_partial_failure = True

    # Some pre-processing in the experiment directory.
    # This is performed when the campaign directory is created
    # (before the campaign is launched).
    run_dir_setup_script = None

    # A post-processing script to be run in the experiment directory after
    # the experiment completes, e.g. to remove large files
    run_post_process_script = None

    # umask applied to your directory in the campaign so that colleagues can view files
    umask = '027'

    # Scheduler information: job queue, account-id etc.
    # Leave it as None if running on a local machine.
    scheduler_options = {
        'theta': {'project': '', 'queue': 'batch'},
        'summit': {'project': 'CSC299'}
    }

    # Set up your environment (load modules, set LD_LIBRARY_PATH, etc.).
    # Ensure this script is executable.
    app_config_scripts = {'local': 'setup.sh', 'summit': 'env_setup.sh'}

    # PARAMETER SWEEPS
    #-----------------
    # Set up how the workflow is run, and what values to 'sweep' over.
    # Use ParamCmdLineArg to set up a command line arg, ParamCmdLineOption
    # to set up a command line option, and so on.
    sweep1_parameters = [
        p.ParamRunner('producer', 'nprocs', [128]),
        p.ParamRunner('mean_calc', 'nprocs', [36]),
        p.ParamCmdLineArg('producer', 'array_size_per_pe', 1, [1024 * 1024]),  # 1M
        p.ParamCmdLineArg('producer', 'num_steps', 2, [10]),
        p.ParamADIOS2XML('producer', 'staging', 'producer', 'engine',
                         [{"SST": {}}]),
    ]

    # Layout 1: 6 producer ranks per node, each bound to 6 cores and its
    # own GPU, plus 1 mean_calc rank bound to 6 cores
    shared_node = SummitNode()
    for i in range(18):
        shared_node.cpu[i] = "producer:{}".format(math.floor(i / 6))
        shared_node.cpu[i + 21] = "producer:{}".format(math.floor((i + 18) / 6))
    for i in range(3):
        shared_node.cpu[i + 18] = "mean_calc:0"
        shared_node.cpu[i + 18 + 21] = "mean_calc:0"
    for i in range(6):
        shared_node.gpu[i] = ["producer:{}".format(i)]
    shared_node_layout = [shared_node]

    # Layout 2: one core per rank -- 36 producer ranks and 6 mean_calc
    # ranks per node; the first 6 producer ranks each get their own GPU
    shared_node_1_per_rank = SummitNode()
    for i in range(18):
        shared_node_1_per_rank.cpu[i] = "producer:{}".format(i)
        shared_node_1_per_rank.cpu[i + 21] = "producer:{}".format(i + 18)
    for i in range(3):
        shared_node_1_per_rank.cpu[i + 18] = "mean_calc:{}".format(i)
        shared_node_1_per_rank.cpu[i + 18 + 21] = "mean_calc:{}".format(i + 3)
    for i in range(6):
        shared_node_1_per_rank.gpu[i] = ["producer:{}".format(i)]
    shared_node_layout_2 = [shared_node_1_per_rank]

    # Layout 3: like layout 1, but ranks share GPUs; a GPU may even be
    # shared across codes
    shared_node_shared_gpu = SummitNode()
    for i in range(18):
        shared_node_shared_gpu.cpu[i] = "producer:{}".format(math.floor(i / 6))
        shared_node_shared_gpu.cpu[i + 21] = "producer:{}".format(math.floor((i + 18) / 6))
    for i in range(3):
        shared_node_shared_gpu.cpu[i + 18] = "mean_calc:0"
        shared_node_shared_gpu.cpu[i + 18 + 21] = "mean_calc:0"
    shared_node_shared_gpu.gpu[0] = ["producer:0"]
    shared_node_shared_gpu.gpu[1] = ["producer:0", "producer:1"]
    shared_node_shared_gpu.gpu[2] = ["producer:0", "producer:1", "mean_calc:0"]
    shared_node_layout_3 = [shared_node_shared_gpu]

    # Layout 4: producer and mean_calc on separate nodes
    sep_node_producer = SummitNode()
    sep_node_mean_calc = SummitNode()
    for i in range(18):
        sep_node_producer.cpu[i] = "producer:{}".format(math.floor(i / 6))
    for i in range(3):
        sep_node_mean_calc.cpu[i + 18] = "mean_calc:0"
        sep_node_mean_calc.cpu[i + 18 + 21] = "mean_calc:0"
    for i in range(3):
        sep_node_producer.gpu[i] = ["producer:{}".format(i)]
    sep_node_layout = [sep_node_producer, sep_node_mean_calc]

    # Create the sweeps; node_layout specifies how ranks map onto nodes
    sweep1 = p.Sweep(node_layout={'summit': shared_node_layout},
                     parameters=sweep1_parameters, rc_dependency=None)
    sweep2 = p.Sweep(node_layout={'summit': shared_node_layout_2},
                     parameters=sweep1_parameters, rc_dependency=None)
    sweep3 = p.Sweep(node_layout={'summit': shared_node_layout_3},
                     parameters=sweep1_parameters, rc_dependency=None)
    sweep4 = p.Sweep(node_layout={'summit': sep_node_layout},
                     parameters=sweep1_parameters, rc_dependency=None)

    # Create a sweep group from the above sweeps. You can place multiple
    # sweeps in a group; each group is submitted as a separate job.
    sweepGroup1 = p.SweepGroup(
        "sg-1",
        walltime=300,
        per_run_timeout=60,
        parameter_groups=[sweep1, sweep2, sweep3, sweep4],
        launch_mode='default',  # or MPMD
        # optional:
        # tau_profiling=True,
        # tau_tracing=False,
        # nodes=10,
        # component_subdirs=True,  # codes get their own workspace in the experiment directory
        # component_inputs={'producer': ['some_input_file'],
        #                   'norm_calc': [SymLink('some_large_file')]},  # inputs required by codes
        # max_procs=64,  # max no. of procs to run concurrently; depends on 'nodes'
    )

    # Sweep groups to be activated
    sweeps = {'summit': [sweepGroup1]}
class GrayScott(Campaign):
    # A name for the campaign
    name = "gray_scott"

    # Define your workflow: set up the applications that form it.
    # exe may be an absolute path.
    # The adios xml file is automatically copied to the campaign directory.
    # 'runner_override' may be used to launch a code on a login/service node
    # as a serial code, without a runner such as aprun/srun/jsrun etc.
    codes = [
        ("simulation", dict(exe="gray-scott", adios_xml_file='adios2.xml')),
        ("pdf_calc", dict(exe="pdf_calc", adios_xml_file='adios2.xml',
                          runner_override=False)),
    ]

    # List of machines on which this code can be run
    supported_machines = ['local', 'titan', 'theta', 'summit']

    # Kill an experiment right away if any workflow component fails
    # (just the experiment, not the whole group)
    kill_on_partial_failure = True

    # Any setup needed in an experiment directory before the experiment runs
    run_dir_setup_script = None

    # A post-process script that is run for every experiment after it completes
    run_post_process_script = None

    # Directory permissions for the campaign sub-directories
    umask = '027'

    # Options for the underlying scheduler on the target system.
    # Specify the project ID and job queue here.
    scheduler_options = {
        'theta': {'project': 'CSC249ADCD01', 'queue': 'default'},
        'summit': {'project': 'csc299'}
    }

    # A way to set up your environment before the experiment runs.
    # Export environment variables such as LD_LIBRARY_PATH here.
    app_config_scripts = {
        'local': 'setup.sh',
        'theta': 'env_setup.sh',
        'summit': 'setup.sh'
    }

    # Set up the sweep parameters for a Sweep
    sweep1_parameters = [
        # ParamRunner 'nprocs' specifies the no. of ranks to be spawned
        p.ParamRunner('simulation', 'nprocs', [512]),

        # Create a ParamCmdLineArg parameter to specify a command line
        # argument for running the application
        p.ParamCmdLineArg('simulation', 'settings', 1, ["settings.json"]),

        # Edit key-value pairs in the json file.
        # Sweep over two values for the F key.
        p.ParamConfig('simulation', 'feed_rate_U', 'settings.json', 'F',
                      [0.01, 0.02]),
        p.ParamConfig('simulation', 'kill_rate_V', 'settings.json', 'k',
                      [0.048]),
        p.ParamConfig('simulation', 'domain_size', 'settings.json', 'L',
                      [1024]),
        p.ParamConfig('simulation', 'num_steps', 'settings.json', 'steps',
                      [50]),
        p.ParamConfig('simulation', 'plot_gap', 'settings.json', 'plotgap',
                      [10]),

        # Set up an environment variable
        # p.ParamEnvVar('simulation', 'openmp', 'OMP_NUM_THREADS', [4]),

        # Change the engine for the 'SimulationOutput' IO object in the
        # adios xml file to SST for coupling. As both applications use the
        # same xml file, you need to do this just once.
        p.ParamADIOS2XML('simulation', 'SimulationOutput', 'engine',
                         [{'SST': {}}]),

        # Now set up options for the pdf_calc application.
        # Sweep over two values for nprocs.
        p.ParamRunner('pdf_calc', 'nprocs', [32, 64]),
        p.ParamCmdLineArg('pdf_calc', 'infile', 1, ['gs.bp']),
        p.ParamCmdLineArg('pdf_calc', 'outfile', 2, ['pdf']),
    ]

    # Create the node-layout to run on Summit.
    # Place the simulation and the analysis codes on separate nodes.
    # On Summit, create a 'node' object and manually map ranks to cpus and
    # gpus using the convention cpu[index] = "code_name:rank_id".
    # Given this node mapping and the 'nprocs' property, Cheetah will
    # automatically spawn the correct no. of nodes.
    # For example, the simulation has 512 nprocs and 32 ranks per node, so
    # Cheetah will create 16 nodes of type 'sim_node'.
    sim_node = SummitNode()
    pdf_node = SummitNode()
    for i in range(32):
        sim_node.cpu[i] = "simulation:{}".format(i)
    for i in range(32):
        pdf_node.cpu[i] = "pdf_calc:{}".format(i)
    separate_node_layout = [sim_node, pdf_node]

    # Create a Sweep object that uses the separate-node layout above
    sweep1 = p.Sweep(parameters=sweep1_parameters,
                     node_layout={'summit': separate_node_layout})

    # Create another Sweep with a shared node layout. Note that on Theta
    # the codes always reside on separate nodes, as node-sharing is not
    # permitted there.
    sweep2_parameters = copy.deepcopy(sweep1_parameters)

    # Place ranks from different codes on the same node:
    # 32 simulation ranks alongside 8 pdf_calc ranks
    shared_node = SummitNode()
    for i in range(32):
        shared_node.cpu[i] = "simulation:{}".format(i)
    for i in range(8):
        shared_node.cpu[i + 32] = "pdf_calc:{}".format(i)
    shared_node_layout = [shared_node]

    sweep2 = p.Sweep(parameters=sweep2_parameters,
                     node_layout={'summit': shared_node_layout})

    # Create a SweepGroup and add the above Sweeps.
    # Set batch job properties such as the no. of nodes here.
    sweepGroup1 = p.SweepGroup(
        "sg-1",                              # A unique name for the SweepGroup
        walltime=3600,                       # Total runtime for the SweepGroup
        per_run_timeout=600,                 # Timeout for each experiment
        parameter_groups=[sweep1, sweep2],   # Sweeps to include in this group
        launch_mode='default',               # Launch mode: default, or MPMD if supported
        nodes=128,                           # No. of nodes for the batch job
        # rc_dependency={'pdf_calc': 'simulation'},  # dependencies between workflow components
        run_repetitions=2,                   # Repeat each experiment this many times (3 runs total)
    )

    # Activate the SweepGroup
    sweeps = [sweepGroup1]
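# The node count Cheetah derives from a layout is effectively
# ceil(nprocs / ranks-per-node). A worked check of the comments above
# (a hypothetical helper sketch, not part of the Cheetah API):
import math

def nodes_needed(nprocs, ranks_per_node):
    return math.ceil(nprocs / ranks_per_node)

assert nodes_needed(512, 32) == 16  # simulation on 'sim_node'
assert nodes_needed(64, 32) == 2    # pdf_calc at its larger nprocs value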
class ProducerConsumer(Campaign):
    # A name for the campaign
    name = "coupling-example"

    # WORKFLOW SETUP
    #---------------
    # A list of the codes that will be part of the workflow.
    # If there is an adios xml file associated with a code, list it here.
    # 'sleep_after' represents the time gap after which the next code is spawned.
    # Use runner_override to run a code without the default launcher
    # (mpirun/aprun/jsrun etc.); this runs the code as a serial application.
    codes = [
        ("producer", dict(exe="producer.py", adios_xml_file='adios2.xml',
                          sleep_after=5)),
        ("mean_calc", dict(exe="mean_calculator.py",
                           adios_xml_file='adios2.xml',
                           runner_override=False))
    ]

    # CAMPAIGN SETTINGS
    #------------------
    # A list of machines that this campaign is supported on
    supported_machines = ['local', 'titan', 'theta', 'summit', 'rhea',
                          'deepthought2_cpu', 'sdg_tm76']

    # Option to kill an experiment (just one experiment, not the full sweep
    # or campaign) if one of its codes fails
    kill_on_partial_failure = True

    # Some pre-processing in the experiment directory.
    # This is performed when the campaign directory is created
    # (before the campaign is launched).
    run_dir_setup_script = None

    # A post-processing script to be run in the experiment directory after
    # the experiment completes, e.g. to remove large files
    run_post_process_script = None

    # umask applied to your directory in the campaign so that colleagues can view files
    umask = '027'

    # Scheduler information: job queue, account-id etc.
    # Leave it as None if running on a local machine.
    scheduler_options = {
        'theta': {'project': '', 'queue': 'batch'},
        'summit': {'project': 'csc143', 'reservation': 'csc143_m414'},
        'rhea': {'project': 'csc143'}
    }

    # Set up your environment (load modules, set LD_LIBRARY_PATH, etc.).
    # Ensure this script is executable.
    app_config_scripts = {'local': 'setup.sh', 'summit': 'env_setup.sh'}

    # PARAMETER SWEEPS
    #-----------------
    # Set up how the workflow is run, and what values to 'sweep' over.
    # Use ParamCmdLineArg to set up a command line arg, ParamCmdLineOption
    # to set up a command line option, and so on.
    sweep1_parameters = [
        p.ParamRunner('producer', 'nprocs', [2]),
        p.ParamRunner('mean_calc', 'nprocs', [2]),
        p.ParamCmdLineArg('producer', 'array_size_per_pe', 1, [1024 * 1024]),  # 1M
        p.ParamCmdLineArg('producer', 'num_steps', 2, [10]),
        p.ParamADIOS2XML('producer', 'engine_sst', 'producer', 'engine',
                         [{"SST": {}}]),
        # p.ParamADIOS2XML('producer', 'compression', 'producer', 'var_operation',
        #                  [{"U": {"zfp": {'accuracy': 0.001, 'tolerance': 0.9}}}]),
    ]

    # Summit node layout:
    # create a shared layout where producer and mean_calc share compute nodes
    shared_node_nc = SummitNode()
    # place the producer on the first socket
    for i in range(21):
        shared_node_nc.cpu[i] = 'producer:{}'.format(i)
    # place the analysis on the second socket
    for i in range(8):
        shared_node_nc.cpu[22 + i] = 'mean_calc:{}'.format(i)

    # DeepThought2 node layout
    # (this should eventually be 'obj = machine.VirtualNode()')
    shared_node_dt = DTH2CPUNode()
    for i in range(10):
        shared_node_dt.cpu[i] = 'producer:{}'.format(i)
    shared_node_dt.cpu[11] = 'mean_calc:0'
    shared_node_dt.cpu[12] = 'mean_calc:1'

    # Create a sweep; node_layout specifies how ranks map onto nodes
    sweep1 = p.Sweep(node_layout={'summit': [shared_node_nc],
                                  'deepthought2_cpu': [shared_node_dt]},
                     parameters=sweep1_parameters,
                     rc_dependency=None)

    # Create a sweep group from the above sweep. You can place multiple
    # sweeps in the group. Each group is submitted as a separate job.
    sweepGroup1 = p.SweepGroup(
        "sg-1",
        walltime=300,
        per_run_timeout=60,
        parameter_groups=[sweep1],
        launch_mode='default',  # or MPMD
        tau_profiling=True,
        tau_tracing=False,
        # optional:
        # nodes=10,
        # run_repetitions=2,  # repeat each experiment this many times
        # component_subdirs=True,  # codes get their own workspace in the experiment directory
        # component_inputs={'simulation': ['some_input_file'],
        #                   'norm_calc': [SymLink('some_large_file')]},  # inputs required by codes
        # max_procs=64,  # max no. of procs to run concurrently; depends on 'nodes'
    )

    # A second group with the same sweep, activated only on Summit
    sweepGroup2 = p.SweepGroup(
        "sg-2",
        walltime=300,
        per_run_timeout=60,
        parameter_groups=[sweep1],
        launch_mode='default',  # or MPMD
        tau_profiling=True,
        tau_tracing=False,
        # optional:
        # nodes=10,
        # run_repetitions=2,  # repeat each experiment this many times
        # component_subdirs=True,  # codes get their own workspace in the experiment directory
        # component_inputs={'simulation': ['some_input_file'],
        #                   'norm_calc': [SymLink('some_large_file')]},  # inputs required by codes
        # max_procs=64,  # max no. of procs to run concurrently; depends on 'nodes'
    )

    # Sweep groups to be activated, keyed by machine
    sweeps = {'MACHINE_ANY': [sweepGroup1], 'summit': [sweepGroup2]}
class ProducerConsumer(Campaign):
    # A name for the campaign
    name = "coupling-example"

    # WORKFLOW SETUP
    #---------------
    # A list of the codes that will be part of the workflow.
    # If there is an adios xml file associated with a code, list it here.
    # 'sleep_after' represents the time gap after which the next code is spawned.
    # Use runner_override to run a code without the default launcher
    # (mpirun/aprun/jsrun etc.); this runs the code as a serial application.
    codes = [
        ("producer", dict(exe="program/producer.py",
                          adios_xml_file='adios2.xml', sleep_after=5)),
    ]

    # CAMPAIGN SETTINGS
    #------------------
    # A list of machines that this campaign is supported on
    supported_machines = ['local', 'titan', 'theta', 'summit',
                          'deepthought2_cpu', 'sdg_tm76']

    # Option to kill an experiment (just one experiment, not the full sweep
    # or campaign) if one of its codes fails
    kill_on_partial_failure = True

    # Some pre-processing in the experiment directory.
    # This is performed when the campaign directory is created
    # (before the campaign is launched).
    run_dir_setup_script = None

    # A post-processing script to be run in the experiment directory after
    # the experiment completes, e.g. to remove large files
    run_post_process_script = None

    # umask applied to your directory in the campaign so that colleagues can view files
    umask = '027'

    # Scheduler information: job queue, account-id etc.
    # Leave it as None if running on a local machine.
    scheduler_options = {
        "cori": {
            "queue": "debug",
            "constraint": "haswell",
            "license": "SCRATCH,project",
        },
        "titan": {
            "queue": "debug",
            "project": "csc242",
        },
        "theta": {
            "queue": "debug-flat-quad",
            "project": "CSC249ADCD01",
        },
        "summit": {
            "project": "csc299",
        }
    }

    # Set up your environment (load modules, set LD_LIBRARY_PATH, etc.).
    # Ensure this script is executable.
    # app_config_scripts = {'local': 'setup.sh', 'summit': 'env_setup.sh'}

    # PARAMETER SWEEPS
    #-----------------
    # Set up how the workflow is run, and what values to 'sweep' over.
    # Use ParamCmdLineArg to set up a command line arg, ParamCmdLineOption
    # to set up a command line option, and so on.
    sweep1_parameters = [
        p.ParamRunner('producer', 'nprocs', [2]),
        p.ParamCmdLineArg('producer', 'array_size_per_pe', 1, [1024 * 1024]),  # 1M
        p.ParamCmdLineArg('producer', 'num_steps', 2, [2]),
        # p.ParamADIOS2XML('producer', 'engine_sst', 'producer', 'engine',
        #                  [{"BP4": {}}]),
    ]

    # Pin the two producer ranks to the first two cores of a Summit node
    node = SummitNode()
    node.cpu[0] = "producer:0"
    node.cpu[1] = "producer:1"
    node_layout = [node]

    # Create a sweep; node_layout specifies how ranks map onto nodes
    sweep1 = p.Sweep(node_layout={'summit': node_layout},
                     parameters=sweep1_parameters,
                     rc_dependency=None)

    # Create a sweep group from the above sweep. You can place multiple
    # sweeps in the group. Each group is submitted as a separate job.
    sweepGroup1 = p.SweepGroup(
        "sg-1",
        walltime=300,
        per_run_timeout=60,
        parameter_groups=[sweep1],
        launch_mode='default',  # or MPMD
        # optional:
        # nodes=10,
        # tau_profiling=True,
        # tau_tracing=False,
        # run_repetitions=2,  # repeat each experiment this many times
        # component_subdirs=True,  # codes get their own workspace in the experiment directory
        # component_inputs={'simulation': ['some_input_file'],
        #                   'norm_calc': [SymLink('some_large_file')]},  # inputs required by codes
        # max_procs=64,  # max no. of procs to run concurrently; depends on 'nodes'
    )

    # Sweep groups to be activated
    # sweeps = {'summit': [sweepGroup1]}
    sweeps = [sweepGroup1]
from codar.savanna.machines import SummitNode
from sweep_groups_helper import create_sweep_groups

# Parameters: 4 nodes for an overnight run
writer_np = [24]
reader_np_ratio = [1]
writers_per_node_summit = [6]
size_per_pe = []
engines = ['bp4', 'hdf5']
run_repetitions = 4
batch_job_timeout_secs = 3600
per_experiment_timeout = 600

# Shared node layout: 6 writer ranks followed by 6 reader ranks per node
n = SummitNode()
for i in range(writers_per_node_summit[0]):
    n.cpu[i] = "{}:{}".format("writer", i)
    n.cpu[i + writers_per_node_summit[0]] = "{}:{}".format("reader", i)
node_layouts = [[n]]

sweep_groups = create_sweep_groups('summit',
                                   writer_np,
                                   reader_np_ratio,
                                   size_per_pe,
                                   engines,
                                   node_layouts,
                                   run_repetitions,
                                   batch_job_timeout_secs,
                                   per_experiment_timeout)
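# With writer_np = 24, a 1:1 reader ratio, and 6 writer plus 6 reader ranks
# per node, each experiment spans 24 / 6 = 4 nodes, matching the comment
# above. A quick check (a standalone sketch, not used by the helper):
writers = writer_np[0]
readers = writers * reader_np_ratio[0]
ranks_per_node = 2 * writers_per_node_summit[0]  # 6 writers + 6 readers
assert (writers + readers) // ranks_per_node == 4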
class CalcECampaign(Campaign):
    """Example campaign for calc_e.py that runs both methods with different
    precision and iteration counts. This could be used to explore the
    convergence rate of each method and the necessary decimal precision
    (and the cost of using the Decimal class with higher precision)."""

    # Used in job names submitted to the scheduler
    name = "e-small-one-node"

    # This application has a single executable, which we give the friendly
    # name 'calc_e' for later reference in parameter specification. The
    # executable path is taken relative to the application directory
    # specified on the cheetah command line.
    codes = [("calc_e", dict(exe="program/calc_e.py"))]

    # Document which machines the campaign is designed to run on. An error
    # will be raised if a different machine is specified on the cheetah
    # command line.
    supported_machines = ['local', 'cori', 'titan', 'theta', 'summit']

    # Per-machine scheduler options. Keys are the machine names, values are
    # dicts of name-value pairs for the options for that machine. Options
    # must be explicitly supported by Cheetah; this is not currently a
    # generic mechanism.
    scheduler_options = {
        "cori": {
            "queue": "debug",
            "constraint": "haswell",
            "license": "SCRATCH,project",
        },
        "titan": {
            "queue": "debug",
            "project": "csc242",
        },
        "theta": {
            "queue": "debug-flat-quad",
            "project": "CSC249ADCD01",
        },
        "summit": {
            "project": "CSC299",
        }
    }

    # Optionally set umask for the campaign directory and all processes
    # spawned by the workflow script when the campaign is run. Note that
    # user rx must be allowed at a minimum. If set, it must be a string
    # suitable for passing to the umask command.
    umask = '027'

    # Single-rank layout: calc_e runs on one core of one Summit node
    node = SummitNode()
    node.cpu[0] = "calc_e:0"
    node_layout = [node]

    # Define the range of command line arguments to pass to the calc_e.py
    # program in each of many runs. Within each Sweep, all possible
    # combinations will be generated and included in the campaign output
    # directory. Because the 'n' parameter has a different meaning for the
    # two methods, we must define a separate Sweep for each method to avoid
    # running 'factorial' with too many iterations.
    sweeps = [
        # A SweepGroup defines a scheduler job. If different numbers of
        # nodes or node configurations are desired, multiple SweepGroups
        # can be used. For most simple cases, only one is needed.
        p.SweepGroup(
            name="all-methods-small",
            nodes=1,
            walltime=timedelta(minutes=30),
            parameter_groups=[
                p.Sweep(
                    node_layout={'summit': node_layout},
                    parameters=[
                        p.ParamCmdLineArg("calc_e", "method", 1, ["pow"]),
                        # Use higher values of n for this method, since it
                        # does a single exponentiation and doesn't iterate
                        # like factorial
                        p.ParamCmdLineArg("calc_e", "n", 2,
                                          [10, 100, 1000, 1000000, 10000000]),
                        p.ParamCmdLineArg("calc_e", "precision", 3,
                                          [64, 128, 256, 512, 1024]),
                    ]),
                p.Sweep(
                    node_layout={'summit': node_layout},
                    parameters=[
                        p.ParamCmdLineArg("calc_e", "method", 1, ["factorial"]),
                        p.ParamCmdLineArg("calc_e", "n", 2, [10, 100, 1000]),
                        # Explore higher precision values for this method
                        p.ParamCmdLineArg("calc_e", "precision", 3,
                                          [64, 128, 256, 512, 1024, 2048, 4096]),
                    ]),
            ]),
    ]
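# Since each Sweep enumerates the cross product of its parameter lists, the
# group above contains 1*5*5 = 25 'pow' runs plus 1*3*7 = 21 'factorial'
# runs, i.e. 46 experiments sharing the single requested node. A quick
# check (a standalone sketch, not part of the campaign):
pow_runs = 1 * 5 * 5          # method x n x precision for 'pow'
factorial_runs = 1 * 3 * 7    # method x n x precision for 'factorial'
assert pow_runs + factorial_runs == 46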
def init(self, yamlfile):
    """
    init() does the Cheetah-related setup. It doesn't require the user to
    write a Cheetah class file. It reads the Kittie config file, and then
    figures out how to subclass Cheetah.
    """
    self.YAMLSetup()

    # Read in the config file
    with open(yamlfile, 'r') as ystream:
        self.config = yaml.load(ystream, Loader=self.OrderedLoader)

    # Kittie allows the user to set their own names for the fields in the
    # config file if they want
    self._KeywordSetup()

    # Include other YAML files
    self._SetIfNotFound(self.config, 'include', [], level=logging.INFO)
    if len(self.config[self.keywords['include']]) > 0:
        try:
            self._MakeReplacements(include=True)
        except KeyError:
            pass
    for filename in self.config[self.keywords['include']]:
        with open(filename, 'r') as ystream:
            # The Loader argument belongs to yaml.load, not dict.update
            self.config.update(yaml.load(ystream, Loader=self.OrderedLoader))

    # Make value replacements -- e.g. when the user writes things like
    # processes-per-node: ${run.xgc.processes}
    self._MakeReplacements()

    # Set defaults if they're not found in the config file
    self._DefaultArgs()

    # Global Cheetah keywords
    self.output_dir = os.path.join(self.config[self.keywords['rundir']], self.cheetahdir)
    self.name = self.config[self.keywords['jobname']]

    # These are my own things, not Cheetah things per se, but are
    # convenient for working with the Cheetah output
    self.mainpath = os.path.realpath(os.path.join(self.output_dir, self.cheetahsub))
    self.machine = self.config['machine']['name']
    machinekeys = self.config['machine'].keys()
    sweepargs = []

    # Machine-based Cheetah options
    self.supported_machines = [self.machine]
    self.node_layout = {self.machine: []}
    self.scheduler_options = {self.machine: {}}
    if 'charge' in machinekeys:
        self.scheduler_options[self.machine]['project'] = self.config['machine']['charge']
    if 'queue' in machinekeys:
        self.scheduler_options[self.machine]['queue'] = self.config['machine']['queue']

    # Cheetah options that set up the codes that will launch
    self.codes = []
    if self.config[self.keywords['mpmd']]:
        self.launchmode = 'MPMD'
        subdirs = False
    else:
        self.launchmode = 'default'
        subdirs = True
    if self.config['machine'][self.keywords['script']] is not None:
        self.launchmode = 'default'
        subdirs = False

    self.stepinfo = {}
    lname = self.keywords['login-proc']
    uselogin = False

    # Insert ADIOS-based names Scott wants
    for k, codename in enumerate(self.codenames):
        thisdir = os.path.dirname(os.path.realpath(__file__))
        updir = os.path.dirname(thisdir)

        if codename == "plot-colormap":
            self.codesetup[codename][self.keywords['path']] = os.path.join(updir, "plot", "plotter-2d.py")
            if "only" in self.codesetup[codename]:
                self.codesetup[codename][self.keywords['args']] += [self.codesetup[codename]["only"]]
                self.codesetup[codename][self.keywords['options']]["only"] = self.codesetup[codename]["only"]
            elif "match-dimensions" in self.codesetup[codename]:
                self.codesetup[codename][self.keywords['args']] += [self.codesetup[codename]["match-dimensions"]]
            if "data" in self.codesetup[codename]:
                self.codesetup[codename]['.plotter'] = {'plots': self.codesetup[codename]["data"]}
            if 'colortype' in self.codesetup[codename]:
                self.codesetup[codename][self.keywords['options']]["colormap"] = self.codesetup[codename]["colortype"]
            if 'viewtype' in self.codesetup[codename]:
                self.codesetup[codename][self.keywords['options']]["type"] = self.codesetup[codename]["viewtype"]
            if ('use' in self.config[self.keywords['dashboard']]) and (self.config[self.keywords['dashboard']]['use']):
                self.codesetup[codename][self.keywords['options']]['use-dashboard'] = 'on'
                if not uselogin:
                    self.config[lname] = self.config[self.keywords['dashboard']]
                    uselogin = True

        if codename == "plot-1D":
            self.codesetup[codename][self.keywords['path']] = os.path.join(updir, "plot", "plotter-1d.py")
            if "x" in self.codesetup[codename]:
                self.codesetup[codename][self.keywords['args']] += [self.codesetup[codename]['x']]
            if "y" in self.codesetup[codename]:
                self.codesetup[codename][self.keywords['options']]['y'] = self.codesetup[codename]['y']
            if "data" in self.codesetup[codename]:
                self.codesetup[codename]['.plotter'] = {'plots': self.codesetup[codename]["data"]}
            if ('use' in self.config[self.keywords['dashboard']]) and (self.config[self.keywords['dashboard']]['use']):
                self.codesetup[codename][self.keywords['options']]['use-dashboard'] = 'on'
                if not uselogin:
                    self.config[lname] = self.config[self.keywords['dashboard']]
                    uselogin = True

    # Give every code a default step group if it doesn't define one
    for codename in self.codenames:
        StepGroup = codename + "-step"
        groupname = "." + StepGroup
        if groupname not in self.codesetup[codename].keys():
            self.codesetup[codename][groupname] = {}
            self.codesetup[codename][groupname][self.keywords['engine']] = 'BP4'
            self.codesetup[codename][groupname][self.keywords['params']] = {}
            self.codesetup[codename][groupname][self.keywords['params']]["RendezvousReaderCount"] = 0
            self.codesetup[codename][groupname][self.keywords['params']]["QueueLimit"] = 1
            self.codesetup[codename][groupname][self.keywords['params']]["QueueFullPolicy"] = "Discard"

    self.timingdir = os.path.join(self.config[self.keywords['rundir']], 'effis-timing')

    for k, codename in enumerate(self.codenames):
        self.codesetup[codename]['groups'] = {}
        for key in self.codesetup[codename]:
            if key.startswith('.'):
                name = key[1:]
                entry = self.codesetup[codename][key]
                self.codesetup[codename]['groups'][name] = self.codesetup[codename][key]
                self.codesetup[codename]['groups'][name]['AddStep'] = False
                self.codesetup[codename]['groups'][name]['timingdir'] = self.timingdir

    # Insert ADIOS-based names Scott wants
    for k, codename in enumerate(self.codenames):
        for key in self.codesetup[codename]['groups']:
            entry = self.codesetup[codename]['groups'][key]

            if self.keywords['filename'] in entry:
                fname = entry[self.keywords['filename']]
                if not fname.startswith('/'):
                    if self.launchmode != "MPMD":
                        fname = os.path.join(self.mainpath, codename, fname)
                    else:
                        fname = os.path.join(self.mainpath, fname)
                self.codesetup[codename]['groups'][key]['filename'] = fname

            if self.keywords['engine'] in entry:
                self.codesetup[codename]['groups'][key]['engine'] = entry[self.keywords['engine']]
            if self.keywords['params'] in entry:
                self.codesetup[codename]['groups'][key]['params'] = entry[self.keywords['params']]

    # See if we're linking anything from other groups
    for k, codename in enumerate(self.codenames):
        for key in self.codesetup[codename]['groups']:
            entry = self.codesetup[codename]['groups'][key]

            if 'plots' in entry:
                if 'reads' not in entry:
                    entry['reads'] = entry['plots']
                else:
                    entry['reads'] += entry['plots']
                thisdir = os.path.dirname(os.path.realpath(__file__))
                mfile = os.path.join(os.path.dirname(thisdir), "plot", "matplotlibrc")
                if mfile not in self.codesetup[codename][self.keywords['copy']]:
                    self.codesetup[codename][self.keywords['copy']] += [mfile]

            if 'reads' in entry:
                code, group = entry['reads'].split('.', 1)
                other = self.codesetup[code]['groups'][group]
                if ('filename' not in other) and (('fromcode' not in other) or (not other['fromcode'])):
                    raise ValueError("If you're going to read {0}.{1} you need to set its filename when it writes".format(code, group))
                elif 'filename' in other:
                    self.codesetup[codename]['groups'][key]['filename'] = self.codesetup[code]['groups'][group]['filename']
                    if self.launchmode != "MPMD":
                        self.codesetup[codename]['groups'][key]['stepfile'] = os.path.join(self.mainpath, code, code + '-step.bp')

                if 'engine' in other:
                    self.codesetup[codename]['groups'][key]['engine'] = self.codesetup[code]['groups'][group]['engine']
                if 'params' in other:
                    self.codesetup[codename]['groups'][key]['params'] = self.codesetup[code]['groups'][group]['params']
                self.codesetup[code]['groups'][group]['AddStep'] = True

    # Add the login (dashboard) pseudo-code if requested
    if ('use' in self.config[lname]) and self.config[lname]['use']:
        uselogin = True
        self.codenames += [lname]
        self.codesetup[lname] = {}
        self.codesetup[lname][self.keywords['args']] = []
        self.codesetup[lname]['scheduler_args'] = None
        self.codesetup[lname][self.keywords['options']] = {}
        self.codesetup[lname][self.keywords['path']] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "plot", "login.py")
        self.codesetup[lname]['processes'] = 1
        self.codesetup[lname]['processes-per-node'] = 1
        #self.codesetup[lname]['cpus-per-process'] = 1
        self.codesetup[lname][self.keywords['copy']] = []
        self.codesetup[lname][self.keywords['copycontents']] = []
        self.codesetup[lname][self.keywords['link']] = []
        self.codesetup[lname][self.keywords['file-edit']] = {}
        self.codesetup[lname]['groups'] = {}

        self.stepinfo['login'] = {}
        for name in ['shot_name', 'run_name', 'http']:
            if name not in self.config[lname]:
                msg = "{0} is required with {1} on. Exiting".format(name, lname)
                self.logger.error(msg)
                sys.exit(1)
            self.stepinfo['login'][name] = self.config[lname][name]
        self.stepinfo['login']['username'] = getpass.getuser()
        self.stepinfo['login']['date'] = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S+%f')
        self.stepinfo['login']['machine_name'] = self.config['machine']['name']

    for k, codename in enumerate(self.codenames):
        for j, groupname in enumerate(self.codesetup[codename]['groups']):
            if 'plots' in self.codesetup[codename]['groups'][groupname]:
                cname, gname = self.codesetup[codename]['groups'][groupname]['plots'].split('.', 1)
                name = "{0}-{1}-StepsDone.bp".format(codename, gname)
                if self.launchmode != "MPMD":
                    name = os.path.join(self.mainpath, codename, name)
                else:
                    name = os.path.join(self.mainpath, name)
                self.stepinfo['{0}.{1}'.format(cname, gname)] = name

    if "monitors" in self.codenames:
        self.codesetup["monitors"][self.keywords['path']] = os.path.join(updir, "bin", "kittie_monitor.py")
        self.monitors = {}
        self.monitors["monitors"] = self.codesetup["monitors"]
        self.monitors['groups'] = {}
        for cname in self.codenames:
            self.monitors['groups'][cname] = self.codesetup[cname]['groups']

    SharedNodes = {}
    added = []
    for k, codename in enumerate(self.codenames):
        codedict = {}
        codedict['exe'] = self.codesetup[codename][self.keywords['path']]

        # Added in node-layout branch
        if codename in [lname, "monitors"]:
            codedict['runner_override'] = True
        self.codes.append((codename, codedict))
        self.codesetup[codename]['setup-file'] = os.path.join(os.path.dirname(self.codesetup[codename][self.keywords['path']]), ".kittie-setup.nml")

        # Set the number of processes
        sweepargs += [cheetah.parameters.ParamRunner(codename, "nprocs", [self.codesetup[codename]['processes']])]

        # Set the node layout -- namely, it's different on Summit
        entry = self.codesetup[codename]
        ns = self.keywords['node-share']
        sn = self.keywords['share-nodes']
        cpp = 1
        if 'cpus-per-process' in entry:
            cpp = entry['cpus-per-process']

        if self.machine == 'summit':
            if len(self.config[sn]) == 0:
                # No node-sharing groups: each code gets its own node type
                self.node_layout[self.machine] += [SummitNode()]
                added += [codename]
                index = -1
                CPUstart = 0
            else:
                # Check whether this code shares a node with a code that
                # has already been placed
                found = False
                for group in self.config[sn]:
                    cname = group[0]
                    if (codename in group) and (cname in SharedNodes):
                        found = True
                        break
                if found:
                    for i, name in enumerate(added):
                        if name == cname:
                            index = i
                            break
                    CPUstart = SharedNodes[cname]
                else:
                    cname = codename
                    index = -1
                    added += [codename]
                    CPUstart = 0
                    self.node_layout[self.machine] += [SummitNode()]
                SharedNodes[cname] = CPUstart + entry['processes-per-node'] * cpp

            for i in range(entry['processes-per-node']):
                for j in range(cpp):
                    self.node_layout[self.machine][index].cpu[CPUstart + i*cpp + j] = "{0}:{1}".format(codename, i)
                print(index, "{0}:{1}".format(codename, i), added)
                # This isn't exactly right yet
                if ('use-gpus' in entry) and entry['use-gpus']:
                    self.node_layout[self.machine][index].gpu[i] = ["{0}:{1}".format(codename, i)]
        else:
            self.node_layout[self.machine] += [{codename: entry['processes-per-node']}]

        # Set the command line arguments
        args = self.codesetup[codename][self.keywords['args']]
        for i, arg in enumerate(args):
            sweepargs += [cheetah.parameters.ParamCmdLineArg(codename, "arg{0}".format(i + 1), i + 1, [arg])]

        # Set the command line options
        options = dict(self.codesetup[codename][self.keywords['options']])
        for i, option in enumerate(options.keys()):
            sweepargs += [cheetah.parameters.ParamCmdLineOption(codename, "opt{0}".format(i), "--{0}".format(option), [options[option]])]

        if self.config['machine'][self.keywords['scheduler_args']] is not None:
            sweepargs += [cheetah.parameters.ParamSchedulerArgs(codename, [dict(self.config['machine'][self.keywords['scheduler_args']])])]
        if self.codesetup[codename][self.keywords['scheduler_args']] is not None:
            sweepargs += [cheetah.parameters.ParamSchedulerArgs(codename, [dict(self.codesetup[codename][self.keywords['scheduler_args']])])]

        exedir = os.path.dirname(self.codesetup[codename][self.keywords['path']])
        sweepenv1 = cheetah.parameters.ParamEnvVar(codename, 'setup-file-yaml', 'KITTIE_YAML_FILE', [os.path.join(exedir, ".kittie-setup.yaml")])
        sweepenv2 = cheetah.parameters.ParamEnvVar(codename, 'setup-file-nml', 'KITTIE_NML_FILE', [os.path.join(exedir, ".kittie-setup.nml")])
        sweepenv3 = cheetah.parameters.ParamEnvVar(codename, 'setup-file-num', 'KITTIE_NUM', ['{0}'.format(k)])
        sweepargs += [sweepenv1, sweepenv2, sweepenv3]

    if uselogin and ('env' in self.config[lname]):
        for varname in self.config[lname]['env'].keys():
            sweepenv = cheetah.parameters.ParamEnvVar(lname, lname + "-" + varname, varname, [self.config[lname]['env'][varname]])
            sweepargs += [sweepenv]

    if ('ADIOS-serial' in self.config) and ('monitors' in self.codenames):
        sweepenv = cheetah.parameters.ParamEnvVar('monitors', "monitors-ADIOS", "ADIOS", [self.config['ADIOS-serial']])
        sweepargs += [sweepenv]

    # A sweep is a set of parameters that can vary.
    # In this case nothing varies, and the only sweep parameter is a single
    # number of processes.
    sweep = cheetah.parameters.Sweep(sweepargs, node_layout=self.node_layout)

    # A SweepGroup runs a sweep by submitting a single job. There could be
    # more than one SweepGroup, given by the 'sweeps' list attribute, which
    # would submit multiple independent jobs.
    sweepgroup = cheetah.parameters.SweepGroup(self.groupname,
                                               walltime=self.config[self.keywords['walltime']],
                                               parameter_groups=[sweep],
                                               component_subdirs=subdirs,
                                               launch_mode=self.launchmode)
    self.sweeps = [sweepgroup]

    if self.config['machine']['job_setup'] is not None:
        self.app_config_scripts = {self.machine: os.path.realpath(self.config['machine']['job_setup'])}
    if self.config['machine']['submit_setup'] is not None:
        self.run_dir_setup_script = os.path.realpath(self.config['machine']['submit_setup'])