def scoop_supervisor(hostfile, args): """Function used by the scoop supervisor to get the information needed to form the scoop call to run a scoop version of the program """ import os import stat import sys from metawards.utils import Console Console.print("RUNNING A SCOOP PROGRAM") cores_per_node = get_cores_per_node(args) Console.print( f"Will run jobs assuming {cores_per_node} cores per compute node") # based on the number of threads requested and the number of cores # per node, we can work out the number of scoop processes to start, # and can write a hostfile that will create the right layout nthreads = get_threads_per_task(args) Console.print(f"Will use {nthreads} OpenMP threads per model run...") tasks_per_node = int(cores_per_node / nthreads) Console.print(f"...meaning that the number of model runs per node will be " f"{tasks_per_node}") # Next, read the hostfile to get a unique list of hostnames hostnames = {} with open(hostfile, "r") as FILE: line = FILE.readline() while line: hostname = line.strip() if len(hostname) > 0: hostnames[hostname] = 1 line = FILE.readline() hostnames = list(hostnames.keys()) hostnames.sort() Console.print(f"Number of compute nodes equals {len(hostnames)}") Console.print(", ".join(hostnames)) # how many tasks can we perform in parallel? nprocs = tasks_per_node * len(hostnames) if args.nprocs: if nprocs != args.nprocs: Console.warning( f"You are using a not-recommended number of " f"processes {args.nprocs} for the cluster {nprocs}.") nprocs = args.nprocs Console.print( f"Total number of parallel processes to run will be {nprocs}") Console.print(f"Total number of cores in use will be {nprocs*nthreads}") # Now write a new hostfile that round-robins the MPI tasks over # the nodes for 'tasks_per_node' runs hostfile = f"_metawards_hostfile_{os.getpid()}" Console.print(f"Writing hostfile to {hostfile}") with open(hostfile, "w") as FILE: i = 0 while i < nprocs: for hostname in hostnames: FILE.write(hostname + "\n") i += 1 if i == nprocs: break # now craft the scoop command that will use this hostfile to # run the job - remember to pass the option to stop the main process # attempting to become a supervisor itself... import subprocess import shlex pyexe = sys.executable script = os.path.abspath(sys.argv[0]) args = " ".join(sys.argv[1:]) # also need to tell the main program the number of processes # as it can't work it out itself cmd = f"{pyexe} -m scoop --hostfile {hostfile} -n {nprocs} " \ f"{script} --already-supervised {args} --nprocs {nprocs}" Console.print("Executing scoop job using") Console.command(cmd) try: args = shlex.split(cmd) subprocess.run(args).check_returncode() except Exception as e: Console.error("ERROR: Something went wrong!") Console.error(f"{e.__class__}: {e}") sys.exit(-1) # clean up the hostfile afterwards... (we leave it if something # went wrong as it may help debugging) os.unlink(hostfile) Console.print("Scoop processes completed successfully")
def cli(): """Main function for the command line interface. This does one of three things: 1. If this is the main process, then it parses the arguments and runs and manages the jobs 2. If this is a worker process, then it starts up and waits for work 3. If this is a supervisor process, then it query the job scheduling system for information about the compute nodes to use, and will then set up and run a manager (main) process that will use those nodes to run the jobs """ from metawards.utils import Console # get the parallel scheme now before we import any other modules # so that it is clear if mpi4py or scoop (or another parallel module) # has been imported via the required "-m module" syntax parallel_scheme = get_parallel_scheme() if parallel_scheme == "mpi4py": from mpi4py import MPI comm = MPI.COMM_WORLD nprocs = comm.Get_size() rank = comm.Get_rank() if rank != 0: # this is a worker process, so should not do anything # more until it is given work in the pool Console.print(f"Starting worker process {rank+1} of {nprocs-1}...") return else: Console.print("Starting main process...") elif parallel_scheme == "scoop": Console.print("STARTING SCOOP PROCESS") import sys args, parser = parse_args() if not args.already_supervised: hostfile = get_hostfile(args) if hostfile: # The user has asked to run a parallel job - this means that this # process is the parallel supervisor if args.mpi: mpi_supervisor(hostfile, args) return elif args.scoop: scoop_supervisor(hostfile, args) return # neither is preferred - if scoop is installed then use that try: import scoop # noqa - disable unused warning have_scoop = True except Exception: have_scoop = False if have_scoop: scoop_supervisor(hostfile, args) return # do we have MPI? try: import mpi4py # noqa - disable unused warning have_mpi4py = True except Exception: have_mpi4py = False if have_mpi4py: mpi_supervisor(hostfile, args) return # we don't have any other option, just keep going and # use multiprocessing - in this case we don't need a # supervisor and this is the main process # This is now the code for the main process # WE NEED ONE OF these listed options; should_run = False for arg in [ args.input, args.repeats, args.disease, args.additional, args.model, args.iterator, args.extractor, args.demographics, args.mixer, args.mover ]: if arg is not None: should_run = True break if not should_run: parser.print_help(sys.stdout) sys.exit(0) if args.repeats is None: args.repeats = [1] # import the parameters here to speed up the display of help from metawards import Parameters, Network, Population, print_version_string # print the version information first, so that there is enough # information to enable someone to reproduce this run print_version_string() Console.rule("Initialise") if args.input: # get the line numbers of the input file to read if args.line is None or len(args.line) == 0: linenums = None Console.print(f"* Using parameters from all lines of {args.input}", markdown=True) else: from metawards.utils import string_to_ints linenums = string_to_ints(args.line) if len(linenums) == 0: Console.error(f"You cannot read no lines from {args.input}?") sys.exit(-1) elif len(linenums) == 1: Console.print( f"* Using parameters from line {linenums[0]} of " f"{args.input}", markdown=True) else: Console.print( f"* Using parameters from lines {linenums} of " f"{args.input}", markdown=True) from metawards import VariableSets, VariableSet variables = VariableSets.read(filename=args.input, line_numbers=linenums) else: from metawards import VariableSets, VariableSet # create a VariableSets with one null VariableSet variables = VariableSets() variables.append(VariableSet()) nrepeats = args.repeats if nrepeats is None or len(nrepeats) < 1: nrepeats = [1] if len(nrepeats) > 1 and len(variables) != len(nrepeats): Console.error(f"The number of repeats {len(nrepeats)} must equal the " f"number of adjustable variable lines {len(variables)}") raise ValueError("Disagreement in the number of repeats and " "adjustable variables") # ensure that all repeats are >= 0 nrepeats = [0 if int(x) < 0 else int(x) for x in nrepeats] if sum(nrepeats) == 0: Console.error(f"The number of the number of repeats is 0. Are you " f"sure that you don't want to run anything?") raise ValueError("Cannot run nothing") if len(nrepeats) == 1 and nrepeats[0] == 1: Console.print("* Performing a single run of each set of parameters", markdown=True) elif len(nrepeats) == 1: Console.print( f"* Performing {nrepeats[0]} runs of each set of parameters", markdown=True) else: Console.print( f"* Performing {nrepeats} runs applied to the parameters", markdown=True) variables = variables.repeat(nrepeats) # working out the number of processes and threads... from metawards.utils import guess_num_threads_and_procs (nthreads, nprocs) = guess_num_threads_and_procs(njobs=len(variables), nthreads=args.nthreads, nprocs=args.nprocs, parallel_scheme=parallel_scheme) Console.print( f"\n* Number of threads to use for each model run is {nthreads}", markdown=True) if nprocs > 1: Console.print( f"* Number of processes used to parallelise model " f"runs is {nprocs}", markdown=True) Console.print( f"* Parallelisation will be achieved using {parallel_scheme}", markdown=True) # sort out the random number seed seed = args.seed if seed is None: import random seed = random.randint(10000, 99999999) if seed == 0: # this is a special mode that a developer can use to force # all jobs to use the same random number seed (15324) that # is used for comparing outputs. This should NEVER be used # for production code Console.warning("Using special mode to fix all random number" "seeds to 15324. DO NOT USE IN PRODUCTION!!!") else: Console.print(f"* Using random number seed {seed}", markdown=True) # get the starting day and date start_day = args.start_day if start_day < 0: raise ValueError(f"You cannot use a start day {start_day} that is " f"less than zero!") start_date = None if args.start_date: try: from dateparser import parse start_date = parse(args.start_date).date() except Exception: pass if start_date is None: from datetime import date try: start_date = date.fromisoformat(args.start_date) except Exception as e: raise ValueError(f"Cannot interpret a valid date from " f"'{args.start_date}'. Error is " f"{e.__class__} {e}") if start_date is None: from datetime import date start_date = date.today() Console.print(f"* Day zero is {start_date.strftime('%A %B %d %Y')}", markdown=True) if start_day != 0: from datetime import timedelta start_day_date = start_date + timedelta(days=start_day) Console.print(f"Starting on day {start_day}, which is " f"{start_day_date.strftime('%A %B %d %Y')}") else: start_day_date = start_date # now find the MetaWardsData repository as this will be needed # for the repeat command line too (repository, repository_version) = Parameters.get_repository(args.repository) Console.print(f"* Using MetaWardsData at {repository}", markdown=True) if repository_version["is_dirty"]: Console.warning("This repository is dirty, meaning that the data" "has not been committed to git. This may make " "this calculation very difficult to reproduce") # now work out the minimum command line needed to repeat this job args.seed = seed args.nprocs = nprocs args.nthreads = nthreads args.start_date = start_date.isoformat() args.repository = repository # also print the source of all inputs import configargparse Console.rule("Souce of inputs") p = configargparse.get_argument_parser("main") Console.print(p.format_values()) # print out the command used to repeat this job repeat_cmd = "metawards" for key, value in vars(args).items(): if value is not None: k = key.replace("_", "-") if isinstance(value, bool): if value: repeat_cmd += f" --{k}" elif isinstance(value, list): repeat_cmd += f" --{k}" for val in value: v = str(val) if " " in v: repeat_cmd += f" '{v}''" else: repeat_cmd += f" {v}" else: v = str(value) if " " in v: repeat_cmd += f" --{k} '{v}''" else: repeat_cmd += f" --{k} {v}" Console.rule("Repeating this run") Console.print("To repeat this job use the command;") Console.command(repeat_cmd) Console.print("Or alternatively use the config.yaml file that will be " "written to the output directory and use the command;") Console.command("metawards -c config.yaml") # load all of the parameters try: params = Parameters.load(parameters=args.parameters) except Exception as e: Console.warning( f"Unable to load parameter files. Make sure that you have " f"cloned the MetaWardsData repository and have set the " f"environment variable METAWARDSDATA to point to the " f"local directory containing the repository, e.g. the " f"default is $HOME/GitHub/MetaWardsData") raise e # should we profile the code? (default no as it prints a lot) profiler = None if args.no_profile: profiler = None elif args.profile: from metawards.utils import Profiler profiler = Profiler() # load the disease and starting-point input files Console.rule("Disease") if args.disease: params.set_disease(args.disease) else: params.set_disease("ncov") Console.rule("Model data") if args.model: params.set_input_files(args.model) else: params.set_input_files("2011Data") # load the user-defined custom parameters Console.rule("Custom parameters and seeds") if args.user_variables: custom = VariableSet.read(args.user_variables) Console.print(f"Adjusting variables to {custom}") custom.adjust(params) else: Console.print("Not adjusting any parameters...") # read the additional seeds if args.additional is None or len(args.additional) == 0: Console.print("Not using any additional seeds...") else: for additional in args.additional: Console.print(f"Loading additional seeds from {additional}") params.add_seeds(additional) # what to do with the 0 state? stage_0 = "R" if args.disable_star: Console.print("Disabling the * state. Stage 0 is the one and " "only E state.") stage_0 = "disable" elif args.star_is_E: Console.print("Setting the * state as an additional E state.") stage_0 = "E" else: Console.print("Setting the * state as an additional R state.") stage_0 = "R" params.stage_0 = stage_0 # extra parameters that are set params.UV = args.UV # set these extra parameters to 0 params.static_play_at_home = 0 params.play_to_work = 0 params.work_to_play = 0 params.daily_imports = 0.0 Console.rule("Parameters") Console.print(params, markdown=True) # the size of the starting population population = Population(initial=args.population, date=start_day_date, day=start_day) Console.rule("Building the network") network = Network.build(params=params, population=population, max_nodes=args.max_nodes, max_links=args.max_links, profiler=profiler) if args.demographics: from metawards import Demographics Console.rule("Specialising into demographics") demographics = Demographics.load(args.demographics) Console.print(demographics) network = network.specialise(demographics, profiler=profiler, nthreads=nthreads) Console.rule("Preparing to run") from metawards import OutputFiles from metawards.utils import run_models outdir = args.output if outdir is None: outdir = "output" if args.force_overwrite_output: prompt = None else: from metawards import input def prompt(x): return input(x, default="y") auto_bzip = True if args.auto_bzip: auto_bzip = True elif args.no_auto_bzip: auto_bzip = False if args.iterator: iterator = args.iterator else: iterator = None if args.extractor: extractor = args.extractor else: extractor = None if args.mixer: mixer = args.mixer else: mixer = None if args.mover: mover = args.mover else: mover = None with OutputFiles(outdir, force_empty=args.force_overwrite_output, auto_bzip=auto_bzip, prompt=prompt) as output_dir: # write the config file for this job to output/config.yaml Console.rule("Running the model") CONSOLE = output_dir.open("console.log") Console.save(CONSOLE) lines = [] max_keysize = None for key, value in vars(args).items(): if max_keysize is None: max_keysize = len(key) elif len(key) > max_keysize: max_keysize = len(key) for key, value in vars(args).items(): if value is not None: key = key.replace("_", "-") spaces = " " * (max_keysize - len(key)) if isinstance(value, bool): if value: lines.append(f"{key}:{spaces} true") else: lines.append(f"{key}:{spaces} false") elif isinstance(value, list): s_value = [str(x) for x in value] lines.append(f"{key}:{spaces} [ {', '.join(s_value)} ]") else: lines.append(f"{key}:{spaces} {value}") CONFIG = output_dir.open("config.yaml", auto_bzip=False) lines.sort(key=str.swapcase) CONFIG.write("\n".join(lines)) CONFIG.write("\n") CONFIG.flush() CONFIG.close() lines = None result = run_models(network=network, variables=variables, population=population, nprocs=nprocs, nthreads=nthreads, seed=seed, nsteps=args.nsteps, output_dir=output_dir, iterator=iterator, extractor=extractor, mixer=mixer, mover=mover, profiler=profiler, parallel_scheme=parallel_scheme) if result is None or len(result) == 0: Console.print("No output - end of run") return 0 Console.rule("End of the run", style="finish") Console.save(CONSOLE) return 0
def advance_initial_seeds(network, population, infections, profiler, rngs, **kwargs): # extract user parameters params = network.params # extract files name for initial seeding probabilities ward_seed_filename = params.user_params["ward_seed_filename"] age_seed_filename = params.user_params["age_seed_filename"] time_seed_filename = params.user_params["time_seed_filename"] # start profiler p = profiler.start("additional_seeds") # set up lookups or read from cache age_probs_ind, age_probs = read_age_file(age_seed_filename) ward_probs_ind, ward_probs_trust, ward_probs = read_seed_file( ward_seed_filename) time_seed_date, time_seed_trust, time_seed_count = read_time_file( time_seed_filename) # extract current date date = population.date # filter to extract number of seeds filter_time_seed = [i == date for i in time_seed_date] time_seed_count = time_seed_count[filter_time_seed] if len(time_seed_count) > 0: # loop over trusts time_seed_trust = time_seed_trust[filter_time_seed] for j in range(len(time_seed_trust)): # extract trust trust = time_seed_trust[j] # extract wards filter_wards = [i == trust for i in ward_probs_trust] tward_probs = ward_probs[filter_wards] tward_probs_ind = ward_probs_ind[filter_wards] # extract number of seeds nseeds = time_seed_count[j] # select seeds in age-classes at random according to initial probabilities age_seeds = np.random.multinomial(nseeds, age_probs) # run over each age demographic for demographic in range(len(age_seeds)): # check if any seeding done in demographic if age_seeds[demographic] > 0: # select seeds in wards at random according to initial probabilities seeds = np.random.multinomial(age_seeds[demographic], tward_probs) # now seed infections for i in range(len(seeds)): ward = tward_probs_ind[i] num = seeds[i] if num > 0: seed_network = network.subnets[demographic] seed_wards = seed_network.nodes seed_infections = infections.subinfs[ demographic].play try: ward = seed_network.get_node_index(ward) if seed_wards.play_suscept[ward] == 0: Console.warning( f"Cannot seed {num} infection(s) in ward {ward} " f"as there are no susceptibles remaining" ) continue elif seed_wards.play_suscept[ward] < num: Console.warning( f"Not enough susceptibles in ward to see all {num}" ) num = seed_wards.play_suscept[ward] seed_wards.play_suscept[ward] -= num if demographic is not None: Console.print( f"seeding demographic {demographic} " f"play_infections[0][{ward}] += {num}") else: Console.print( f"seeding play_infections[0][{ward}] += {num}" ) seed_infections[0][ward] += num except Exception as e: Console.error( f"Unable to seed the infection using {seed}. The " f"error was {e.__class__}: {e}. Please double-check " f"that you are trying to seed a node that exists " f"in this network.") raise e # end profiler p.stop()