Ejemplo n.º 1
0
def scoop_supervisor(hostfile, args):
    """Supervise a scoop run: work out the process layout from the
       scheduler-provided hostfile, write a scoop-specific hostfile and
       then launch this same script under ``python -m scoop``

       Args:
           hostfile: Path to a text file with one hostname per line. Blank
               lines are ignored and repeated hostnames are collapsed.
           args: The parsed command line arguments. ``args.nprocs`` is read
               directly; the object is also handed to
               ``get_cores_per_node`` and ``get_threads_per_task``.

       The process exits with ``sys.exit(-1)`` if the hostfile lists no
       hostnames or if the scoop job fails. On failure the generated
       hostfile is deliberately left on disk to help debugging.
    """
    import itertools
    import os
    import shlex
    import subprocess
    import sys
    from metawards.utils import Console

    Console.print("RUNNING A SCOOP PROGRAM")

    cores_per_node = get_cores_per_node(args)

    Console.print(
        f"Will run jobs assuming {cores_per_node} cores per compute node")

    # based on the number of threads requested and the number of cores
    # per node, we can work out the number of scoop processes to start,
    # and can write a hostfile that will create the right layout
    nthreads = get_threads_per_task(args)

    Console.print(f"Will use {nthreads} OpenMP threads per model run...")

    tasks_per_node = int(cores_per_node / nthreads)

    Console.print(f"...meaning that the number of model runs per node will be "
                  f"{tasks_per_node}")

    # Next, read the hostfile to get a sorted, unique list of hostnames
    with open(hostfile, "r") as FILE:
        stripped = (line.strip() for line in FILE)
        hostnames = sorted({name for name in stripped if name})

    if not hostnames:
        # without any hosts there is nothing to round-robin over (the
        # old write loop would spin forever if --nprocs was given)
        Console.error(f"No hostnames were found in the hostfile '{hostfile}'")
        sys.exit(-1)

    Console.print(f"Number of compute nodes equals {len(hostnames)}")
    Console.print(", ".join(hostnames))

    # how many tasks can we perform in parallel?
    nprocs = tasks_per_node * len(hostnames)

    if args.nprocs:
        if nprocs != args.nprocs:
            Console.warning(
                f"You are using a not-recommended number of "
                f"processes {args.nprocs} for the cluster {nprocs}.")

        # the user's explicit choice always wins
        nprocs = args.nprocs

    Console.print(
        f"Total number of parallel processes to run will be {nprocs}")
    Console.print(f"Total number of cores in use will be {nprocs*nthreads}")

    # Now write a new hostfile that round-robins the MPI tasks over
    # the nodes for 'tasks_per_node' runs
    scoop_hostfile = f"_metawards_hostfile_{os.getpid()}"
    Console.print(f"Writing hostfile to {scoop_hostfile}")

    with open(scoop_hostfile, "w") as FILE:
        # one line per process, cycling through the hosts in order
        round_robin = itertools.islice(itertools.cycle(hostnames),
                                       max(nprocs, 0))
        for hostname in round_robin:
            FILE.write(hostname + "\n")

    # now craft the scoop command that will use this hostfile to
    # run the job - remember to pass the option to stop the main process
    # attempting to become a supervisor itself...
    #
    # The command is built as an argument list (never joined into a string
    # and re-split) so that user arguments containing spaces or quotes
    # reach the child process unchanged.
    #
    # also need to tell the main program the number of processes
    # as it can't work it out itself
    cmd_args = [sys.executable, "-m", "scoop",
                "--hostfile", scoop_hostfile,
                "-n", str(nprocs),
                os.path.abspath(sys.argv[0]),
                "--already-supervised",
                *sys.argv[1:],
                "--nprocs", str(nprocs)]

    Console.print("Executing scoop job using")
    # quote only for display, so that the printed command can be copied
    Console.command(" ".join(shlex.quote(arg) for arg in cmd_args))

    try:
        subprocess.run(cmd_args, check=True)
    except Exception as e:
        Console.error("ERROR: Something went wrong!")
        Console.error(f"{e.__class__}: {e}")
        sys.exit(-1)

    # clean up the hostfile afterwards... (we leave it if something
    # went wrong as it may help debugging)
    os.unlink(scoop_hostfile)

    Console.print("Scoop processes completed successfully")
Ejemplo n.º 2
0
def cli():
    """Main function for the command line interface. This does one of three
       things:

       1. If this is the main process, then it parses the arguments and
          runs and manages the jobs

       2. If this is a worker process, then it starts up and waits for work

       3. If this is a supervisor process, then it queries the job scheduling
          system for information about the compute nodes to use, and will then
          set up and run a manager (main) process that will use those
          nodes to run the jobs

       Returns:
           0 once the main process has finished running the models, or
           None if this process was an mpi4py worker or a supervisor.

       Raises:
           ValueError: If the repeats disagree with the number of variable
               sets, if nothing would be run, if the start day is negative
               or if the start date cannot be interpreted.
    """
    from metawards.utils import Console

    # get the parallel scheme now before we import any other modules
    # so that it is clear if mpi4py or scoop (or another parallel module)
    # has been imported via the required "-m module" syntax
    parallel_scheme = get_parallel_scheme()

    if parallel_scheme == "mpi4py":
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        nprocs = comm.Get_size()
        rank = comm.Get_rank()

        if rank != 0:
            # this is a worker process, so should not do anything
            # more until it is given work in the pool. Workers have
            # ranks 1..nprocs-1, so the rank is already the 1-based
            # worker number
            Console.print(f"Starting worker process {rank} of {nprocs-1}...")
            return
        else:
            Console.print("Starting main process...")

    elif parallel_scheme == "scoop":
        Console.print("STARTING SCOOP PROCESS")

    import shlex
    import sys

    args, parser = parse_args()

    if not args.already_supervised:
        hostfile = get_hostfile(args)
        if hostfile:
            # The user has asked to run a parallel job - this means that this
            # process is the parallel supervisor
            if args.mpi:
                mpi_supervisor(hostfile, args)
                return
            elif args.scoop:
                scoop_supervisor(hostfile, args)
                return

            # neither is preferred - if scoop is installed then use that
            try:
                import scoop  # noqa - disable unused warning
                have_scoop = True
            except Exception:
                have_scoop = False

            if have_scoop:
                scoop_supervisor(hostfile, args)
                return

            # do we have MPI?
            try:
                import mpi4py  # noqa - disable unused warning
                have_mpi4py = True
            except Exception:
                have_mpi4py = False

            if have_mpi4py:
                mpi_supervisor(hostfile, args)
                return

            # we don't have any other option, just keep going and
            # use multiprocessing - in this case we don't need a
            # supervisor and this is the main process

    # This is now the code for the main process

    # WE NEED ONE OF these listed options;
    should_run = any(arg is not None for arg in [
        args.input, args.repeats, args.disease, args.additional,
        args.model, args.iterator, args.extractor, args.demographics,
        args.mixer, args.mover
    ])

    if not should_run:
        parser.print_help(sys.stdout)
        sys.exit(0)

    # note that this default is written back into args, so it also shows
    # up in the repeat command and config.yaml produced below
    if args.repeats is None:
        args.repeats = [1]

    # import the parameters here to speed up the display of help
    from metawards import Parameters, Network, Population, print_version_string

    # print the version information first, so that there is enough
    # information to enable someone to reproduce this run
    print_version_string()

    Console.rule("Initialise")

    from metawards import VariableSets, VariableSet

    if args.input:
        # get the line numbers of the input file to read
        if args.line is None or len(args.line) == 0:
            linenums = None
            Console.print(f"* Using parameters from all lines of {args.input}",
                          markdown=True)
        else:
            from metawards.utils import string_to_ints
            linenums = string_to_ints(args.line)

            if len(linenums) == 0:
                Console.error(f"You cannot read no lines from {args.input}?")
                sys.exit(-1)
            elif len(linenums) == 1:
                Console.print(
                    f"* Using parameters from line {linenums[0]} of "
                    f"{args.input}",
                    markdown=True)
            else:
                Console.print(
                    f"* Using parameters from lines {linenums} of "
                    f"{args.input}",
                    markdown=True)

        variables = VariableSets.read(filename=args.input,
                                      line_numbers=linenums)
    else:
        # create a VariableSets with one null VariableSet
        variables = VariableSets()
        variables.append(VariableSet())

    nrepeats = args.repeats

    if nrepeats is None or len(nrepeats) < 1:
        nrepeats = [1]

    if len(nrepeats) > 1 and len(variables) != len(nrepeats):
        Console.error(f"The number of repeats {len(nrepeats)} must equal the "
                      f"number of adjustable variable lines {len(variables)}")
        raise ValueError("Disagreement in the number of repeats and "
                         "adjustable variables")

    # ensure that all repeats are >= 0
    nrepeats = [max(0, int(x)) for x in nrepeats]

    if sum(nrepeats) == 0:
        Console.error("The total number of repeats is 0. Are you "
                      "sure that you don't want to run anything?")
        raise ValueError("Cannot run nothing")

    if len(nrepeats) == 1 and nrepeats[0] == 1:
        Console.print("* Performing a single run of each set of parameters",
                      markdown=True)
    elif len(nrepeats) == 1:
        Console.print(
            f"* Performing {nrepeats[0]} runs of each set of parameters",
            markdown=True)
    else:
        Console.print(
            f"* Performing {nrepeats} runs applied to the parameters",
            markdown=True)

    variables = variables.repeat(nrepeats)

    # working out the number of processes and threads...
    from metawards.utils import guess_num_threads_and_procs
    (nthreads,
     nprocs) = guess_num_threads_and_procs(njobs=len(variables),
                                           nthreads=args.nthreads,
                                           nprocs=args.nprocs,
                                           parallel_scheme=parallel_scheme)

    Console.print(
        f"\n* Number of threads to use for each model run is {nthreads}",
        markdown=True)

    if nprocs > 1:
        Console.print(
            f"* Number of processes used to parallelise model "
            f"runs is {nprocs}",
            markdown=True)
        Console.print(
            f"* Parallelisation will be achieved using {parallel_scheme}",
            markdown=True)

    # sort out the random number seed
    seed = args.seed

    if seed is None:
        import random
        seed = random.randint(10000, 99999999)

    if seed == 0:
        # this is a special mode that a developer can use to force
        # all jobs to use the same random number seed (15324) that
        # is used for comparing outputs. This should NEVER be used
        # for production code
        Console.warning("Using special mode to fix all random number "
                        "seeds to 15324. DO NOT USE IN PRODUCTION!!!")
    else:
        Console.print(f"* Using random number seed {seed}", markdown=True)

    # get the starting day and date
    start_day = args.start_day

    if start_day < 0:
        raise ValueError(f"You cannot use a start day {start_day} that is "
                         f"less than zero!")

    from datetime import date

    start_date = None

    if args.start_date:
        # dateparser is tried first on a best-effort basis: if it is not
        # installed, or cannot parse the string, we silently fall back
        # to strict ISO parsing below
        try:
            from dateparser import parse
            start_date = parse(args.start_date).date()
        except Exception:
            pass

        if start_date is None:
            try:
                start_date = date.fromisoformat(args.start_date)
            except Exception as e:
                raise ValueError(f"Cannot interpret a valid date from "
                                 f"'{args.start_date}'. Error is "
                                 f"{e.__class__} {e}")

    if start_date is None:
        start_date = date.today()

    Console.print(f"* Day zero is {start_date.strftime('%A %B %d %Y')}",
                  markdown=True)

    if start_day != 0:
        from datetime import timedelta
        start_day_date = start_date + timedelta(days=start_day)
        Console.print(f"Starting on day {start_day}, which is "
                      f"{start_day_date.strftime('%A %B %d %Y')}")
    else:
        start_day_date = start_date

    # now find the MetaWardsData repository as this will be needed
    # for the repeat command line too
    (repository,
     repository_version) = Parameters.get_repository(args.repository)

    Console.print(f"* Using MetaWardsData at {repository}", markdown=True)

    if repository_version["is_dirty"]:
        Console.warning("This repository is dirty, meaning that the data "
                        "has not been committed to git. This may make "
                        "this calculation very difficult to reproduce")

    # now work out the minimum command line needed to repeat this job.
    # The resolved values are written back into args so that both the
    # repeat command and config.yaml record what was actually used
    args.seed = seed
    args.nprocs = nprocs
    args.nthreads = nthreads
    args.start_date = start_date.isoformat()
    args.repository = repository

    # also print the source of all inputs
    import configargparse
    Console.rule("Source of inputs")
    p = configargparse.get_argument_parser("main")
    Console.print(p.format_values())

    # print out the command used to repeat this job. Values are passed
    # through shlex.quote so that anything containing spaces (or other
    # shell-special characters) can be pasted straight back into a shell
    repeat_parts = ["metawards"]

    for key, value in vars(args).items():
        if value is None:
            continue

        k = key.replace("_", "-")

        if isinstance(value, bool):
            # flags are only emitted when they are switched on
            if value:
                repeat_parts.append(f"--{k}")
        elif isinstance(value, list):
            repeat_parts.append(f"--{k}")
            repeat_parts.extend(shlex.quote(str(val)) for val in value)
        else:
            repeat_parts.append(f"--{k}")
            repeat_parts.append(shlex.quote(str(value)))

    repeat_cmd = " ".join(repeat_parts)

    Console.rule("Repeating this run")
    Console.print("To repeat this job use the command;")
    Console.command(repeat_cmd)
    Console.print("Or alternatively use the config.yaml file that will be "
                  "written to the output directory and use the command;")
    Console.command("metawards -c config.yaml")

    # load all of the parameters
    try:
        params = Parameters.load(parameters=args.parameters)
    except Exception:
        Console.warning(
            "Unable to load parameter files. Make sure that you have "
            "cloned the MetaWardsData repository and have set the "
            "environment variable METAWARDSDATA to point to the "
            "local directory containing the repository, e.g. the "
            "default is $HOME/GitHub/MetaWardsData")
        raise

    # should we profile the code? (default no as it prints a lot)
    # --no-profile takes precedence over --profile
    profiler = None

    if args.profile and not args.no_profile:
        from metawards.utils import Profiler
        profiler = Profiler()

    # load the disease and starting-point input files
    Console.rule("Disease")
    if args.disease:
        params.set_disease(args.disease)
    else:
        params.set_disease("ncov")

    Console.rule("Model data")
    if args.model:
        params.set_input_files(args.model)
    else:
        params.set_input_files("2011Data")

    # load the user-defined custom parameters
    Console.rule("Custom parameters and seeds")
    if args.user_variables:
        custom = VariableSet.read(args.user_variables)
        Console.print(f"Adjusting variables to {custom}")
        custom.adjust(params)
    else:
        Console.print("Not adjusting any parameters...")

    # read the additional seeds
    if args.additional is None or len(args.additional) == 0:
        Console.print("Not using any additional seeds...")
    else:
        for additional in args.additional:
            Console.print(f"Loading additional seeds from {additional}")
            params.add_seeds(additional)

    # what to do with the 0 state?
    if args.disable_star:
        Console.print("Disabling the * state. Stage 0 is the one and "
                      "only E state.")
        stage_0 = "disable"
    elif args.star_is_E:
        Console.print("Setting the * state as an additional E state.")
        stage_0 = "E"
    else:
        Console.print("Setting the * state as an additional R state.")
        stage_0 = "R"

    params.stage_0 = stage_0

    # extra parameters that are set
    params.UV = args.UV

    # set these extra parameters to 0
    params.static_play_at_home = 0
    params.play_to_work = 0
    params.work_to_play = 0
    params.daily_imports = 0.0

    Console.rule("Parameters")
    Console.print(params, markdown=True)

    # the size of the starting population
    population = Population(initial=args.population,
                            date=start_day_date,
                            day=start_day)

    Console.rule("Building the network")
    network = Network.build(params=params,
                            population=population,
                            max_nodes=args.max_nodes,
                            max_links=args.max_links,
                            profiler=profiler)

    if args.demographics:
        from metawards import Demographics
        Console.rule("Specialising into demographics")
        demographics = Demographics.load(args.demographics)
        Console.print(demographics)

        network = network.specialise(demographics,
                                     profiler=profiler,
                                     nthreads=nthreads)

    Console.rule("Preparing to run")
    from metawards import OutputFiles
    from metawards.utils import run_models

    outdir = args.output

    if outdir is None:
        outdir = "output"

    if args.force_overwrite_output:
        prompt = None
    else:
        from metawards import input

        def prompt(x):
            return input(x, default="y")

    # compression is on unless --no-auto-bzip is given without --auto-bzip
    auto_bzip = bool(args.auto_bzip) or not args.no_auto_bzip

    # normalise any unset/empty plugin choices to None
    iterator = args.iterator or None
    extractor = args.extractor or None
    mixer = args.mixer or None
    mover = args.mover or None

    with OutputFiles(outdir,
                     force_empty=args.force_overwrite_output,
                     auto_bzip=auto_bzip,
                     prompt=prompt) as output_dir:
        # write the config file for this job to output/config.yaml
        Console.rule("Running the model")
        CONSOLE = output_dir.open("console.log")
        Console.save(CONSOLE)

        # pad every key to the width of the longest so the values line up
        max_keysize = max((len(key) for key in vars(args)), default=0)

        lines = []

        for key, value in vars(args).items():
            if value is None:
                continue

            key = key.replace("_", "-")
            spaces = " " * (max_keysize - len(key))

            if isinstance(value, bool):
                lines.append(f"{key}:{spaces} {'true' if value else 'false'}")
            elif isinstance(value, list):
                s_value = [str(x) for x in value]
                lines.append(f"{key}:{spaces} [ {', '.join(s_value)} ]")
            else:
                lines.append(f"{key}:{spaces} {value}")

        CONFIG = output_dir.open("config.yaml", auto_bzip=False)
        lines.sort(key=str.swapcase)
        CONFIG.write("\n".join(lines))
        CONFIG.write("\n")
        CONFIG.flush()
        CONFIG.close()

        result = run_models(network=network,
                            variables=variables,
                            population=population,
                            nprocs=nprocs,
                            nthreads=nthreads,
                            seed=seed,
                            nsteps=args.nsteps,
                            output_dir=output_dir,
                            iterator=iterator,
                            extractor=extractor,
                            mixer=mixer,
                            mover=mover,
                            profiler=profiler,
                            parallel_scheme=parallel_scheme)

        if result is None or len(result) == 0:
            Console.print("No output - end of run")
            return 0

        Console.rule("End of the run", style="finish")

        Console.save(CONSOLE)

    return 0
Ejemplo n.º 3
0
def advance_initial_seeds(network, population, infections, profiler, rngs,
                          **kwargs):
    """Seed new infections for the current date using the ward, age and
       time seeding files named in the network's user parameters

       For every row of the time file whose date equals
       ``population.date``, the row's seed count is split at random over
       the age demographics (weighted by the age file) and then over the
       wards belonging to that row's trust (weighted by the ward file).
       Each resulting count is moved from ``play_suscept`` of the matching
       demographic sub-network into stage 0 of its ``play`` infections.

       Args:
           network: Network to seed. ``network.params.user_params`` must
               contain ``ward_seed_filename``, ``age_seed_filename`` and
               ``time_seed_filename``, and ``network.subnets`` is indexed
               by demographic.
           population: Only ``population.date`` is read.
           infections: ``infections.subinfs[demographic].play`` is updated
               in place.
           profiler: Profiler used to time this function.
           rngs: Unused here.
           **kwargs: Unused here.

       Raises:
           Exception: Any error raised while seeding a ward is logged via
               ``Console.error`` and then re-raised unchanged.
    """
    # NOTE(review): the draws below use numpy's global random state and
    # not the supplied 'rngs' - confirm that this is intended, as it may
    # affect the reproducibility of runs

    # extract user parameters
    params = network.params

    # extract files name for initial seeding probabilities
    ward_seed_filename = params.user_params["ward_seed_filename"]
    age_seed_filename = params.user_params["age_seed_filename"]
    time_seed_filename = params.user_params["time_seed_filename"]

    # start profiler
    p = profiler.start("additional_seeds")

    # set up lookups or read from cache
    _age_probs_ind, age_probs = read_age_file(age_seed_filename)
    ward_probs_ind, ward_probs_trust, ward_probs = read_seed_file(
        ward_seed_filename)
    time_seed_date, time_seed_trust, time_seed_count = read_time_file(
        time_seed_filename)

    # keep only the rows of the time file that apply to the current date
    date = population.date
    is_today = [seed_date == date for seed_date in time_seed_date]
    todays_counts = time_seed_count[is_today]
    todays_trusts = time_seed_trust[is_today]

    # loop over the trusts that have seeds today (nothing to do if none)
    for trust, nseeds in zip(todays_trusts, todays_counts):
        # extract the wards (and their probabilities) for this trust
        in_trust = [ward_trust == trust for ward_trust in ward_probs_trust]
        tward_probs = ward_probs[in_trust]
        tward_probs_ind = ward_probs_ind[in_trust]

        # select seeds in age-classes at random according to
        # initial probabilities
        age_seeds = np.random.multinomial(nseeds, age_probs)

        # run over each age demographic
        for demographic, n_age_seeds in enumerate(age_seeds):
            # skip demographics that received no seeds
            if n_age_seeds <= 0:
                continue

            # select seeds in wards at random according to
            # initial probabilities
            seeds = np.random.multinomial(n_age_seeds, tward_probs)

            # these depend only on the demographic, so look them up once
            seed_network = network.subnets[demographic]
            seed_wards = seed_network.nodes
            seed_infections = infections.subinfs[demographic].play

            # now seed infections
            for ward_id, num in zip(tward_probs_ind, seeds):
                if num <= 0:
                    continue

                try:
                    # translate the ward identifier from the seed file
                    # into the node index used by this sub-network
                    ward = seed_network.get_node_index(ward_id)

                    if seed_wards.play_suscept[ward] == 0:
                        Console.warning(
                            f"Cannot seed {num} infection(s) in ward {ward} "
                            f"as there are no susceptibles remaining"
                        )
                        continue

                    elif seed_wards.play_suscept[ward] < num:
                        Console.warning(
                            f"Not enough susceptibles in ward to seed all {num}"
                        )
                        # seed as many as are still available
                        num = seed_wards.play_suscept[ward]

                    seed_wards.play_suscept[ward] -= num

                    Console.print(
                        f"seeding demographic {demographic} "
                        f"play_infections[0][{ward}] += {num}")

                    seed_infections[0][ward] += num

                except Exception as e:
                    # report using names that exist in this scope - the
                    # old message referenced an undefined 'seed', which
                    # would have hidden the real error behind a NameError
                    Console.error(
                        f"Unable to seed {num} infection(s) in ward "
                        f"{ward_id} of demographic {demographic}. The "
                        f"error was {e.__class__}: {e}. Please double-check "
                        f"that you are trying to seed a node that exists "
                        f"in this network.")
                    raise

    # end profiler
    p.stop()