Example #1
def queue_experiments(config_file, force_duplicates):
    seml_config, slurm_config, experiment_config = db_utils.read_config(
        config_file)

    # Set Slurm config with default parameters as fall-back option
    default_slurm_config = get_default_slurm_config()
    for k, v in default_slurm_config['sbatch_options'].items():
        if k not in slurm_config['sbatch_options']:
            slurm_config['sbatch_options'][k] = v
    del default_slurm_config['sbatch_options']
    for k, v in default_slurm_config.items():
        if k not in slurm_config:
            slurm_config[k] = v

    slurm_config['sbatch_options'] = utils.remove_dashes(
        slurm_config['sbatch_options'])

    collection = db_utils.get_collection(seml_config['db_collection'])

    configs = generate_configs(experiment_config)

    if not force_duplicates:
        len_before = len(configs)
        configs = filter_experiments(collection, configs)
        len_after = len(configs)
        if len_after != len_before:
            print(
                f"{len_before - len_after} of {len_before} experiment{s_if(len_before)} were already found "
                f"in the database. They were not added again.")

    # Add the configurations to the database with QUEUED status.
    if len(configs) > 0:
        queue_configs(collection, seml_config, slurm_config, configs)
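A minimal usage sketch for this function. The import path is an assumption (only the queue_experiments(config_file, force_duplicates) signature is taken from the example above), and the YAML filename is a placeholder:

# Hedged usage sketch; the import path is assumed and 'experiments.yaml'
# is a placeholder config file.
from seml.queuing import queue_experiments  # assumed module path

# Read the YAML config, fill missing Slurm options with defaults, and add
# every generated configuration to the MongoDB collection as QUEUED,
# skipping configurations that are already in the database.
queue_experiments("experiments.yaml", force_duplicates=False)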
Example #2
def queue_experiments(config_file, force_duplicates):
    tracking_config, _, experiment_config = db_utils.read_config(config_file)
    collection = db_utils.get_collection(tracking_config['db_collection'])

    configs = generate_configs(experiment_config)

    if not force_duplicates:
        len_before = len(configs)
        configs = filter_experiments(collection, configs)
        len_after = len(configs)
        if len_after != len_before:
            print(f"{len_before - len_after} of {len_before} experiment{s_if(len_before)} were already found "
                  f"in the database. They were not added again.")

    # Add the configurations to the database with QUEUED status.
    if len(configs) > 0:
        queue_configs(collection, tracking_config, configs)
Example #3
def start_slurm_job(exps, log_verbose, output_dir=".", sbatch_options=None):
    """Run a list of experiments as a job on the Slurm cluster.

    Parameters
    ----------
    exps: List[dict]
        List of experiments to run.
    log_verbose: bool
        Print the Python commands before running them.
    output_dir: str
        Directory (relative to home directory) where to store the slurm output files.
    sbatch_options: dict
        A dictionary that contains options for #SBATCH, e.g., {'--mem': 8000} to limit the job's memory to 8,000 MB.

    Returns
    -------
    None

    """
    id_strs = [str(exp['_id']) for exp in exps]
    job_name = f"{exps[0]['tracking']['db_collection']}_{','.join(id_strs)}"
    output_dir_path = os.path.abspath(os.path.expanduser(output_dir))
    if not os.path.isdir(output_dir_path):
        raise ValueError(
            f"Slurm output directory '{output_dir_path}' does not exist.")

    sbatch_dict = get_default_sbatch_dict()
    if sbatch_options is not None:
        sbatch_dict.update(sbatch_options)
    sbatch_dict['--job-name'] = job_name
    sbatch_dict['--output'] = f'{output_dir_path}/slurm-%j.out'

    script = "#!/bin/bash\n"

    for key, value in sbatch_dict.items():
        if key in ['--partition', '-p'] and isinstance(value, list):
            script += f"#SBATCH {key}={','.join(value)}\n"
        else:
            script += f"#SBATCH {key}={value}\n"

    script += "\n"
    script += "cd ${SLURM_SUBMIT_DIR} \n"
    script += "echo Starting job ${SLURM_JOBID} \n"
    script += "echo SLURM assigned me these nodes:\n"
    script += "squeue -j ${SLURM_JOBID} -O nodelist | tail -n +2\n"

    collection = db_utils.get_collection(exps[0]['tracking']['db_collection'])

    if "conda_environment" in exps[0]['tracking']:
        script += "CONDA_BASE=$(conda info --base)\n"
        script += "source $CONDA_BASE/etc/profile.d/conda.sh\n"
        script += f"conda activate {exps[0]['tracking']['conda_environment']}\n"

    check_file = check_cancelled.__file__
    script += "process_ids=() \n"
    script += f"exp_ids=({' '.join([str(e['_id']) for e in exps])}) \n"
    for ix, exp in enumerate(exps):
        cmd = get_cmd_from_exp_dict(exp)
        collection_str = exp['tracking']['db_collection']
        script += f"python {check_file} --experiment_id {exp['_id']} --database_collection {collection_str}\n"
        script += "ret=$?\n"
        script += "if [ $ret -eq 0 ]\n"
        script += "then\n"
        script += f"    {cmd}  & \n"
        script += f"    process_ids[{ix}]=$!\n"

        script += "elif [ $ret -eq 1 ]\n"
        script += "then\n"
        script += f"    echo WARNING: Experiment with ID {exp['_id']} has status INTERRUPTED and will not be run. \n"
        script += "elif [ $ret -eq 2 ]\n"
        script += "then\n"
        script += f"    (>&2 echo ERROR: Experiment with id {exp['_id']} not found in the database.)\n"
        script += "fi\n"

        collection.update_one({'_id': exp['_id']},
                              {'$set': {
                                  'status': 'PENDING'
                              }})
        collection.update_one({'_id': exp['_id']}, {
            '$set': {
                'slurm': dict(sbatch_options=sbatch_options, step_id=ix)
            }
        })

        if log_verbose:
            print(f'Running the following command:\n {cmd}')

    script += f"echo Experiments are running under the following process IDs:\n"
    script += f"num_it=${{#process_ids[@]}}\n"
    script += f"for ((i=0; i<$num_it; i++))\n"
    script += f"do\n"
    script += f"    echo \"Experiment ID: ${{exp_ids[$i]}}\tProcess ID: ${{process_ids[$i]}}\"\n"
    script += f"done\n"
    script += f"wait \n"

    random_int = np.random.randint(0, 999999)
    path = f"/tmp/{random_int}.sh"
    while os.path.exists(path):
        random_int = np.random.randint(0, 999999)
        path = f"/tmp/{random_int}.sh"
    with open(path, "w") as f:
        f.write(script)

    output = subprocess.check_output(f'sbatch {path}', shell=True)
    os.remove(path)
    slurm_job_id = int(output.split(b' ')[-1])
    for exp in exps:
        collection.update_one({'_id': exp['_id']}, {
            '$set': {
                'slurm.id': slurm_job_id,
                'slurm.output_file':
                f"{output_dir_path}/slurm-{slurm_job_id}.out"
            }
        })
        if log_verbose:
            print(f"Started experiment with ID {slurm_job_id}")
Example #4
def do_work(collection_name,
            log_verbose,
            slurm=True,
            num_exps=-1,
            slurm_config=None,
            filter_dict=None):
    """Pull queued experiments from the database and run them.

    Parameters
    ----------
    collection_name: str
        Name of the collection in the MongoDB.
    log_verbose: bool
        Print the Python commands before running them.
    slurm: bool
        Use the Slurm cluster.
    num_exps: int, default: -1
        If >0, will only submit the specified number of experiments to the cluster.
        This is useful when you only want to test your setup.
    slurm_config: dict
        Settings for the Slurm job. See `start_slurm_job` for details.
    filter_dict: dict
        Dictionary for filtering the entries in the collection.

    Returns
    -------
    None
    """

    if slurm_config is None:
        # Default Slurm config.
        slurm_config = {'output_dir': '.', 'experiments_per_job': 1}
    if filter_dict is None:
        filter_dict = {}

    collection = db_utils.get_collection(collection_name)

    query_dict = {'status': {"$in": ['QUEUED']}}
    query_dict.update(filter_dict)

    if collection.count_documents(query_dict) <= 0:
        print("No queued experiments.")
        return

    exps_list = list(collection.find(query_dict))

    # Divide experiments into chunks of <experiments_per_job> that will be run in parallel on one GPU.
    def chunk_list(seq, size):
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    nexps = num_exps if num_exps > 0 else len(exps_list)
    exp_chunks = chunk_list(exps_list[:nexps],
                            size=slurm_config['experiments_per_job'])

    njobs = math.ceil(nexps / slurm_config['experiments_per_job'])
    del slurm_config['experiments_per_job']

    if slurm:
        print(f"Starting {nexps} experiment{s_if(nexps)} in "
              f"{njobs} Slurm job{s_if(njobs)}.")
    else:
        print(f"Starting {nexps} experiment{s_if(nexps)} locally.")
        for exp in exps_list[:nexps]:
            collection.update_one({'_id': exp['_id']},
                                  {'$set': {
                                      'status': 'PENDING'
                                  }})

    for ix, exps in tqdm(enumerate(exp_chunks), total=njobs):
        if slurm:
            start_slurm_job(exps, log_verbose, **slurm_config)
        else:
            if 'fileserver' in os.uname()[1]:
                raise ValueError(
                    "Refusing to run a compute experiment on a file server. "
                    "Please use a GPU machine or slurm.")
            for exp in exps:
                cmd = get_cmd_from_exp_dict(exp)
                if log_verbose:
                    print(f'Running the following command:\n {cmd}')
                # pdb works with check_call but not with check_output. Maybe because of stdout/stdin.
                subprocess.check_call(cmd, shell=True)
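A hedged invocation sketch for this variant. The collection name, filter field, and Slurm values are placeholders; slurm_config must contain experiments_per_job, and its remaining entries are forwarded to start_slurm_job:

# Submit at most 4 queued experiments to Slurm, packing 2 per job.
# All names and values below are placeholders.
do_work(
    collection_name="my_experiments",
    log_verbose=True,
    slurm=True,
    num_exps=4,
    slurm_config={
        'output_dir': '~/slurm-output',
        'experiments_per_job': 2,
        'sbatch_options': {'--gres': 'gpu:1', '--mem': 16000},
    },
    filter_dict={'config.dataset': 'cora'},
)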
Example #5
import argparse

from seml import database_utils as db_utils

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        "Check whether the experiment with given ID has been cancelled before its start.",
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument("--experiment_id",
                        type=int,
                        help="The experiment ID to check in the database.")
    parser.add_argument("--database_collection",
                        type=str,
                        help="The collection in the database to use.")
    args = parser.parse_args()

    exp_id = args.experiment_id
    db_collection = args.database_collection

    mongodb_config = db_utils.get_mongodb_config()
    collection = db_utils.get_collection(db_collection, mongodb_config)

    exp = collection.find_one({'_id': exp_id})

    if exp is None:
        exit(2)
    if exp['status'] not in ["QUEUED", "PENDING"]:
        exit(1)
    else:
        exit(0)
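The exit codes of this script are what the batch script generated in Example #3 branches on: 0 lets the experiment start, 1 means it is no longer QUEUED/PENDING and is skipped, and 2 means it is missing from the database. A hedged Python sketch of that branching; the script path and identifiers are placeholders:

# Sketch of the check the generated Slurm script runs before each experiment.
# 'check_cancelled.py' and the identifiers below are placeholders.
import subprocess
import sys

ret = subprocess.call([sys.executable, "check_cancelled.py",
                       "--experiment_id", "1",
                       "--database_collection", "my_experiments"])
if ret == 0:
    print("Experiment may start.")
elif ret == 1:
    print("Experiment is no longer QUEUED/PENDING; skipping it.")
elif ret == 2:
    print("Experiment not found in the database.", file=sys.stderr)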
Example #6
def do_work(collection_name,
            log_verbose,
            slurm=True,
            unobserved=False,
            post_mortem=False,
            num_exps=-1,
            filter_dict=None,
            dry_run=False):
    """Pull queued experiments from the database and run them.

    Parameters
    ----------
    collection_name: str
        Name of the collection in the MongoDB.
    log_verbose: bool
        Print the Python commands before running them.
    slurm: bool
        Use the Slurm cluster.
    unobserved: bool
        Disable all Sacred observers (nothing written to MongoDB).
    post_mortem: bool
        Activate post-mortem debugging.
    num_exps: int, default: -1
        If >0, will only submit the specified number of experiments to the cluster.
        This is useful when you only want to test your setup.
    filter_dict: dict
        Dictionary for filtering the entries in the collection.
    dry_run: bool
        Just return the executables and configurations instead of running them.

    Returns
    -------
    None
    """

    if filter_dict is None:
        filter_dict = {}

    collection = db_utils.get_collection(collection_name)

    query_dict = {'status': {"$in": ['QUEUED']}}
    query_dict.update(filter_dict)

    if collection.count_documents(query_dict) <= 0:
        print("No queued experiments.")
        return

    exps_list = list(collection.find(query_dict))

    nexps = num_exps if num_exps > 0 else len(exps_list)
    exp_chunks = db_utils.chunk_list(exps_list[:nexps])
    njobs = len(exp_chunks)

    if dry_run:
        configs = []
        for exps in exp_chunks:
            for exp in exps:
                configs.append(
                    get_config_from_exp(exp,
                                        log_verbose=log_verbose,
                                        unobserved=unobserved,
                                        post_mortem=post_mortem))
        return configs
    elif slurm:
        print(f"Starting {nexps} experiment{s_if(nexps)} in "
              f"{njobs} Slurm job{s_if(njobs)}.")

        for exps in tqdm(exp_chunks):
            slurm_config = exps[0]['slurm']
            seml_config = exps[0]['seml']
            if 'output_dir' in slurm_config:
                warnings.warn(
                    "'output_dir' has moved from 'slurm' to 'seml'. Please adapt your YAML accordingly "
                    "by moving the 'output_dir' parameter from 'slurm' to 'seml'."
                )
            elif 'output_dir' in seml_config:
                slurm_config['output_dir'] = seml_config['output_dir']
            del slurm_config['experiments_per_job']
            start_slurm_job(collection, exps, log_verbose, unobserved,
                            post_mortem, **slurm_config)
    else:
        login_node_name = 'fs'
        if login_node_name in os.uname()[1]:
            raise ValueError(
                "Refusing to run a compute experiment on a login node. "
                "Please use Slurm or a compute node.")

        print(
            f'Starting local worker thread that will run up to {nexps} experiments, '
            f'until no queued experiments remain.')
        collection.update_many(query_dict, {"$set": {"status": "PENDING"}})
        num_exceptions = 0
        i_exp = 0

        tq = tqdm(exp_chunks)
        for exps in tq:
            for exp in exps:
                exe, config = get_config_from_exp(exp,
                                                  log_verbose=log_verbose,
                                                  unobserved=unobserved,
                                                  post_mortem=post_mortem)

                cmd = f"python {exe} with {' '.join(config)}"

                if not unobserved:
                    # check also whether PENDING experiments have their Slurm ID set, in this case they are waiting
                    # for Slurm execution and we don't start them locally.
                    db_entry = collection.find_one_and_update(
                        filter={
                            '_id': exp['_id'],
                            'status': 'PENDING',
                            'slurm.id': {
                                '$exists': False
                            }
                        },
                        update={
                            '$set': {
                                'seml.command': cmd,
                                'status': 'RUNNING'
                            }
                        },
                        upsert=False)
                    if db_entry is None:
                        # Another worker already claimed this experiment (it is no longer
                        # PENDING), or it is waiting for Slurm execution, so we skip it.
                        continue

                if log_verbose:
                    print(f'Running the following command:\n {cmd}')
                try:
                    output_dir = "."
                    slurm_config = exps[0]['slurm']
                    seml_config = exps[0]['seml']
                    if 'output_dir' in slurm_config:
                        warnings.warn(
                            "'output_dir' has moved from 'slurm' to 'seml'. Please adapt your YAML accordingly "
                            "by moving the 'output_dir' parameter from 'slurm' to 'seml'."
                        )
                        output_dir = slurm_config['output_dir']
                    if 'output_dir' in seml_config:
                        output_dir = seml_config['output_dir']
                    output_dir_path = os.path.abspath(
                        os.path.expanduser(output_dir))
                    exp_name = slurm_config['name']

                    output_file = f"{output_dir_path}/{exp_name}_{exp['_id']}-out.txt"
                    collection.find_and_modify(
                        {'_id': exp['_id']},
                        {"$set": {
                            "seml.output_file": output_file
                        }})

                    with open(output_file, "w") as log_file:
                        # pdb works with check_call but not with check_output. Maybe because of stdout/stdin.
                        subprocess.check_call(
                            cmd,
                            shell=True,
                            stderr=log_file,
                            stdout=log_file,
                        )

                except subprocess.CalledProcessError as e:
                    num_exceptions += 1
                except IOError:
                    print(f"Log file {output_file} could not be written.")
                    # Since Sacred is never called in case of I/O error, we need to set the experiment state manually.
                    collection.find_one_and_update(
                        filter={'_id': exp['_id']},
                        update={'$set': {
                            'status': 'FAILED'
                        }},
                        upsert=False)
                finally:
                    i_exp += 1
                    tq.set_postfix(
                        failed=f"{num_exceptions}/{i_exp} experiments")
Example #7
def do_work(collection_name,
            log_verbose,
            slurm=True,
            num_exps=-1,
            filter_dict=None):
    """Pull queued experiments from the database and run them.

    Parameters
    ----------
    collection_name: str
        Name of the collection in the MongoDB.
    log_verbose: bool
        Print the Python commands before running them.
    slurm: bool
        Use the Slurm cluster.
    num_exps: int, default: -1
        If >0, will only submit the specified number of experiments to the cluster.
        This is useful when you only want to test your setup.
    filter_dict: dict
        Dictionary for filtering the entries in the collection.

    Returns
    -------
    None
    """

    if filter_dict is None:
        filter_dict = {}

    collection = db_utils.get_collection(collection_name)

    query_dict = {'status': {"$in": ['QUEUED']}}
    query_dict.update(filter_dict)

    if collection.count_documents(query_dict) <= 0:
        print("No queued experiments.")
        return

    exps_list = list(collection.find(query_dict))

    nexps = num_exps if num_exps > 0 else len(exps_list)
    exp_chunks = db_utils.chunk_list(exps_list[:nexps])
    njobs = len(exp_chunks)

    if slurm:
        print(f"Starting {nexps} experiment{s_if(nexps)} in "
              f"{njobs} Slurm job{s_if(njobs)}.")

        for exps in tqdm(exp_chunks):
            slurm_config = exps[0]['slurm']
            del slurm_config['experiments_per_job']
            start_slurm_job(collection, exps, log_verbose, **slurm_config)
    else:
        login_node_name = 'fs'
        if login_node_name in os.uname()[1]:
            raise ValueError(
                "Refusing to run a compute experiment on a login node. "
                "Please use Slurm or a compute node.")

        print(f"Starting {nexps} experiment{s_if(nexps)} locally.")
        for exp in exps_list[:nexps]:
            collection.update_one({'_id': exp['_id']},
                                  {'$set': {
                                      'status': 'PENDING'
                                  }})

        for exps in tqdm(exp_chunks):
            for exp in exps:
                cmd = get_cmd_from_exp_dict(exp)
                if log_verbose:
                    print(f'Running the following command:\n {cmd}')
                # pdb works with check_call but not with check_output. Maybe because of stdout/stdin.
                subprocess.check_call(cmd, shell=True)
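Since filter_dict is merged directly into the MongoDB query (query_dict.update(filter_dict)), any valid query operators can be used to restrict which queued experiments are run. A small hedged sketch; the collection and field names are placeholders:

# Run, locally, only the queued experiments whose config sets a learning
# rate below 0.01; 'config.lr' is an illustrative field name.
do_work("my_experiments", log_verbose=True, slurm=False,
        filter_dict={'config.lr': {'$lt': 0.01}})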