Esempio n. 1
0
def reset_states(config_file, sacred_id, filter_states, batch_id, filter_dict):
    """
    Reset the state of a single experiment or of all experiments matching a filter.

    Parameters
    ----------
    config_file: str
        Configuration file from which the database collection is obtained.
    sacred_id: int or None
        ID of the experiment to reset (matched against the `_id` field). If None, the remaining
        arguments are used to select the experiments to reset.
    filter_states: list of strings
        States to filter for. If it contains any of PENDING, RUNNING or KILLED, `detect_killed` is run
        before the filter is evaluated.
    batch_id: int or None
        Batch ID forwarded to `db_utils.build_filter_dict`.
    filter_dict: dict or None
        Additional filter forwarded to `db_utils.build_filter_dict`.

    Returns
    -------
    None

    Raises
    ------
    LookupError
        If `sacred_id` is given but no experiment with that ID exists.
    SystemExit
        If ten or more experiments are selected and the user does not confirm with "y".

    """
    # `exit()` is injected by the `site` module for interactive use only; `sys.exit()` is always available.
    import sys

    collection = db_utils.get_collection_from_config(config_file)

    if sacred_id is not None:
        exp = collection.find_one({'_id': sacred_id})
        if exp is None:
            raise LookupError(f"No experiment found with ID {sacred_id}.")
        print(f"Resetting the state of experiment with ID {sacred_id}.")
        reset_experiment(collection, exp)
        return

    # Refresh the stored states first when the filter depends on them.
    if {'PENDING', 'RUNNING', 'KILLED'} & set(filter_states):
        detect_killed(config_file, verbose=False)

    filter_dict = db_utils.build_filter_dict(filter_states, batch_id, filter_dict)

    nreset = collection.count_documents(filter_dict)
    exps = collection.find(filter_dict)

    # Ask for confirmation before touching a large number of experiments.
    if nreset >= 10:
        answer = input(f"Resetting the state of {nreset} experiment{s_if(nreset)}. "
                       f"Are you sure? (y/n) ")
        if answer.lower() != "y":
            sys.exit()
    else:
        print(f"Resetting the state of {nreset} experiment{s_if(nreset)}.")

    for exp in exps:
        reset_experiment(collection, exp)
Esempio n. 2
0
def delete_experiments(config_file, sacred_id, filter_states, batch_id,
                       filter_dict):
    """
    Delete a single experiment or all experiments matching a filter from the database collection.

    Parameters
    ----------
    config_file: str
        Configuration file from which the database collection is obtained.
    sacred_id: int or None
        ID of the experiment to delete (matched against the `_id` field). If None, the remaining
        arguments are used to select the entries to delete.
    filter_states: list of strings
        States to filter for. If it contains any of PENDING, RUNNING or KILLED, `detect_killed` is run
        before the filter is evaluated.
    batch_id: int or None
        Batch ID forwarded to `db_utils.build_filter_dict`.
    filter_dict: dict or None
        Additional filter forwarded to `db_utils.build_filter_dict`.

    Returns
    -------
    None

    Raises
    ------
    LookupError
        If `sacred_id` is given but no experiment with that ID exists.
    SystemExit
        If ten or more entries are selected and the user does not confirm with "y".

    """
    # `exit()` is injected by the `site` module for interactive use only; `sys.exit()` is always available.
    import sys

    collection = db_utils.get_collection_from_config(config_file)

    if sacred_id is not None:
        if collection.find_one({'_id': sacred_id}) is None:
            raise LookupError(f"No experiment found with ID {sacred_id}.")
        print(f"Deleting experiment with ID {sacred_id}.")
        collection.delete_one({'_id': sacred_id})
        return

    # Refresh the stored states first when the filter depends on them.
    if {'PENDING', 'RUNNING', 'KILLED'} & set(filter_states):
        detect_killed(config_file, verbose=False)

    filter_dict = db_utils.build_filter_dict(filter_states, batch_id,
                                             filter_dict)
    ndelete = collection.count_documents(filter_dict)

    # Ask for confirmation before removing a large number of entries.
    if ndelete >= 10:
        answer = input(
            f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection. "
            f"Are you sure? (y/n) ")
        if answer.lower() != "y":
            sys.exit()
    else:
        print(
            f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection."
        )
    collection.delete_many(filter_dict)
Esempio n. 3
0
def start_experiments(config_file, local, sacred_id, batch_id, filter_dict,
                      test, verbose):
    """
    Select experiments from the configured collection and hand them to `do_work`.

    Parameters
    ----------
    config_file:
        Configuration file; the collection name is read from the `db_collection` entry of the first
        element returned by `db_utils.read_config`.
    local:
        If truthy, `do_work` is called with `slurm=False`, otherwise with `slurm=True`.
    sacred_id:
        If not None, only the experiment whose `_id` equals this value is selected and `batch_id` /
        `filter_dict` are ignored.
    batch_id:
        Forwarded to `db_utils.build_filter_dict` when `sacred_id` is None.
    filter_dict:
        Forwarded to `db_utils.build_filter_dict` when `sacred_id` is None.
    test:
        Passed to `do_work` as `num_exps`. Any value other than -1 forces verbose output.
    verbose:
        Verbosity flag passed to `do_work`; overridden to True when `test != -1`.

    Returns
    -------
    None

    """
    collection_name = db_utils.read_config(config_file)[0]['db_collection']

    # Any value other than the -1 sentinel switches verbose output on.
    be_verbose = True if test != -1 else verbose

    if sacred_id is not None:
        selection = {'_id': sacred_id}
    else:
        selection = db_utils.build_filter_dict([], batch_id, filter_dict)

    do_work(collection_name, be_verbose, slurm=not local, num_exps=test,
            filter_dict=selection)
Esempio n. 4
0
def start_experiments(config_file, local, sacred_id, batch_id, filter_dict,
                      test, unobserved, post_mortem, debug, verbose, dry_run):
    """
    Select experiments from the configured collection and either run them or print their commands.

    Parameters
    ----------
    config_file:
        Configuration file; the collection name is read from the `db_collection` entry of the first
        element returned by `db_utils.read_config`.
    local:
        If truthy, `do_work` is called with `slurm=False`, otherwise with `slurm=True`
        (unless `debug` overrides it).
    sacred_id:
        If not None, only the experiment whose `_id` equals this value is selected and `batch_id` /
        `filter_dict` are ignored.
    batch_id:
        Forwarded to `db_utils.build_filter_dict` when `sacred_id` is None.
    filter_dict:
        Forwarded to `db_utils.build_filter_dict` when `sacred_id` is None.
    test:
        Passed on as `num_exps`. Any value other than -1 forces verbose output.
    unobserved:
        Passed on unchanged unless `debug` is set.
    post_mortem:
        Passed on unchanged unless `debug` is set.
    debug:
        If truthy, overrides the other options: `test` becomes 1, Slurm is disabled, and both
        `unobserved` and `post_mortem` become True.
    verbose:
        Passed on as `log_verbose`; overridden to True when `test != -1`.
    dry_run:
        If truthy, `print_commands` is called instead of `do_work`.

    Returns
    -------
    None

    """
    run_on_slurm = not local
    collection_name = db_utils.read_config(config_file)[0]['db_collection']

    if debug:
        # Debug mode overrides the individual options in one go.
        test, run_on_slurm, unobserved, post_mortem = 1, False, True, True

    # Any value other than the -1 sentinel switches verbose output on.
    log_verbose = True if test != -1 else verbose

    if sacred_id is not None:
        selection = {'_id': sacred_id}
    else:
        selection = db_utils.build_filter_dict([], batch_id, filter_dict)

    # Keyword arguments shared by both code paths.
    shared_kwargs = {
        'log_verbose': log_verbose,
        'unobserved': unobserved,
        'post_mortem': post_mortem,
        'num_exps': test,
        'filter_dict': selection,
    }

    if dry_run:
        print_commands(collection_name, **shared_kwargs)
        return

    do_work(collection_name, slurm=run_on_slurm, dry_run=dry_run, **shared_kwargs)
Esempio n. 5
0
def cancel_experiments(config_file, sacred_id, filter_states, batch_id, filter_dict):
    """
    Cancel experiments.

    Parameters
    ----------
    config_file: str
        Path to the configuration YAML file.
    sacred_id: int or None
        ID of the experiment to cancel. If None, will use the other arguments to cancel possible multiple experiments.
    filter_states: list of strings or None
        List of statuses to filter for. Will cancel all jobs from the database collection
        with one of the given statuses.
    batch_id: int or None
        The ID of the batch of experiments to cancel. All experiments that are queued together (i.e. within the same
        command line call) have the same batch ID.
    filter_dict: dict or None
        Arbitrary filter dictionary to use for cancelling experiments. Any experiments whose database entries match all
        keys/values of the dictionary will be cancelled.

    Returns
    -------
    None

    Raises
    ------
    SystemExit
        If ten or more experiments are selected and the user does not confirm with "y".

    """
    # `exit()` is injected by the `site` module for interactive use only; `sys.exit()` is always available.
    import sys

    collection = db_utils.get_collection_from_config(config_file)

    if sacred_id is not None:
        print(f"Cancelling experiment with ID {sacred_id}.")
        cancel_experiment_by_id(collection, sacred_id)
        return

    # no ID is provided: we check whether there are slurm jobs for which after this action no
    # RUNNING experiment remains. These slurm jobs can be killed altogether.
    # However, it is NOT possible right now to cancel a single experiment in a Slurm job with multiple
    # running experiments.
    try:
        if {'PENDING', 'RUNNING', 'KILLED'} & set(filter_states):
            detect_killed(config_file, verbose=False)

        filter_dict = db_utils.build_filter_dict(filter_states, batch_id, filter_dict)

        ncancel = collection.count_documents(filter_dict)
        if ncancel >= 10:
            answer = input(f"Cancelling {ncancel} experiment{s_if(ncancel)}. "
                           f"Are you sure? (y/n) ")
            if answer.lower() != "y":
                sys.exit()
        else:
            print(f"Cancelling {ncancel} experiment{s_if(ncancel)}.")

        exps = list(collection.find(filter_dict))
        # set of slurm IDs in the database
        slurm_ids = {e['slurm']['id'] for e in exps if "slurm" in e and 'id' in e['slurm']}
        # set of experiment IDs to be cancelled.
        exp_ids = {e['_id'] for e in exps}
        to_cancel = set()

        # iterate over slurm IDs to check which slurm jobs can be cancelled altogether
        for s_id in slurm_ids:
            # find experiments RUNNING under the slurm job
            jobs_running = collection.find({'slurm.id': s_id,
                                            'status': {"$in": ["RUNNING"]}},
                                           {"_id": 1})
            running_exp_ids = {e['_id'] for e in jobs_running}
            if running_exp_ids.issubset(exp_ids):
                # there are no running jobs in this slurm job that should not be canceled.
                to_cancel.add(str(s_id))

        # cancel all Slurm jobs for which no running experiment remains.
        if to_cancel:
            subprocess.check_output(f"scancel {' '.join(to_cancel)}", shell=True)

        # update database status and write the stop_time
        # NOTE(review): this update is skipped when `scancel` fails above, so the database entries
        # keep their previous status in that case. Confirm this is intended.
        collection.update_many(filter_dict, {'$set': {"status": "INTERRUPTED",
                                                      "stop_time": datetime.datetime.utcnow()}})
    except subprocess.CalledProcessError:
        warnings.warn("One or multiple Slurm jobs were no longer running when I tried to cancel them.")