Beispiel #1
0
def _build_arg_parser():
    """Build the command-line parser for the SLURM scaling example."""
    parser = argparse.ArgumentParser(
        description = 'Simple example for using dask-joqueue in SLURM')

    parser.add_argument('--proc_per_job', type = int, default = 1,
                        help = 'Number of processes per job.')
    parser.add_argument('--cores_per_proc', type = float, default = 2,
                        help = 'Number of cores per process.')
    parser.add_argument('--n_jobs', type = int, default = 1,
                        help = 'Number of jobs')
    parser.add_argument('--array', type = int, default = 0,
                        help = ('EXPERIMENTAL. If >0, then submit an job-array '
                                'of this size. The total number of jobs will'
                                ' be `array * n_jobs`.'))
    parser.add_argument('--container', type = str,
                        help = ('Path to singularity container. If `None`, '
                                'then assumes conda environment.'))
    parser.add_argument('--qos', type = str, help = 'QOS to use.')
    parser.add_argument('--dry', action = 'store_true',
                        help = 'Print job script and exit (no submission)')
    parser.add_argument('--load', type = int, default = 1000,
                        help = 'Load for the function.')
    return parser


def _singularity_python(container):
    """Return the command that runs `python3` inside a singularity container.

    The top-level directory of the container's absolute path is bind-mounted
    into the container (`-B /<top>:/<top>`), because `singularity` does not
    bind it by default. The binding is cluster (OpenMind) specific but can
    generalize.

    The path is made absolute first: a relative path has no leading
    separator, so taking the second path component would either raise
    `IndexError` or pick the wrong directory.
    """
    cont = os.path.abspath(container)
    top_level = cont.split(os.sep)[1]
    bind = '-B /{0!s}:/{0!s}'.format(top_level)
    return 'singularity exec {0!s} {1!s} python3'.format(bind, cont)


def main():
    """Run a small dask-jobqueue scaling benchmark on SLURM.

    Parses the command line, builds a `SLURMCluster` from it and prints the
    generated job script and the cluster parameters. `num_crunch(100)` is
    timed locally to estimate how long `--load` sequential calls would take.
    With `--dry` the function returns at that point. Otherwise the cluster is
    scaled, `num_crunch` is mapped over `range(load)` on the workers, and the
    measured speed-up is printed next to the ideal one (the total number of
    worker processes).
    """
    args = _build_arg_parser().parse_args()

    # Total number of worker processes; multiplied by `--array` further down.
    n_procs = args.proc_per_job * args.n_jobs

    params = {
        'cores' : int(args.cores_per_proc * args.proc_per_job),
        # 500MB for every process in a job.
        'memory' : '{0:d}00MB'.format(args.proc_per_job*5),
        'processes' : args.proc_per_job,
        # The name to assign to each worker
        'name' : 'dask_test'
    }

    job_extra = ['--requeue']
    env_extra = []

    if args.qos is not None:
        job_extra.append('--qos {}'.format(args.qos))

    if args.array > 0:
        n_procs = n_procs * args.array
        job_extra.append('--array 0-{0:d}'.format(args.array - 1))
        # This is added to ensure that each worker has a unique ID.
        # This may be unnecessary.
        env_extra.append(
            'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}')

    if args.container is not None:
        # When using a container, dask needs to know how to enter the python
        # environment.
        params.update({'python' : _singularity_python(args.container)})
        # Dask will generate a job script but some elements will be missing
        # due to the way the singularity container interfaces with slurm.
        # The `modules` need to be initialized and `singularity` needs to be
        # added.
        env_extra += [ 'source /etc/profile.d/modules.sh',
                       'module add openmind/singularity/2.6.0']

    params.update({ 'job_extra' : job_extra,
                    'env_extra' : env_extra})

    cluster = SLURMCluster(**params)
    # Display the job script.
    print(cluster.job_script())
    pprint(params)

    # Time a single local call and extrapolate linearly to `--load` calls.
    t0 = time.time()
    num_crunch(100)
    expected_dur = (time.time() - t0) * args.load
    print('Expected time of linear call: {0:f}'.format(expected_dur))

    if args.dry:
        return

    # Scale the cluster to the number of jobs.
    print('Scaling by {}'.format(args.n_jobs))
    cluster.scale_up(args.proc_per_job * args.n_jobs)

    # Setup a client that interfaces with the workers
    client = distributed.Client(cluster)
    # Pause before inspecting the cluster, presumably to let workers start.
    time.sleep(10)
    print(cluster)
    print(client)
    pprint(client.has_what())
    # pprint(client.scheduler_info())

    # Start the clock before submitting: the tasks are handed to the
    # scheduler by `client.map`, so timing from afterwards would miss the
    # submission and any work already in flight.
    t0 = time.time()
    # Generate a transaction.
    futures = client.map(num_crunch, range(args.load))

    # Compute (and then discard) while keeping track of progress.
    distributed.progress(futures)
    dur = time.time() - t0
    msg = '\n\nSpeed up of {0:f}x ({1:f}/{2:f})'.format((expected_dur / dur),
                                                    expected_dur, dur)
    print(msg)
    msg = 'Ideal speed up is {0:f}x'.format(n_procs)
    print(msg)
    """
Beispiel #2
0
async def run():
    """Parse all data files on a SLURM-backed dask cluster and merge the results.

    Steps, in order:

    1. Start a `SLURMCluster`, scale it up and connect a `Client`.
    2. Collect every regular file below the scratch directory, skipping
       names that end in "gz", "zip" or "tar".
    3. Run `clear_all_files` on the workers, then execute one delayed
       `process_file` task per data file.
    4. If no task reported `False`, run `copy_database_to_home_folder` and
       `clear_all_files` on the workers; otherwise exit with status -1.
    5. Sequentially merge every `*.db` file found in `db_files_location`
       into the database held by `DatabaseManager`.
    6. Release the workers and print the elapsed computation time.

    Raises:
        SystemExit: with code -1 when at least one task result is `False`.
    """
    number_of_cores_per_node = 16  # DAS-5 features 2x8 NUMA cores per compute node
    reservation_length = "08:00:00"  # 8 hours is more than enough... probably
    cluster = SLURMCluster(cores=number_of_cores_per_node,
                           memory="64 GB",
                           processes=4,
                           scheduler_options={"dashboard_address": ":6868"},
                           local_directory="./aip-logs",
                           interface='ib0',
                           walltime=reservation_length)

    # Grab 5 execution nodes -> 80 cores
    print("Scaling up, getting 5 nodes")
    cluster.scale_up(5)
    client = Client(cluster)

    print("Client is ready, parsing data files...")

    file_locations = "/var/scratch/lvs215/aip_tmp"
    skipped_suffixes = ("gz", "zip", "tar")
    data_files = []

    # Create a list of all the files we want to parse. Skip the compressed sources if they are still lingering around
    for path, _subdirs, files in os.walk(file_locations):
        for name in files:
            full_path = os.path.join(path, name)
            if isfile(full_path) and not name.endswith(skipped_suffixes):
                data_files.append(full_path)

    client.run(clear_all_files)

    # Create one task per file.
    print(data_files)
    print("Creating and executing tasks...")
    tasks = list(map(delayed(process_file), data_files))
    true_false_array = db.from_delayed(tasks)

    # DEBUG CODE
    # future = client.compute(true_false_array)
    # client.recreate_error_locally(future)

    # Time to compute them!
    start = datetime.datetime.now()
    res = true_false_array.compute()
    end = datetime.datetime.now()
    print(true_false_array)
    print(res)
    print("Tasks ran to completion! Copying databases.")
    # Check the computed results, not the lazy bag: a membership test on the
    # bag iterates it, which would execute every task a second time.
    if False not in res:  # If everything went alright, let all nodes copy their databases to the home dir.
        client.run(copy_database_to_home_folder)
        client.run(clear_all_files)
    else:
        print("Parsing one of the files went horribly wrong, quitting!")
        # Same exception the `exit` helper raises, without depending on the
        # `site` module having installed that helper.
        raise SystemExit(-1)

    print("Beginning assembling of all databases into one!")
    # Now, each of the nodes has a local database file, we will now combine these databases into one.
    # We do this process sequentially, because we are not sure yet if SQLite likes it if all nodes do this in parallel.
    # TODO: test if we can do this procedure in each node through the copy_database_to_home_folder, would save copying data
    database_manager = DatabaseManager(
    )  # This creates an empty aip.db if it doesn't exists.
    con3 = database_manager.db  # Reuse the connection

    # based on https://stackoverflow.com/a/37138506
    os.makedirs(db_files_location, exist_ok=True)
    db_paths = [
        os.path.join(db_files_location, f)
        for f in os.listdir(db_files_location)
        if isfile(os.path.join(db_files_location, f)) and f.endswith(".db")
    ]
    for db_path in db_paths:
        con3.execute("ATTACH '{}' as dba".format(db_path))

        con3.execute("BEGIN")
        # Copy every table of the attached database into the table of the
        # same name in the main database (row[1] is the table name).
        for row in con3.execute(
                "SELECT * FROM dba.sqlite_master WHERE type='table'"):
            combine = "INSERT INTO " + row[1] + " SELECT * FROM dba." + row[1]
            print(combine)
            con3.execute(combine)
        # Commit before detaching: the attached database is still in use by
        # the open transaction, and DETACH can fail while that is the case.
        con3.commit()
        con3.execute("detach database dba")
        # Now, delete the database as it has been copied.
        # os.remove("{}.db".format(hash(worker)))
    print("All done. Releasing all nodes.")
    await cluster.scale_down(cluster.workers)
    print("Nodes released.")
    print(end - start)