import argparse
import os
import time
from pprint import pprint

import distributed
from dask_jobqueue import SLURMCluster

# `num_crunch` (a CPU-bound test function) is assumed to be defined or
# imported elsewhere in this project; a hypothetical stand-in is sketched
# after this script.


def main():
    parser = argparse.ArgumentParser(
        description='Simple example for using dask-jobqueue in SLURM')
    parser.add_argument('--proc_per_job', type=int, default=1,
                        help='Number of processes per job.')
    parser.add_argument('--cores_per_proc', type=float, default=2,
                        help='Number of cores per process.')
    parser.add_argument('--n_jobs', type=int, default=1,
                        help='Number of jobs')
    parser.add_argument('--array', type=int, default=0,
                        help='EXPERIMENTAL. If >0, then submit a job array '
                             'of this size. The total number of jobs will '
                             'be `array * n_jobs`.')
    parser.add_argument('--container', type=str,
                        help='Path to singularity container. If `None`, '
                             'then a conda environment is assumed.')
    parser.add_argument('--qos', type=str, help='QOS to use.')
    parser.add_argument('--dry', action='store_true',
                        help='Print the job script and exit (no submission).')
    parser.add_argument('--load', type=int, default=1000,
                        help='Load for the function.')
    args = parser.parse_args()

    n_procs = args.proc_per_job * args.n_jobs

    params = {
        'cores': int(args.cores_per_proc * args.proc_per_job),
        'memory': '{0:d}00MB'.format(args.proc_per_job * 5),
        'processes': args.proc_per_job,
        # The name to assign to each worker.
        'name': 'dask_test'
    }

    job_extra = ['--requeue']
    env_extra = []

    if args.qos is not None:
        job_extra.append('--qos {}'.format(args.qos))

    if args.array > 0:
        n_procs = n_procs * args.array
        job_extra.append('--array 0-{0:d}'.format(args.array - 1))
        """
        This is added to ensure that each worker has a unique ID.
        This may be unnecessary.
        """
        env_extra.append(
            'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}')

    if args.container is not None:
        """
        When using a container, dask needs to know how to enter the python
        environment.

        Note:
        The binding `-B..` is cluster (OpenMind) specific, but the idea
        generalizes. The binding is required since `singularity` will not
        bind by default.
        """
        cont = os.path.normpath(args.container)
        bind = cont.split(os.sep)[1]
        bind = '-B /{0!s}:/{0!s}'.format(bind)
        py = 'singularity exec {0!s} {1!s} python3'.format(bind, cont)
        params.update({'python': py})

        """
        Dask will generate a job script, but some elements will be missing
        due to the way the singularity container interfaces with SLURM.
        The `modules` need to be initialized and `singularity` needs to be
        added.
        """
        env_extra += [
            'source /etc/profile.d/modules.sh',
            'module add openmind/singularity/2.6.0']

    params.update({
        'job_extra': job_extra,
        'env_extra': env_extra})

    cluster = SLURMCluster(**params)

    """
    Display the job script.
    """
    print(cluster.job_script())
    pprint(params)

    t0 = time.time()
    num_crunch(100)
    expected_dur = (time.time() - t0) * args.load
    print('Expected time of linear call: {0:f}'.format(expected_dur))

    if args.dry:
        return

    """
    Scale the cluster to the number of jobs.
    """
    print('Scaling by {}'.format(args.n_jobs))
    cluster.scale_up(args.proc_per_job * args.n_jobs)

    """
    Set up a client that interfaces with the workers.
    """
    client = distributed.Client(cluster)
    time.sleep(10)
    print(cluster)
    print(client)
    pprint(client.has_what())
    # pprint(client.scheduler_info())

    """
    Generate a transaction.
    """
    futures = client.map(num_crunch, range(args.load))
    t0 = time.time()

    """
    Compute (and then discard) while keeping track of progress.
    """
    distributed.progress(futures)
    dur = time.time() - t0

    msg = '\n\nSpeed up of {0:f}x ({1:f}/{2:f})'.format(
        (expected_dur / dur), expected_dur, dur)
    print(msg)
    msg = 'Ideal speed up is {0:f}x'.format(n_procs)
    print(msg)
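# --- Illustrative sketch (not part of the original script) -----------------
# The script above times a single call to `num_crunch` and then maps it over
# `range(args.load)`, so any CPU-bound function of one argument will do. The
# stand-in below and the `__main__` guard are assumptions added only so the
# example can be run end to end; the project's real definitions are not shown
# here.
def num_crunch(n, iterations=1_000_000):
    """Hypothetical CPU-bound workload: burns cycles and returns a number."""
    total = 0
    for i in range(iterations):
        total += (i * i + n) % 97
    return total


if __name__ == '__main__':
    main()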
import datetime
import os
from os.path import isfile

import dask.bag as db
from dask import delayed
from dask_jobqueue import SLURMCluster
from distributed import Client

# Project-specific helpers (`process_file`, `clear_all_files`,
# `copy_database_to_home_folder`, `DatabaseManager`, `db_files_location`)
# are assumed to be defined elsewhere in this project; a hypothetical
# `process_file` stub and entry point are sketched after this script.


async def run():
    number_of_cores_per_node = 16  # DAS-5 features 2x8 NUMA cores per compute node
    reservation_length = "08:00:00"  # 8 hours is more than enough... probably
    cluster = SLURMCluster(cores=number_of_cores_per_node,
                           memory="64 GB",
                           processes=4,
                           scheduler_options={"dashboard_address": ":6868"},
                           local_directory="./aip-logs",
                           interface='ib0',
                           walltime=reservation_length)

    # Grab 5 execution nodes -> 80 cores
    print("Scaling up, getting 5 nodes")
    cluster.scale_up(5)
    client = Client(cluster)

    print("Client is ready, parsing data files...")

    file_locations = "/var/scratch/lvs215/aip_tmp"
    data_files = []

    # Create a list of all the files we want to parse. Skip the compressed
    # sources if they are still lingering around.
    for path, subdirs, files in os.walk(file_locations):
        for name in files:
            if isfile(os.path.join(path, name)) and not name.endswith(("gz", "zip", "tar")):
                data_files.append(os.path.join(path, name))

    client.run(clear_all_files)

    # Create one task per file.
    print(data_files)
    print("Creating and executing tasks...")
    tasks = list(map(delayed(process_file), data_files))
    true_false_array = db.from_delayed(tasks)

    # DEBUG CODE
    # future = client.compute(true_false_array)
    # client.recreate_error_locally(future)

    # Time to compute them!
    start = datetime.datetime.now()
    res = true_false_array.compute()
    end = datetime.datetime.now()
    print(true_false_array)
    print(res)
    print("Tasks ran to completion! Copying databases.")
    if False not in res:
        # If everything went alright, let all nodes copy their databases to the home dir.
        client.run(copy_database_to_home_folder)
        client.run(clear_all_files)
    else:
        print("Parsing one of the files went horribly wrong, quitting!")
        exit(-1)

    print("Beginning assembling of all databases into one!")
    # Now that each of the nodes has a local database file, we combine these databases into one.
    # We do this sequentially, because we are not sure yet whether SQLite likes it if all nodes
    # do this in parallel.
    # TODO: test if we can do this procedure in each node through copy_database_to_home_folder;
    # it would save copying data.
    database_manager = DatabaseManager()  # This creates an empty aip.db if it doesn't exist.
    con3 = database_manager.db  # Reuse the connection.

    # Based on https://stackoverflow.com/a/37138506
    os.makedirs(db_files_location, exist_ok=True)
    for file in [
            os.path.join(db_files_location, f) for f in os.listdir(db_files_location)
            if isfile(os.path.join(db_files_location, f)) and f.endswith(".db")
    ]:
        con3.execute("ATTACH '{}' as dba".format(file))
        con3.execute("BEGIN")
        for row in con3.execute("SELECT * FROM dba.sqlite_master WHERE type='table'"):
            combine = "INSERT INTO " + row[1] + " SELECT * FROM dba." + row[1]
            print(combine)
            con3.execute(combine)
        con3.execute("detach database dba")
        con3.commit()
        # Now, delete the database as it has been copied.
        # os.remove("{}.db".format(hash(worker)))

    print("All done. Releasing all nodes.")
    await cluster.scale_down(cluster.workers)
    print("Nodes released.")
    print(end - start)
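# --- Illustrative sketch (not part of the original script) -----------------
# `run()` above is a coroutine, and the project-specific helpers it calls are
# defined elsewhere. The entry point and the `process_file` stub below are
# assumptions added only to illustrate the expected contract: `process_file`
# returns True on success and False on failure, which is what the
# `False not in res` check above relies on.
import asyncio


def process_file(path):
    """Hypothetical stand-in for the real parser: True on success, False on failure."""
    try:
        # A real implementation would parse `path` and write rows into a
        # worker-local SQLite database.
        return True
    except Exception:
        return False


if __name__ == '__main__':
    asyncio.run(run())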