def main():
    """Run ``subset_strs`` for chromosomes 1-22 on a PBS cluster and log outcomes.

    Creates ``output_dir`` (a name defined outside this function), starts a
    ``dask_jobqueue.PBSCluster`` scaled to 22, submits one job per chromosome
    and writes one line per chromosome to ``{output_dir}/results.txt``.

    A job whose result is ``None`` is recorded as a success; any other result
    is recorded as the failure message.
    """
    os.makedirs(output_dir, exist_ok=True)

    pbs_cluster = dask_jobqueue.PBSCluster(
        name="STR_subset",
        walltime="48:00:00",
        log_directory=output_dir
    )
    pbs_cluster.scale(22)
    client = dask.distributed.Client(pbs_cluster)

    # presumably subset_strs returns a dask collection (e.g. a delayed
    # object), since its results are handed to client.compute -- TODO confirm
    per_chrom_jobs = [subset_strs(chrom) for chrom in range(1, 23)]
    futures = client.compute(per_chrom_jobs)

    results_path = f"{output_dir}/results.txt"
    with open(results_path, 'w') as results_file:
        # futures are in submission order, so position i is chromosome i
        for chrom, future in enumerate(futures, start=1):
            result = future.result()
            if result is None:
                message = f"chrom {chrom} succeeded\n\n"
            else:
                message = f"chrom {chrom} failed. Error: {result}\n\n"
            results_file.write(message)
def open(self):
    """Initiate and scale the cluster.

    Builds ``self.cluster`` according to ``self.cluster_type`` ('local',
    'lsf', 'pbs' or 'slurm'), prints the job script for job-queue clusters,
    then scales the cluster to ``self.num_worker`` workers.

    Raises:
        ValueError: if ``self.cluster_type`` is not one of the types above.
    """
    # Look at the ~/.config/dask/mintpy.yaml file for changing the Dask
    # configuration defaults
    print('initiate Dask cluster')

    if self.cluster_type == 'local':
        from dask.distributed import LocalCluster

        self.cluster = LocalCluster()

    else:
        # dask_jobqueue is imported lazily because it is not available on
        # every platform (e.g. macports)
        import dask_jobqueue

        class_name_by_type = {
            'lsf': 'LSFCluster',
            'pbs': 'PBSCluster',
            'slurm': 'SLURMCluster',
        }
        class_name = class_name_by_type.get(self.cluster_type)
        if class_name is None:
            msg = 'un-recognized input cluster: {}'.format(
                self.cluster_type)
            msg += '\nsupported clusters: {}'.format(CLUSTER_LIST)
            raise ValueError(msg)

        cluster_class = getattr(dask_jobqueue, class_name)
        self.cluster = cluster_class(**self.cluster_kwargs)

        # show dask cluster job script for reference
        print("\n", self.cluster.job_script())

        # flip to True to also dump the job script to a text file
        debug_mode = False
        if debug_mode:
            with open('dask_command_run_from_python.txt', 'w') as f:
                f.write(self.cluster.job_script() + '\n')

    # This submits num_worker jobs to the cluster to start a bunch of workers.
    # In tests on Pegasus `general` queue in Jan 2019, no more than 40 workers
    # could RUN at once (other user's jobs gained higher priority in the
    # general queue at that point).
    print('scale the cluster to {} workers'.format(self.num_worker))
    self.cluster.scale(self.num_worker)
def main():
    """Run the ``fail`` test job on a PBS cluster and record its outcome.

    The output directory is ``$UKB/exome/test_output``; it is used both as
    the cluster log directory and as the location of ``results.txt``.

    ``future.result()`` re-raises any exception raised by the job, so a
    "succeeded" line is only written for jobs that finished without raising.

    Raises:
        KeyError: if the ``UKB`` environment variable is not set.
    """
    output_dir = f'{os.environ["UKB"]}/exome/test_output'
    # the results file below is opened inside this directory
    os.makedirs(output_dir, exist_ok=True)

    cluster = dask_jobqueue.PBSCluster(
        name="test",
        walltime="4:00:00",
        log_directory=output_dir,
        queue="condo"
    )
    # Maximum of 10 concurrent downloads per application
    # See here: https://biobank.ctsu.ox.ac.uk/showcase/refer.cgi?id=644
    cluster.adapt(minimum_jobs=10, maximum_jobs=10)

    client = dask.distributed.Client(cluster)

    jobs = set()
    # presumably fail() builds a dask collection whose computation fails,
    # to exercise the retry path -- TODO confirm against its definition
    jobs.add(fail())

    futures = client.compute(jobs, retries=1)

    # results_file was previously used without ever being opened
    with open(f"{output_dir}/results.txt", 'w') as results_file:
        for future in futures:
            future.result()  # block till code is done executing
            results_file.write(f"{future.key} succeeded\n")
def open(self):
    """Initiate the cluster.

    Builds ``self.cluster`` according to ``self.cluster_type`` ('local',
    'lsf', 'pbs' or 'slurm') and, for job-queue clusters, prints the job
    script. The cluster is not scaled here.

    Raises:
        ValueError: if ``self.cluster_type`` is not one of the types above.
    """
    # Look at the ~/.config/dask/mintpy.yaml file for changing the Dask
    # configuration defaults
    print('initiate Dask cluster')

    if self.cluster_type == 'local':
        from dask.distributed import LocalCluster

        self.cluster = LocalCluster()

    else:
        # dask_jobqueue is imported lazily because it is not available on
        # every platform (e.g. macports)
        import dask_jobqueue

        scheduler_classes = {
            'lsf': 'LSFCluster',
            'pbs': 'PBSCluster',
            'slurm': 'SLURMCluster',
        }
        if self.cluster_type not in scheduler_classes:
            msg = 'un-recognized input cluster: {}'.format(
                self.cluster_type)
            msg += '\nsupported clusters: {}'.format(CLUSTER_LIST)
            raise ValueError(msg)

        make_cluster = getattr(
            dask_jobqueue, scheduler_classes[self.cluster_type])
        self.cluster = make_cluster(**self.cluster_kwargs)

        # show dask cluster job script for reference
        print("\n", self.cluster.job_script())

        # flip to True to also dump the job script to a text file
        debug_mode = False
        if debug_mode:
            with open('dask_command_run_from_python.txt', 'w') as f:
                f.write(self.cluster.job_script() + '\n')
def main():
    """Download the not-yet-present files listed in a bulk file via a PBS cluster.

    Command-line arguments:
        pipeline_name: ``fe`` or ``spb``; selects the download directory
            ``{ukb}/exome/{pipeline_name}_vcfs``.
        bulk_file: name of a file in ``{ukb}/exome``; each non-blank line
            holds a whitespace-separated ``sample_ID field_ID`` pair.

    Each pair whose expected file (``{sample_ID}_{field_ID}.{suffix}``) is
    not already in the download directory is submitted as a ``download_item``
    job. Outcomes are written to ``{vcf_dir}_output/results_<timestamp>.txt``.
    A job that raises, or that returns a non-``None`` value, is retried once;
    a second failure of the same job terminates the program with status 1.

    Raises:
        ValueError: if the bulk file contains a field ID with no known
            file suffix.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("pipeline_name", choices={'fe', 'spb'})
    parser.add_argument("bulk_file",
                        help="name of a file in the exome directory")
    args = parser.parse_args()

    vcf_dir = f'{ukb}/exome/{args.pipeline_name}_vcfs'
    output_dir = f'{vcf_dir}_output'
    bulk_floc = f'{ukb}/exome/{args.bulk_file}'

    assert os.path.exists(vcf_dir)
    assert os.path.exists(bulk_floc)
    # the results file is written here, so make sure the directory exists
    os.makedirs(output_dir, exist_ok=True)

    current_files = set(os.listdir(vcf_dir))

    cluster = dask_jobqueue.PBSCluster(
        name="UKB_gVCF_download",
        walltime="4:00:00",
        log_directory=output_dir,
        queue="condo"
    )
    # Maximum of 10 concurrent downloads per application
    # See here: https://biobank.ctsu.ox.ac.uk/showcase/refer.cgi?id=644
    cluster.adapt(minimum_jobs=10, maximum_jobs=10)

    client = dask.distributed.Client(cluster)

    # file suffix of the downloaded item for each supported field ID
    suffix_by_field = {
        '23176_0_0': 'gz',
        '23161_0_0': 'gz',
        '23177_0_0': 'tbi',
        '23162_0_0': 'tbi',
    }

    jobs = set()
    with open(bulk_floc) as bulk_file:
        for line in bulk_file:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            sample_ID, field_ID = fields
            if field_ID not in suffix_by_field:
                # never fall back to the suffix of a previous line
                raise ValueError(
                    f"Unrecognized field ID {field_ID!r} for sample "
                    f"{sample_ID} in {bulk_floc}"
                )
            file_name = f"{sample_ID}_{field_ID}.{suffix_by_field[field_ID]}"
            if file_name in current_files:
                continue  # already downloaded
            jobs.add(client.submit(
                download_item, sample_ID, field_ID, vcf_dir,
                key=f'download_item-{sample_ID}-{field_ID}'
            ))

    print(f"Number of jobs queued: {len(jobs)}")

    retried_keys = set()
    now = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    # keep a handle on the iterator so retried futures can be re-added;
    # a future is only yielded once per time it is added
    completed = dask.distributed.as_completed(jobs)
    with open(f"{output_dir}/results_{now}.txt", 'w') as results_file:
        for future in completed:
            key = future.key
            err = future.exception()
            if err is not None:
                print(f"{key} failed with raised error. Error: {err}",
                      file=sys.stderr)
            else:
                result = future.result()
                if result is None:
                    results_file.write(f"{key} succeeded\n")
                    # make sure to mark the future as cancelled so it is not
                    # rerun even if the job it was on dies unexpectedly and
                    # is restarted
                    future.cancel()
                    continue
                results_file.write(f"{key} failed. Error: {result}\n\n")

            # the job failed, either by raising or by returning an error
            if key in retried_keys:
                print(f"{key} was already retried.", file=sys.stderr)
                sys.exit(1)
            retried_keys.add(key)
            future.retry()
            # wait for the retried run as well, so its outcome is recorded
            completed.add(future)