def fyrd_estimate_pvals(psi, ase, n_reps, min_periods=None,
                        n_genes_per_job=100):
    """Estimate p-values for all genes by submitting chunks to the cluster.

    Splits `psi` into chunks of `n_genes_per_job` genes, submits one
    estimate_all_pvals job per chunk with fyrd, and collects the results into
    a single Series. Expects a module-level `cluster_args` dict of fyrd
    keyword arguments.
    """
    import fyrd
    outs = {}
    jobs = []
    # Submit one job per chunk of genes
    for i in range(0, len(psi), n_genes_per_job):
        jobs.append(
            fyrd.submit(
                estimate_all_pvals,
                (psi.iloc[i:i + n_genes_per_job], ase, n_reps, min_periods),
                **cluster_args
            )
        )
    # Collect the per-chunk Series into one combined Series
    for job in tqdm(jobs):
        res = job.get()
        for ix in res.index:
            outs[ix] = res[ix]
    return pd.Series(outs)
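# Usage sketch for fyrd_estimate_pvals (illustrative only). The file names and
# table layout here are assumptions, not values from the original code.
def _example_fyrd_estimate_pvals():
    import pandas as pd
    # Hypothetical per-gene tables, genes as rows
    psi = pd.read_csv('psi_by_gene.tsv', sep='\t', index_col=0)
    ase = pd.read_csv('ase_by_gene.tsv', sep='\t', index_col=0)
    # fyrd_estimate_pvals also expects a module-level cluster_args dict of
    # fyrd.submit keyword arguments, e.g. {'cores': 1, 'mem': '4GB'}
    pvals = fyrd_estimate_pvals(psi, ase, n_reps=1000, n_genes_per_job=100)
    pvals.to_csv('pvals.tsv', sep='\t')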
def get_regions_parallel(positions, genome_file, base=0, count=7):
    """Return the region surrounding each position.

    Will loop through each chromosome and search all positions in that
    chromosome in one batch. Lookup runs as one cluster job per chromosome.

    Args:
        positions (dict):  Dictionary of {chrom->positions}
        genome_file (str): Location of a genome fasta file or directory of
                           files. If directory, file names must be
                           <chrom_name>.fa[.gz]. Gzipped OK.
        base (int):        Either 0 or 1, base of positions in your list
        count (int):       Distance + and - the position to extract

    Returns:
        dict: {chrom->{position->sequence}}
    """
    outs = []
    for chrom in positions.keys():
        if os.path.isdir(genome_file):
            fa_file = get_fasta_file(genome_file, chrom)
        else:
            fa_file = genome_file
        if not os.path.isfile(fa_file):
            raise FileNotFoundError('{} not found.'.format(fa_file))
        # Request roughly one minute per 2000 positions plus a safety margin
        mins = int(len(positions[chrom]) / 2000) + 60
        time = str(_td(minutes=mins))
        outs.append(
            fyrd.submit(
                get_regions,
                ({chrom: positions[chrom]}, fa_file, base, count),
                cores=1,
                mem='6GB',
                time=time,
            )
        )
    final = {}
    for out in outs:
        final.update(out.get())
    return final
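# Usage sketch for get_regions_parallel (illustrative only; the genome
# directory path and the positions are assumptions, not values from the
# original code).
def _example_get_regions_parallel():
    # 0-based positions, keyed by chromosome names matching the fasta files
    positions = {'chr1': [10177, 10352], 'chr2': [10192]}
    regions = get_regions_parallel(
        positions, '/path/to/genome_dir', base=0, count=7
    )
    # With count=7 each value is the 15 bp sequence centred on the position
    print(regions['chr1'][10177])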
def get_dinucleotides_parallel(positions, genome_file, base=0,
                               return_as='list'):
    """Return a list of all + and - strand dinucleotides around each position.

    Will loop through each chromosome and search all positions in that
    chromosome in one batch. Lookup is parallel per chromosome.

    Args:
        positions (dict):  Dictionary of {chrom->positions}
        genome_file (str): Location of a genome fasta file or directory of
                           files. If directory, file names must be
                           <chrom_name>.fa[.gz]. Gzipped OK. Directory is
                           preferred in parallel mode.
        base (int):        Either 0 or 1, base of positions in your list
        return_as (str):   dict: Return a dictionary of:
                                 {chrom->{position->{'ref': str,
                                                     '+': tuple,
                                                     '-': tuple}}}
                           list: just returns two lists with no positions.
                           df:   return a DataFrame

    Returns:
        (list, list): + strand dinucleotides, - strand dinucleotides.
                      Returns a dict or DataFrame instead if requested
                      through return_as.
    """
    outs = []
    for chrom in positions.keys():
        if os.path.isdir(genome_file):
            fa_file = get_fasta_file(genome_file, chrom)
        else:
            fa_file = genome_file
        if not os.path.isfile(fa_file):
            raise FileNotFoundError('{} not found.'.format(fa_file))
        # Request roughly one minute per 2000 positions plus a safety margin
        mins = int(len(positions[chrom]) / 2000) + 45
        time = str(_td(minutes=mins))
        outs.append(
            fyrd.submit(
                get_dinucleotides,
                ({chrom: positions[chrom]}, fa_file, base, return_as),
                cores=1,
                mem='6GB',
                time=time,
            )
        )
    # Pick an accumulator that matches the requested return type
    if return_as == 'df':
        final = []
    elif return_as == 'dict':
        final = {}
    else:
        final = ([], [])
    fyrd.wait(outs)
    print('Getting results')
    for out in outs:
        res = out.get()
        if return_as == 'df':
            if isinstance(res, dict):
                res = dict_to_df(res, base)
            final.append(res)
        elif return_as == 'dict':
            final.update(res)
        else:
            plus, minus = res
            final[0] += plus
            final[1] += minus
    if return_as == 'df':
        print('Joining dataframe')
        final = pd.concat(final)
    return final
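# Usage sketch for get_dinucleotides_parallel (illustrative only; the genome
# directory path and positions are assumptions). With return_as='df' the
# per-chromosome results are converted via dict_to_df and concatenated.
def _example_get_dinucleotides_parallel():
    positions = {'chr1': [10177, 10352], 'chr2': [10192]}
    df = get_dinucleotides_parallel(
        positions, '/path/to/genome_dir', base=0, return_as='df'
    )
    print(df.head())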
def run_depict_permutation(sample_1, sample_2, prefix, cores=None, perms=100,
                           run_path=None, depict_path=DEPICT, perm_start=None,
                           **fyrd_args):
    """Run DEPICT repeatedly and return locations of output files.

    This function uses fyrd to submit cluster jobs; each job requests 2*cores
    and 12G of memory. Takes about 20 minutes to run 2 permutations on a
    small cluster.

    Args:
        sample_1 (str):    File name or path to file with rsids for sample 1
        sample_2 (str):    File name or path to file with rsids for sample 2
        prefix (str):      Name for the output directory, input file names
                           will be used to set output files in this directory.
        cores (int):       Number of cores to use *PER PROCESS* for DEPICT,
                           defaults to 1/2 of available cores on the machine,
                           meaning all cores will be used per run (1/2 each).
        perms (int):       Number of permutations.
        run_path (str):    Root directory to run in, defaults to current dir
        depict_path (str): Path to the DEPICT package, default set in file.
        perm_start (int):  Number to start permutations from
        fyrd_args (dict):  Fyrd keyword arguments, not required.

    Outputs:
        <prefix>/<sample_name>.geneprioritization.txt
        <prefix>/<sample_name>.loci.txt
        <prefix>/<sample_name>.tissueenrichment.txt
        <prefix>/<sample_name>.genesetenrichment.txt
        <prefix>/<sample_name>.log

    Returns:
        dict: {'perm_<n>': output files} for each permutation; raises an
              Exception if a job fails.
    """
    if not cores:
        cores = PARAM_NCORES
    check_depict(depict_path)
    run_path = run_path if run_path else _os.path.abspath('.')
    selfpath = _os.path.realpath(__file__)

    # Work out where to start numbering the permutations if not given,
    # based on any existing <prefix>_perm_<n> directories
    if not perm_start:
        counts = []
        for fl in _os.listdir(run_path):
            if fl.startswith('{}_perm_'.format(prefix)):
                c = fl.split('_')[-1]
                if c.isdigit():
                    counts.append(int(c))
        if counts:
            perm_start = max(counts) + 1
        else:
            perm_start = 1
    if not isinstance(perm_start, int):
        perm_start = 1
    print('Starting permutation count at {}'.format(perm_start))

    # Pool the rsids from both samples so they can be reshuffled
    s1_rsids = []
    s2_rsids = []
    with open(sample_1) as fin:
        s1_rsids += fin.read().strip().split('\n')
    with open(sample_2) as fin:
        s2_rsids += fin.read().strip().split('\n')
    rsids = np.array(s1_rsids + s2_rsids)

    jobs = {}
    count = perm_start
    print('Running {} permutations'.format(perms))
    imports = ["import permute_depict as depict",
               "from permute_depict import *"]
    ttl = perms
    pbar = pb(total=ttl, unit='perms')
    while perms:
        # Shuffle the pooled rsids and split them back into two samples of
        # the original sizes
        this_perm = np.random.permutation(rsids)
        new_sample_1_data = sorted(this_perm[:len(s1_rsids)])
        new_sample_2_data = sorted(this_perm[len(s1_rsids):])
        assert len(new_sample_1_data) == len(s1_rsids)
        assert len(new_sample_2_data) == len(s2_rsids)
        perm_path = _pth(run_path, 'perm_files')
        if not _os.path.isdir(perm_path):
            _os.mkdir(perm_path)
        new_sample_1 = _pth(
            _os.path.abspath(perm_path),
            _os.path.basename(sample_1) + '_perm_{}.txt'.format(count)
        )
        new_sample_2 = _pth(
            _os.path.abspath(perm_path),
            _os.path.basename(sample_2) + '_perm_{}.txt'.format(count)
        )
        with open(new_sample_1, 'w') as fout:
            fout.write('\n'.join(new_sample_1_data))
        with open(new_sample_2, 'w') as fout:
            fout.write('\n'.join(new_sample_2_data))
        new_prefix = '{}_perm_{}'.format(prefix, count)
        job_path = _pth(run_path, 'jobs')
        if not _os.path.isdir(job_path):
            _os.mkdir(job_path)
        # Submit one DEPICT run per permutation
        jobs['perm_{}'.format(count)] = _fyrd.submit(
            run_depict,
            kwargs=dict(sample_1=new_sample_1, sample_2=new_sample_2,
                        prefix=new_prefix, cores=cores, run_path=run_path,
                        depict_path=depict_path),
            name=new_prefix,
            imports=['import os as _os',
                     'from os.path import join as _pth',
                     'from subprocess import check_call as _call',
                     'import permute_depict as depict',
                     'from permute_depict import *'],
            cores=cores * 2,
            mem='12GB',
            scriptpath=job_path,
            outpath=job_path,
            runpath=run_path,
            syspaths=selfpath,
            **fyrd_args
        )
        perms -= 1
        count += 1
        pbar.update()
    pbar.close()

    # Get output file information as jobs complete
    print('Permutation jobs submitted, waiting for results.')
    outputs = {}
    with pb(total=ttl, unit='results') as pbar:
        while len(outputs) < len(jobs):
            for name, job in jobs.items():
                if name in outputs:
                    continue
                job.update()
                if job.done:
                    outs = job.get()
                    outputs[name] = outs
                    with open(_pth(run_path, name) + '.files.dict',
                              'wb') as fout:
                        try:
                            _pickle.dump(outs, fout)
                        except TypeError:
                            pass
                    pbar.update()
            _sleep(1)
    print('Permutation jobs completed.')
    return outputs
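# Usage sketch for run_depict_permutation (illustrative only; the rsid file
# names, prefix, and resource choices are assumptions). Each permutation
# reshuffles the pooled rsids, writes new sample files under perm_files/, and
# submits one DEPICT job per permutation.
def _example_run_depict_permutation():
    outputs = run_depict_permutation(
        'sample1_rsids.txt', 'sample2_rsids.txt', 'my_run',
        cores=4, perms=100, run_path='.'
    )
    # outputs maps 'perm_<n>' to the files produced by that DEPICT run
    for name, files in outputs.items():
        print(name, files)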
def main():
    """Run core functionality."""
    # Our PID
    us = os.getpid()

    def exit_us(code=1):
        """Exit with code and delete lockfile if PID is us or dead."""
        if os.path.isfile(LOCK_FILE):
            with open(LOCK_FILE) as ffin:
                ppid = int(ffin.read().strip())
            if ppid == us or not check_pid(ppid):
                os.remove(LOCK_FILE)
        sys.exit(code)

    # Run exit_us at every exit (other than SIGKILL)
    atexit.register(exit_us)

    # Check we aren't already running
    if os.path.isfile(LOCK_FILE):
        with open(LOCK_FILE) as fin:
            pid = int(fin.read().strip())
        if check_pid(pid) and not pid == us:
            sys.exit(0)
        else:
            os.remove(LOCK_FILE)

    # Lock the script
    with open(LOCK_FILE, 'w') as fout:
        fout.write(str(us))

    # Time handling
    FMT = '%y%m%d-%H:%M:%S'
    NOW = dt.now()

    # Try to load old job data
    if os.path.isfile(DATA_FILE):
        with open(DATA_FILE) as fin:
            last_job = json.load(fin)
    else:
        last_job = None

    # Decide if we want to run again:
    # don't run if less than WAIT_TIME (12 hours) has passed since the last
    # run or if old jobs are still running
    if last_job:
        # total_seconds() avoids wrapping when more than a day has passed
        if (NOW - dt.strptime(last_job['time'], FMT)).total_seconds() < WAIT_TIME:
            exit_us(0)

    # Only import fyrd when we have to as it can be slow
    sys.path.insert(0, PYTHON_LIB)
    import fyrd

    if last_job:
        queue = fyrd.queue.Queue('self')
        open_jobs = [queue[i] for i in last_job['jobs']
                     if str(i) in queue.jobs]
        if open_jobs:
            for job in open_jobs:
                if job.state in fyrd.queue.ACTIVE_STATES:
                    exit_us(0)

    # Create a temp dir just in case
    if not os.path.isdir(TEMP):
        os.makedirs(TEMP)

    # Clean up that dir
    fyrd.clean_dir(TEMP, confirm=False)
    os.system('rm {}/reset_perms* 2>/dev/null'.format(TEMP))
    os.system('rm {}/touch_pi_scratch* 2>/dev/null'.format(TEMP))

    # Get scripts
    scripts = [
        os.path.join(SCRPT_PATH, i)
        for i in ['reset_perms.sh', 'touch_pi_scratch.sh']
    ]

    # Submit them
    sub_jobs = []
    for script in scripts:
        sub_jobs.append(
            fyrd.submit(
                'bash ' + script + ' >/dev/null 2>/dev/null',
                partition='hbfraser,hns,normal',
                cores=1,
                mem=4000,
                time='18:00:00',
                outfile='/dev/null',
                errfile='/dev/null',
                scriptpath=TEMP,
                runpath=TEMP,
                clean_files=True,
                clean_outputs=True,
                name=script.split('/')[-1].split('.')[0]
            )
        )

    # Convert jobs into ids only
    job_ids = []
    for job in sub_jobs:
        job.update()
        job_ids.append(job.id)

    # Write out data
    job_data = {'time': NOW.strftime(FMT), 'jobs': job_ids}
    with open(DATA_FILE, 'w') as fout:
        json.dump(job_data, fout)

    # Done, force delete file
    os.remove(LOCK_FILE)
    return 0
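# main() relies on a check_pid helper that is not shown here. Below is a
# minimal sketch of what such a helper might look like (an assumption, not
# the original implementation): signal 0 probes whether a PID exists without
# delivering any signal.
def check_pid(pid):
    """Return True if a process with this PID currently exists."""
    try:
        os.kill(pid, 0)  # signal 0: existence check only
    except ProcessLookupError:
        return False
    except PermissionError:
        return True  # process exists but belongs to another user
    return True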