def __init__(self, bams, sample_ids, jar, iter_per_percentage, min_reads,
             max_reads, step_size, reads_multiplier, random_seed_base,
             out_dir, name, out_sh=None, submit=True, queue_type='PBS'):
    """Write and submit jobs that downsample each BAM file to a range of
    read counts, using a jar that takes Picard-style INPUT/OUTPUT/
    RANDOM_SEED/PROBABILITY arguments (e.g. DownsampleSam)
    """
    # Make the directory if it's not there already
    try:
        os.mkdir(out_dir)
    except OSError:
        pass

    downsample_command = 'java -jar {}'.format(jar)

    commands = []
    for bam, sample_id in zip(bams, sample_ids):
        # Count reads with "samtools flagstat". This assumes the read1
        # count is on the fifth line of the flagstat output
        flagstat = 'samtools flagstat {}'.format(bam)
        p = subprocess.Popen(shlex.split(flagstat),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        out, err = p.communicate()
        n_read1 = float(out.split('\n')[4].split()[0]) / reads_multiplier

        vmin = min_reads
        vmax = n_read1 if max_reads is None else max_reads

        for reads in np.arange(vmin, vmax, step_size):
            # Fraction of reads to keep for this target read count
            downsample_prob = reads / n_read1
            for i in range(iter_per_percentage):
                out_bam = '{}_{:.1e}reads_iter{}.bam'.format(
                    sample_id, reads, i)
                random_seed = random_seed_base + i
                commands.append(
                    '{} INPUT={} OUTPUT={} RANDOM_SEED={} PROBABILITY={} '
                    'CREATE_INDEX=true'.format(
                        downsample_command, bam, out_bam, random_seed,
                        downsample_prob))

    sub = Submitter(sh_filename=out_sh, queue_type=queue_type,
                    commands=commands, job_name=name, walltime='1:00:00',
                    nodes=1, ppn=1, queue='home', array=True,
                    max_running=20)
    sub.write_sh(submit=submit)
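# A minimal standalone sketch of the downsampling math above, with
# illustrative values: each target read count maps to a DownsampleSam-style
# PROBABILITY of keeping a read, relative to the BAM's read-1 count.
n_read1 = 2e7  # read-1 count as parsed from "samtools flagstat"
for reads in (5e6, 1e7, 1.5e7):
    downsample_prob = reads / n_read1
    print('{}_{:.1e}reads_iter0.bam -> PROBABILITY={}'.format(
        'sample1', reads, downsample_prob))
# sample1_5.0e+06reads_iter0.bam -> PROBABILITY=0.25
# sample1_1.0e+07reads_iter0.bam -> PROBABILITY=0.5
# sample1_1.5e+07reads_iter0.bam -> PROBABILITY=0.75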
def __init__(self, job_name, out_sh, directory='./', queue_type='PBS',
             submit=False, downsampled=False):
    """
    Given a base folder which has a directory called "miso" where all
    the miso output is, submit a single job that concatenates the MISO
    output in that directory via "concatenate_miso.py".

    Parameters
    ----------
    job_name : str
        Name of the job to be submitted
    out_sh : str
        Filename to write all the submitter commands to
    directory : str
        Base directory, which has a "miso" directory there. This assumes
        the following directory structure:
            <directory>/miso/<sample_id>/<event_type>
        Where "<directory>" is the location specified through this
        variable. If you ran your MISO samples using the Yeo Lab
        pipeline, you're fine.
    downsampled : bool
        If True, add the "--downsampled" flag to the concatenate_miso.py
        call
    """
    downsampled = '--downsampled' if downsampled else ''
    commands = ['python concatenate_miso.py --directory {} {}'.format(
        directory, downsampled)]

    sub = Submitter(queue_type=queue_type, job_name=job_name,
                    sh_filename=out_sh, commands=commands, nodes=1, ppn=1,
                    queue='home', array=False, max_running=20)
    sub.write_sh(submit=submit)
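# Standalone sketch of the optional-flag handling above: the boolean
# "downsampled" becomes either "--downsampled" or an empty string in the
# generated command. The directory name is illustrative.
def concat_command(directory, downsampled):
    flag = '--downsampled' if downsampled else ''
    return 'python concatenate_miso.py --directory {} {}'.format(
        directory, flag)

print(concat_command('./my_project', True))
# python concatenate_miso.py --directory ./my_project --downsampled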
def __init__(self, fasta, kmer_size, job_name='sailfish_index',
             num_processors=8, out_sh=None, out_dir=None, submit=False):
    if num_processors > 16:
        raise ValueError('At most 16 processors can be specified, '
                         'but you asked for {}'.format(num_processors))
    if kmer_size > 31:
        raise ValueError('Maximum kmer size is 31 due to memory '
                         'limitations but "{}" was specified'.format(
                             kmer_size))
    if out_dir is None:
        out_dir = '{}_sailfish_index_k{}'.format(fasta, kmer_size)
    if out_sh is None:
        out_sh = job_name + '.sh'

    command = 'sailfish index --transcripts {0} --out {1} --kmerSize {2} ' \
              '--threads {3}'.format(fasta, out_dir, kmer_size,
                                     num_processors)
    sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                    commands=[command], job_name=job_name, nodes=1,
                    ppn=num_processors, queue='home', walltime='0:30:00')
    sub.write_sh(submit=submit)
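# Example of the generated "sailfish index" invocation, using the default
# output directory naming above; the fasta path is illustrative.
fasta, kmer_size, num_processors = 'transcripts.fa', 31, 8
out_dir = '{}_sailfish_index_k{}'.format(fasta, kmer_size)
print('sailfish index --transcripts {0} --out {1} --kmerSize {2} '
      '--threads {3}'.format(fasta, out_dir, kmer_size, num_processors))
# sailfish index --transcripts transcripts.fa
#     --out transcripts.fa_sailfish_index_k31 --kmerSize 31 --threads 8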
def __init__(self, job_name, out_sh, directory='./', queue_type='PBS',
             submit=False):
    """
    Given a base folder which has a directory called "miso" where all
    the miso output is, search for bad events in the subfolders and then
    write them to a "nan_events.txt" file for that sample and event type.

    Parameters
    ----------
    job_name : str
        Name of the array job to be submitted
    out_sh : str
        Filename to write all the submitter commands to
    directory : str
        Base directory, which has a "miso" directory there. This assumes
        the following directory structure:
            <directory>/miso/<sample_id>/<event_type>
        Where "<directory>" is the location specified through this
        variable. If you ran your MISO samples using the Yeo Lab
        pipeline, you're fine.
    """
    commands = []
    glob_command = '{}/miso/*/*'.format(directory.rstrip('/'))
    for folder in iglob(glob_command):
        # Record the first "nan" hit in each chromosome's MISO output
        command = 'cd %s ; grep -m 1 nan chr*/*.miso > nan_events.txt' \
                  % folder
        commands.append(command)

    sub = Submitter(queue_type=queue_type, job_name=job_name,
                    sh_filename=out_sh, commands=commands, nodes=1, ppn=1,
                    queue='home', array=True, max_running=20)
    sub.write_sh(submit=submit)
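# Standalone sketch of the command construction above: one grep command per
# <sample_id>/<event_type> folder under "miso". The base directory is
# illustrative; this prints nothing if no such folders exist.
from glob import iglob

directory = './my_project'
for folder in iglob('{}/miso/*/*'.format(directory.rstrip('/'))):
    print('cd %s ; grep -m 1 nan chr*/*.miso > nan_events.txt' % folder)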
def test_pbs(self):
    """Test PBS queue (TSCC)
    """
    job_name = 'test_qtools_submitter_pbs'
    submit_sh = '{}/{}.sh'.format(self.out_dir, job_name)
    sub = Submitter(queue_type='PBS', sh_filename=submit_sh,
                    commands=self.commands, job_name=job_name, nodes=1,
                    ppn=1, queue='home-yeo', walltime='0:01:00')
    job_id = sub.job(submit=False)
    true_result_string = '''#!/bin/bash
#PBS -N test_qtools_submitter_pbs
#PBS -o {0}/test_qtools_submitter_pbs.sh.out
#PBS -e {0}/test_qtools_submitter_pbs.sh.err
#PBS -V
#PBS -l walltime=0:01:00
#PBS -l nodes=1:ppn=1
#PBS -A yeo-group
#PBS -q home-yeo

# Go to the directory from which the script was called
cd $PBS_O_WORKDIR

date
echo testing
'''.format(self.out_dir)
    true_result = true_result_string.split('\n')

    for true, test in zip(true_result, open(submit_sh)):
        self.assertEqual(true.strip().split(), test.strip().split())

    # Make sure the job ID is a single (potentially multi-digit) integer,
    # but only do this if we're on TSCC or oolite
    if ON_SERVER:
        self.assertRegexpMatches(job_id, '^\d+$')
        subprocess.Popen(["qdel", job_id], stdout=PIPE)
def test_wait_for_array_pbs(self):
    commands = ['date', 'echo testing PBS']
    job_name = 'test_qtools_submitter_wait_for_pbs'
    submit_sh = '%s/%s.sh' % (tests.get_test_dir(), job_name)
    sub = Submitter(queue_type='PBS', sh_file=submit_sh,
                    command_list=commands, job_name=job_name,
                    wait_for_array=['11111'])
    job_id = sub.write_sh(submit=True, nodes=1, ppn=16, queue='home-yeo',
                          walltime='0:01:00')
    true_result_string = '''#!/bin/sh
#PBS -N test_qtools_submitter_wait_for_pbs
#PBS -o %s/test_qtools_submitter_wait_for_pbs.sh.out
#PBS -e %s/test_qtools_submitter_wait_for_pbs.sh.err
#PBS -V
#PBS -l walltime=0:01:00
#PBS -l nodes=1:ppn=16
#PBS -A yeo-group
#PBS -q home-yeo
#PBS -W depend=afterokarray:11111

# Go to the directory from which the script was called
cd $PBS_O_WORKDIR

date
echo testing PBS
''' % (tests.get_test_dir(), tests.get_test_dir())
    true_result = true_result_string.split('\n')

    for true, test in zip(true_result, open(submit_sh)):
        self.assertEqual(true.strip().split(), test.strip().split())

    # Make sure the job ID is a single (potentially multi-digit) integer
    self.assertRegexpMatches(job_id, '^\d+$')
    subprocess.Popen(["qdel", job_id], stdout=PIPE)
def __init__(self, read1, read2, out_dir, index, stranded=False,
             not_gzipped=False, job_name='sailfish_quant',
             num_processors=8, out_sh=None, submit=False,
             queue_name='home'):
    paired_end = read2 is not None
    library_parameters = ['TYPE=PE', 'ORIENTATION=><'] if paired_end \
        else ['TYPE=SE']
    if stranded:
        strand = 'STRAND=AS' if paired_end else 'STRAND=A'
    else:
        # Unstranded library
        strand = 'STRAND=U'
    library_parameters.append(strand)
    library_string = ':'.join(library_parameters)

    if not_gzipped:
        read_template = r'{}'
    else:
        # Decompress gzipped reads on the fly via process substitution
        read_template = r'<(gunzip -c {})'

    if read2 is not None:
        read1 = '-1 {}'.format(read_template.format(read1))
        read2 = '-2 {}'.format(read_template.format(read2))
        reads = '{} {}'.format(read1, read2)
    else:
        reads = '-r {}'.format(read_template.format(read1))

    command = 'sailfish quant --index {0} -l "{1}" {2} --out {3} ' \
              '--threads {4}'.format(index, library_string, reads,
                                     out_dir, num_processors)
    sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                    commands=[command], job_name=job_name, nodes=1,
                    ppn=num_processors, queue=queue_name,
                    walltime='0:30:00')
    sub.write_sh(submit=submit)
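# Standalone sketch of the sailfish library-string logic above, covering
# all four combinations of paired-end and strandedness.
def library_string(paired_end, stranded):
    params = ['TYPE=PE', 'ORIENTATION=><'] if paired_end else ['TYPE=SE']
    if stranded:
        params.append('STRAND=AS' if paired_end else 'STRAND=A')
    else:
        params.append('STRAND=U')
    return ':'.join(params)

for paired_end in (True, False):
    for stranded in (True, False):
        print(library_string(paired_end, stranded))
# TYPE=PE:ORIENTATION=><:STRAND=AS
# TYPE=PE:ORIENTATION=><:STRAND=U
# TYPE=SE:STRAND=A
# TYPE=SE:STRAND=U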
def __init__(self, genomeDir, genomeFastaFiles, sjdb, sjdbOverhang,
             job_name, out_sh=None, submit=True):
    """Any CamelCase here is directly copied from the STAR inputs for
    complete compatibility
    """
    # Make the directory if it's not there already
    try:
        os.mkdir(genomeDir)
    except OSError:
        pass

    commands = []
    commands.append('STAR --runMode genomeGenerate --genomeDir {0} '
                    '--genomeFastaFiles {1} --runThreadN 16 {2} '
                    '--sjdbOverhang {3}'.format(genomeDir,
                                                genomeFastaFiles, sjdb,
                                                sjdbOverhang))

    sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                    commands=commands, job_name=job_name, nodes=1, ppn=16,
                    queue='home', walltime='4:00:00')
    sub.job(submit=submit)
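# Example of the generated STAR command, with illustrative paths; the
# "sjdb" argument is passed through verbatim, so it can carry any
# splice-junction flag such as "--sjdbGTFfile".
print('STAR --runMode genomeGenerate --genomeDir {0} '
      '--genomeFastaFiles {1} --runThreadN 16 {2} '
      '--sjdbOverhang {3}'.format('star_index', 'genome.fa',
                                  '--sjdbGTFfile annotation.gtf', 99))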
def test_sge(self):
    """Test SGE queue (oolite)
    """
    job_name = 'test_qtools_submitter_sge'
    submit_sh = '{}/{}.sh'.format(self.out_dir, job_name)
    sub = Submitter(queue_type='SGE', sh_filename=submit_sh,
                    commands=self.commands, job_name=job_name, nodes=1,
                    ppn=1, queue='home-yeo', walltime='0:01:00')
    job_id = sub.job(submit=False)
    true_result_string = '''#!/bin/bash
#$ -N test_qtools_submitter_sge
#$ -o {0}/test_qtools_submitter_sge.sh.out
#$ -e {0}/test_qtools_submitter_sge.sh.err
#$ -V
#$ -S /bin/bash
#$ -cwd
#$ -l bigmem
#$ -l h_vmem=16G

date
echo testing
'''.format(self.out_dir)
    true_result = true_result_string.split('\n')

    for true, test in zip(true_result, open(submit_sh)):
        self.assertEqual(true.strip().split(), test.strip().split())

    # Make sure the job ID is a single (potentially multi-digit) integer,
    # but only do this if we're on TSCC or oolite
    if ON_SERVER:
        self.assertRegexpMatches(job_id, '^\d+$')
        subprocess.Popen(["qdel", job_id], stdout=PIPE)
def __init__(self, bam, sample_info_file, sample_id, output_sh, genome,
             walltime, nodes=1, ppn=16, submit=False, read_length=None):
    """Write and optionally submit MISO shell scripts for one or more
    samples.

    Parameters
    ----------
    bam : str
        BAM file for a single sample (ignored if sample_info_file is
        given)
    sample_info_file : str or None
        Tab-separated, headerless file whose first column is a BAM path
        and whose second column is the corresponding sample ID
    sample_id : str
        ID of the single sample (ignored if sample_info_file is given)
    output_sh : str
        Shell script filename for the single sample's MISO commands
    genome : str
        Genome build, e.g. "hg19"
    walltime : str
        Maximum walltime for each submitted job
    nodes : int
        Number of nodes to request
    ppn : int
        Processors per node to request
    submit : bool
        If True, submit the generated scripts to the cluster
    read_length : int or None
        Read length of the samples, if known
    """
    self.sample_info_file = sample_info_file
    if self.sample_info_file is not None:
        sample_info = pd.read_table(self.sample_info_file, header=None)
        self.bams = sample_info[0]
        self.sample_ids = sample_info[1]
        self.sh_files = ['{}.miso.sh'.format(bam) for bam in self.bams]
        self.multiple_samples = True
    else:
        self.sample_ids = [sample_id]
        self.bams = [bam]
        self.sh_files = [output_sh]
        self.multiple_samples = False

    self.genome = genome
    self.walltime = walltime
    self.submit = submit
    self.nodes = nodes
    self.ppn = ppn
    self.read_length = read_length

    all_samples_commands = []

    for bam, sample_id, sh_file in zip(self.bams, self.sample_ids,
                                       self.sh_files):
        self._write_single_sample(bam, sample_id, sh_file)
        sh_command = 'bash {}'.format(sh_file)
        if self.submit and not self.multiple_samples:
            commands = [sh_command]
            sub = Submitter(commands, job_name='miso',
                            sh_filename='{}.qsub.sh'.format(sh_file),
                            ppn=self.ppn, nodes=self.nodes,
                            walltime=self.walltime)
            sub.job(submit=self.submit)

        if self.multiple_samples:
            all_samples_commands.append(sh_command)

    if self.multiple_samples:
        sub = Submitter(all_samples_commands, job_name='miso',
                        sh_filename='miso.qsub.sh', array=True,
                        ppn=self.ppn, nodes=self.nodes,
                        walltime=self.walltime)
        sub.job(submit=self.submit)
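# Sketch of the sample_info_file format inferred from the
# pd.read_table(..., header=None) call above: a headerless, tab-separated
# table whose first column is a BAM path and whose second column is the
# sample ID. File name and contents are illustrative.
import pandas as pd

pd.DataFrame([['a.bam', 'sample_a'],
              ['b.bam', 'sample_b']]).to_csv('sample_info.txt', sep='\t',
                                             header=False, index=False)
sample_info = pd.read_table('sample_info.txt', header=None)
print(sample_info[0].tolist())  # ['a.bam', 'b.bam']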
def summary(self):
    job_name_base = '%s_summary' % self.job_name_prefix
    job_name = job_name_base
    submit_sh = '%s/%s.sh' % (self.sh_scripts_dir, job_name_base)

    summary_commands = []
    for bam, sample_id, psi_output_dir, summary_output_dir in \
            zip(self.bams, self.sample_ids, self.psi_output_dirs,
                self.summary_output_dirs):
        # Okay, now we are ready to write to the submitter script
        summary_commands.append('\n\n# --- %s --- #' % sample_id)

        # add a line of padding and the sample id to the output file
        summary_commands.append('\necho\necho "--- %s ----"' % sample_id)
        summary_commands.append('date')

        summary_command = 'python %s/run_miso.py --summarize-samples %s ' \
                          '%s >%s/summary.out 2>%s/summary.err' \
                          % (self.miso_scripts_dir, psi_output_dir,
                             psi_output_dir, psi_output_dir,
                             psi_output_dir)
        summary_commands.append(summary_command)

        summary_commands.append('# Copy over the summary files AFTERWARD '
                                'to prevent overloading the home '
                                'directory')
        temp_summary_file = '%s/summary/%s.miso_summary' % (
            psi_output_dir, sample_id)
        final_summary_file = '%s/summary/%s.miso_summary' % (
            summary_output_dir, sample_id)
        summary_commands.append('mkdir -p %s/summary'
                                % summary_output_dir)
        summary_commands.append('cp %s %s' % (temp_summary_file,
                                              final_summary_file))

    # Tell the queue to parallelize this job into a job array: one task
    # per sample, at most 8 running at once
    additional_resources = {'-t': '1-%d%%%d' % (len(self.sample_ids), 8)}

    sub = Submitter(queue_type='PBS', sh_file=submit_sh,
                    command_list=summary_commands, job_name=job_name,
                    additional_resources=additional_resources,
                    wait_for_array=self.psi_job_id)
    self.summary_job_id = sub.write_sh(submit=True, nodes=self.num_cores,
                                       ppn=2, queue=self.queue,
                                       walltime=self.summary_walltime)
    print self.summary_job_id
def psi(self):
    """
    Submit a job to the cluster to compute 'psi' (percent spliced-in)
    scores of the splicing events and bam files provided.
    """
    psi_name = '%s_psi' % self.job_name_prefix
    job_name = psi_name
    submit_sh_base = '%s/%s' % (self.sh_scripts_dir, psi_name)
    submit_sh = '%s.sh' % submit_sh_base

    psi_commands = []

    # Write a separate block of commands for each sample, so failures of
    # individual samples can be tracked
    for bam, sample_id, output_dir in zip(self.bams, self.sample_ids,
                                          self.psi_output_dirs):
        # Establish which files we're working with
        insert_len_file = bam + '.insert_len'
        insert_len_commands, insert_len_arguments = self\
            ._get_psi_insert_len_argument(sample_id, insert_len_file)

        # Okay, now we are ready to write to the submitter script
        psi_commands.append('\n\n# --- %s --- #' % sample_id)

        # Need to **extend** with a list, not append
        psi_commands.extend(insert_len_commands)

        # add a line of padding and the sample id to the output file
        psi_commands.append('\necho\necho "--- %s ----"' % sample_id)
        psi_commands.append('date')

        # Get the read length. Gonna keep this as bash because samtools
        # and less are very fast
        read_len = '%s_READ_LEN' % sample_id
        psi_commands.append(
            '\n# Assuming that the first read of the bam file is '
            'representative, such that all the reads in the '
            '\n# file are exactly the same length, we can take the first '
            'read from the bam file and measure its length, '
            '\n# and use that for our algorithm')
        psi_commands.append(
            "%s=$(samtools view %s | head -n 1 | cut -f 10 | awk '{ print"
            " length }')" % (read_len, bam))

        # Finally we are ready to write the actual miso command!
        log_filename = 'psi'
        stderr = '%s/%s.err' % (output_dir, log_filename)
        stdout = '%s/%s.out' % (output_dir, log_filename)

        psi_command = 'python %s --run %s %s --output-dir %s ' \
                      '--read-len $%s %s -p %d %s >' \
                      ' %s 2> %s' \
                      % (self.miso, self.event_type_index, bam,
                         output_dir, read_len, insert_len_arguments,
                         self.num_processes, self.extra_miso_arguments,
                         stdout, stderr)
        psi_commands.append('date')
        psi_commands.append("echo Starting ...... '%s'" % psi_command)
        psi_commands.append(psi_command)

    sub = Submitter(queue_type='PBS', sh_file=submit_sh,
                    command_list=psi_commands, job_name=job_name)
    self.psi_job_is_array = True
    # Parallelize as a job array: one task per sample, at most 8 at once
    self.psi_job_id = sub.write_sh(
        submit=True, nodes=self.num_cores, ppn=self.num_processes,
        queue=self.queue, walltime=self.psi_walltime,
        additional_resources={'-t': '1-%d%%%d' % (len(self.sample_ids),
                                                  8)})
    print self.psi_job_id
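# A Python equivalent of the bash read-length trick above, as a sketch:
# take the first read in the BAM and measure its sequence length, assuming
# all reads are the same length. Uses plain subprocess + samtools rather
# than the project's code.
import subprocess

def first_read_length(bam):
    view = subprocess.Popen(['samtools', 'view', bam],
                            stdout=subprocess.PIPE)
    line = view.stdout.readline()
    view.terminate()
    return len(line.split('\t')[9])  # column 10 of SAM is the sequence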
def insert_len(self):
    """
    For the provided .bam files, checks if there is an insert length file
    associated with it (....bam.insert_len), and if not, adds the command
    to compute its insert length to a list.

    Outputs the job ID of the insert_len script
    """
    # If we are treating these as single-ended reads, don't do anything
    if self.read_type == 'single_end':
        return

    constitutive_exons_dir = '%s/%s_constitutive' % (
        self.base_annotation_dir, self.event_type)

    # Bug: there may be more than one constitutive exons GFF in this
    # folder, and we only grab the first one
    constitutive_exons_gff = glob('%s/*.gff' % constitutive_exons_dir)[0]

    insert_len_name = '%s_insert_len%s' % (self.job_name_prefix,
                                           self.submit_sh_suffix)
    insert_len_sh_base = '%s/%s' % (self.sh_scripts_dir, insert_len_name)
    all_insert_len_sh = ['#!/bin/bash\n\n']

    for bam, sample_id in zip(self.bams, self.sample_ids):
        # Command-line commands to submit to the cluster
        insert_len_commands = []
        bam_dir = os.path.dirname(bam)
        insert_len_file = bam + '.insert_len'
        try:
            open(insert_len_file)
        except IOError:
            # There is no insert length file, so create it
            insert_len_command = 'python %s/pe_utils.py ' \
                                 '--compute-insert-len %s %s ' \
                                 ' --output-dir %s ' \
                                 '>%s.out 2>%s' \
                                 % (self.miso_scripts_dir, bam,
                                    constitutive_exons_gff, bam_dir,
                                    insert_len_file, insert_len_file)
            insert_len_commands.append('date')
            insert_len_commands.append("echo Starting ... '%s'"
                                       % insert_len_command)
            insert_len_commands.append(insert_len_command)

        insert_len_sh = '%s_%s.sh' % (insert_len_sh_base, sample_id)
        all_insert_len_sh.append('\n# --- %s --- #\nqsub %s\n'
                                 % (sample_id, insert_len_sh))

        sub = Submitter(queue_type='PBS', sh_file=insert_len_sh,
                        command_list=insert_len_commands,
                        job_name=insert_len_name)
        self.insert_len_job_id[sample_id] = sub.write_sh(
            submit=True, nodes=self.num_cores, ppn=self.num_processes,
            queue=self.queue, walltime='0:30:00')
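# Minimal sketch of the "only compute if missing" check above, written
# with os.path.exists instead of the try/open idiom; the path is
# illustrative.
import os

insert_len_file = 'sample.bam.insert_len'
if not os.path.exists(insert_len_file):
    print('would queue insert-length computation for ' + insert_len_file)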
# Submit a parse_oldsplice.py job over a set of old-splice output files.
# Assumed: the species comes from the first command-line argument, and
# "files" is the list of *.splices files in the working directory.
import sys
from glob import glob

species = None
try:
    species = sys.argv[1]
    print species
except:
    print "usage: submit_parse_oldsplice.py <species>"
assert (species != None) and (len(species) > 0)

filenames = []
samples = []
files = glob('*.splices*')
for filename in files:
    filenames.append(filename)
    sample = filename.replace(".splices", "").replace(".flip", "_flip")
    samples.append(sample)

from gscripts.qtools import Submitter

sub = Submitter()

cmd = "parse_oldsplice.py --species %s" % species
for filename, sample in zip(filenames, samples):
    cmd += " --sample %s %s " % (filename, sample)

cmd = [cmd]
sub.job(command_list=cmd, array=False, sh_file="parse.sh",
        job_name="parse", submit=True, queue="home", ppn=1)
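# Standalone sketch of the sample-name derivation and command assembly
# above; the species and filenames are illustrative.
files = ['a.splices', 'b.flip.splices']
cmd = 'parse_oldsplice.py --species %s' % 'hg19'
for filename in files:
    sample = filename.replace('.splices', '').replace('.flip', '_flip')
    cmd += ' --sample %s %s ' % (filename, sample)
print(cmd)
# parse_oldsplice.py --species hg19 --sample a.splices a  --sample
#     b.flip.splices b_flip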