def _annotate_gtf(self, read_length):
    """Add a job that generates the annotation FASTA files (features,
    genome, splices, genes) from the species GTF."""
    annotate_gtf = Job(name='annotate_gtf')
    annotate_gtf.invoke('all', self._state_update % 'Generating annotation FASTA files')

    prefix = self._get_index_hash(read_length)

    # Inputs
    gtf = File('%s.gtf' % self._species.name)

    chromosomes = self._species.chromosomes
    for i in chromosomes:
        chr_i = File('%s/chr%s.fa' % (self._species.name, i))

        # Uses
        annotate_gtf.uses(chr_i, link=Link.INPUT)

    # Outputs
    features = File('h%s/FEATURES.fa' % prefix)
    chrs = File('h%s/GENOME.fa' % prefix)
    splices = File('h%s/SPLICES.fa' % prefix)
    genes = File('h%s/GENE.fa' % prefix)

    # Arguments
    annotate_gtf.addArguments(gtf, '-c', self._species.name, '-p h%s/' % prefix, '-l %d' % read_length)

    # Uses
    annotate_gtf.uses(gtf, link=Link.INPUT)
    annotate_gtf.uses(features, link=Link.OUTPUT, transfer=False, register=False)
    annotate_gtf.uses(chrs, link=Link.OUTPUT, transfer=False, register=False)
    annotate_gtf.uses(splices, link=Link.OUTPUT, transfer=False, register=False)
    annotate_gtf.uses(genes, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(annotate_gtf)
def _analyze(self):
    """Add a job that analyzes the merged SAM file and produces the
    per-gene/per-feature count files plus a summary."""
    analyze = Job(name='analyze_samfile')
    analyze.invoke('all', self._state_update % 'Analyzing SAM file')

    # Input files
    sam_file = File('%s.sam' % self._prefix)

    # Output files
    genes_counts = File('%s.gene.cnts' % self._prefix)
    features_counts = File('%s.feature.cnts' % self._prefix)
    ambiguous_genes_counts = File('%s.ambiguousGenes.cnts' % self._prefix)
    overlap_genes_counts = File('%s.overlapGene.cnts' % self._prefix)
    summary_out = File('%s.summary.out' % self._prefix)

    # Arguments
    analyze.addArguments(sam_file, '--prefix', self._prefix)

    # Uses
    analyze.uses(sam_file, link=Link.INPUT)
    analyze.uses(genes_counts, link=Link.OUTPUT, transfer=True, register=False)
    analyze.uses(features_counts, link=Link.OUTPUT, transfer=True, register=False)
    analyze.uses(ambiguous_genes_counts, link=Link.OUTPUT, transfer=True, register=False)
    analyze.uses(overlap_genes_counts, link=Link.OUTPUT, transfer=True, register=False)
    analyze.uses(summary_out, link=Link.OUTPUT, transfer=True, register=False)

    self.adag.addJob(analyze)
def _pre_filter_fastq(self, index, suffix_len):
    """Add a job that pre-filters one split of the input reads, emitting
    trimmed/full/reject FASTQ parts and an adaptor-stats file."""
    pre_filter = Job(name='pre_filter_fastq.py')
    pre_filter.invoke('all', self._state_update % 'Pre-filter reads file part %d' % (index + 1))

    prefix = 'reads%d' % index

    # Inputs
    reads = File(('x%0' + str(suffix_len) + 'd') % index)

    # Outputs
    full_fastq = File('%s_full.fastq' % prefix)
    reject = File('%s_reject.fastq' % prefix)
    stats = File('%s.stats' % prefix)

    # Arguments
    trims = ','.join([str(i) for i in self._trims])
    # Fall back to '0' when no trim lengths are configured; the joined
    # string is empty in that case (the old `trims == ','` guard could
    # never match output of join()).
    trims = '0' if not trims else trims

    pre_filter.addArguments(reads, '-r', '%d' % self._read_length, '-t', '%s' % trims)
    pre_filter.addArguments('-p', prefix)

    # Uses
    pre_filter.uses(reads, link=Link.INPUT)

    for t in self._trims:
        fastq_t = File('%s_%d.fastq' % (prefix, t))
        pre_filter.uses(fastq_t, link=Link.OUTPUT, transfer=False, register=False)

    pre_filter.uses(full_fastq, link=Link.OUTPUT, transfer=False, register=False)
    pre_filter.uses(reject, link=Link.OUTPUT, transfer=False, register=False)
    pre_filter.uses(stats, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(pre_filter)
def _perm_index(self, index_type, read_length, read_format='fastq', seed='F2'):
    """Add a job that pre-computes a PerM index for the given annotation
    FASTA and return the job for dependency wiring."""
    perm_index = Job(name='perm')
    perm_index.invoke('all', self._state_update % 'Pre-computing %s index file' % index_type.capitalize())

    prefix = self._get_index_hash(read_length)

    # Input files
    fa_input = File('h%s/%s.fa' % (prefix, index_type))

    # Output files
    hash_v = self._get_index_hash(read_length, seed)
    index = File('h%d_%s_%s_%s.index' % (hash_v, index_type, seed, read_length))

    # Arguments
    perm_index.addArguments(fa_input, '%d' % read_length, '--readFormat', read_format, '--seed', seed)
    perm_index.addArguments('-s', index)

    # Uses
    perm_index.uses(fa_input, link=Link.INPUT)

    # Save this file
    perm_index.uses(index, link=Link.OUTPUT, transfer=True, register=True)

    self.adag.addJob(perm_index)

    return perm_index
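# The index LFN above is the contract between _perm_index (producer) and
# _perm (consumer): both must render 'h<hash>_<type>_<seed>_<readlen>.index'
# identically. A hypothetical helper (not in the original code) that both
# sites could share, assuming hash_v is numeric as the '%d' format implies:
@staticmethod
def _index_filename(hash_v, index_type, seed, read_length):
    # Hypothetical sketch only, e.g.:
    #   _index_filename(1234, 'genome', 'F2', 100) -> 'h1234_genome_F2_100.index'
    return 'h%d_%s_%s_%d.index' % (hash_v, index_type, seed, read_length)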
def _bar_plot(self):
    """Add a job that plots the summary file as a bar chart."""
    bar_plot = Job(name='bar_plot')
    bar_plot.invoke('all', self._state_update % 'Plotting summary file')

    # Input files
    summary_file = File('%s.summary.out' % self._prefix)

    # Output files (PostScript output, despite the old `pdf_file` name)
    plot_file = File('%s.ps' % self._prefix)

    # Arguments
    bar_plot.addArguments('--output-file', plot_file, summary_file)

    # Uses
    bar_plot.uses(summary_file, link=Link.INPUT)
    bar_plot.uses(plot_file, link=Link.OUTPUT, transfer=True, register=False)

    self.adag.addJob(bar_plot)
def _farish_compact(self):
    """Add a job that compacts the unmapped-reads FASTQ with farish."""
    farish_compact = Job(name='farish_compact')
    farish_compact.invoke('all', self._state_update % 'Farish Compact')

    # Input files
    unmapped = File('%s.unmapped.fastq' % self._prefix)

    # Output files
    compact = File('%s.compact' % self._prefix)

    # Arguments
    farish_compact.addArguments(unmapped, '-o', compact)

    # Uses
    farish_compact.uses(unmapped, link=Link.INPUT)
    farish_compact.uses(compact, link=Link.OUTPUT, transfer=True, register=False)

    self.adag.addJob(farish_compact)
def _clipr(self, clip_to, reads, tag):
    """Add a clipR job that maps clipped reads against the given annotation
    FASTA to generate new splice candidates."""
    anchor = self._compute_clip_seed(self._read_length)

    clip_reads = Job(name='clipR')
    clip_reads.invoke('all', self._state_update % 'Generate new splice candidates')

    seed = 'F%s' % self._clip_seed
    mismatches = self._clip_mismatches

    # Input files
    prefix = self._get_index_hash(self._read_length)
    fa = File('h%s/%s.fa' % (prefix, clip_to.upper()))
    reads_txt = File('%s_%s_reads.txt' % (tag, clip_to.lower()))

    for i in self._range():
        # Input files
        reads_i = File(reads % i)

        # Output files
        file_type = 'sam'
        path, file_name, ext = GTFAR._get_filename_parts(reads_i.name)
        # Wrapped in File() for consistency with every other uses() call.
        sam_mapping = File('%s_A_%d_%d_%d_%s.%s' % (clip_to.upper(), self._clip_seed, mismatches, anchor, file_name, file_type))
        fastq_out = File('%s_miss_%s%s' % (file_name, clip_to, ext))

        # Uses
        clip_reads.uses(reads_i, link=Link.INPUT)
        clip_reads.uses(fastq_out, link=Link.OUTPUT, transfer=False, register=False)
        clip_reads.uses(sam_mapping, link=Link.OUTPUT, transfer=False, register=False)

    # Output files
    log = File('%s_%s.log' % (tag, clip_to.lower()))

    # Arguments
    clip_reads.addArguments(fa, reads_txt, '--seed %s' % seed, '--anchorL %d' % anchor, '-e', '-v %d' % mismatches)
    clip_reads.addArguments('-s', '-u', '--noSamHeader', '--ignoreDummyR 40', '--ignoreRepeatR 15')
    clip_reads.setStdout(log)

    # Uses
    clip_reads.uses(fa, link=Link.INPUT)
    clip_reads.uses(reads_txt, link=Link.INPUT)
    clip_reads.uses(log, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(clip_reads)
def _perm(self, index_type, map_to, reads, tag, output_sam=False):
    """Add a PerM job that maps each read split against the pre-computed
    index, emitting per-split mapping/SAM files and the missed-reads FASTQ."""
    perm = Job(name='perm')
    perm.invoke('all', self._state_update % 'Map reads to %s' % map_to.capitalize())

    # Input files
    hash_v = self._get_index_hash(self._read_length, 'F%d' % self._seed)
    index = File('h%d_%s_F%d_%d.index' % (hash_v, map_to, self._seed, self._read_length))
    reads_txt = File('%s_%s_reads.txt' % (tag, map_to.lower()))

    for i in self._range():
        # Input files
        reads_i = File(reads % i)

        # Output files
        file_type = 'sam' if output_sam else 'mapping'
        path, file_name, ext = GTFAR._get_filename_parts(reads_i.name)
        # Wrapped in File() for consistency with every other uses() call.
        sam_mapping = File('%s_B_%d_%d_%s.%s' % (map_to.upper(), self._seed, self._mismatches, file_name, file_type))
        fastq_out = File('%s_miss_%s%s' % (file_name, map_to, ext))

        # Uses
        perm.uses(reads_i, link=Link.INPUT)
        perm.uses(fastq_out, link=Link.OUTPUT, transfer=False, register=False)
        perm.uses(sam_mapping, link=Link.OUTPUT, transfer=False, register=False)

    # Output files
    log = File('%s_%s.log' % (tag, map_to.upper()))

    # Arguments
    perm.addArguments(index, reads_txt, '--seed F%d' % self._seed, '-v %d' % self._mismatches, '-B', '--printNM')
    perm.addArguments('-u', '-s', '-T %d' % self._read_length)

    if output_sam:
        perm.addArguments('--noSamHeader', '--outputFormat', 'sam')

    perm.setStdout(log)

    # Uses
    perm.uses(index, link=Link.INPUT)
    perm.uses(reads_txt, link=Link.INPUT)
    perm.uses(log, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(perm)
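# For illustration only (values are made up): with reads='reads%d_full.fastq',
# map_to='genome', seed=30, mismatches=3 and output_sam=True, split 0 yields
#   reads_i     -> 'reads0_full.fastq'
#   sam_mapping -> 'GENOME_B_30_3_reads0_full.sam'
#   fastq_out   -> 'reads0_full_miss_genome.fastq'
# assuming _get_filename_parts('reads0_full.fastq') returns
# ('', 'reads0_full', '.fastq'); that return shape is an assumption.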
def _fastq_split(self, splits=2, suffix_len=2):
    """Add a job that splits the input reads file into `splits` parts
    named x00, x01, ... (zero-padded to suffix_len digits)."""
    fastq_split = Job(name='fastq-split')
    fastq_split.invoke('all', self._state_update % 'Splitting input reads file into %d parts' % splits)

    # Inputs
    reads = File(self._reads)

    # Arguments
    fastq_split.addArguments(reads, '%d' % splits)

    # Uses
    fastq_split.uses(reads, link=Link.INPUT)

    for i in range(splits):
        split_i = File(('x%0' + str(suffix_len) + 'd') % i)

        # Outputs
        fastq_split.uses(split_i, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(fastq_split)
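# The zero-padded names produced here ('x%02d' % i -> 'x00', 'x01', ...) are
# the same LFNs that _pre_filter_fastq() declares as its input, which is how
# the pre-filter jobs pick up the split outputs; suffix_len must therefore
# match between the two calls.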
def _transcript_prediction(self):
    """Add a job that predicts transcripts from the feature counts and the
    splice-candidate GTF."""
    transcript_prediction = Job(name='transcript_prediction')
    transcript_prediction.invoke('all', self._state_update % 'Transcript Prediction')

    # Input files
    features_counts = File('%s.feature.cnts' % self._prefix)
    gtf = File('%s.splice_candidates.gtf' % self._prefix)

    # Output files
    transcript_counts = File('%s.transcripts.cnts' % self._prefix)

    # Arguments
    transcript_prediction.addArguments(features_counts, '-g', gtf)
    transcript_prediction.setStdout(transcript_counts)

    # Uses
    transcript_prediction.uses(features_counts, link=Link.INPUT)
    transcript_prediction.uses(gtf, link=Link.INPUT)
    transcript_prediction.uses(transcript_counts, link=Link.OUTPUT, transfer=True, register=False)

    self.adag.addJob(transcript_prediction)
def _parse_clipped_alignment(self, input_file):
    """Add a job that parses one clipped-alignment file into an .info file
    (collected later by _merge_info)."""
    parse_clipped_alignment = Job(name='parse_clipped_alignment')
    parse_clipped_alignment.invoke('all', self._state_update % 'Parse clipped alignment')

    # Input files (renamed to avoid shadowing the input_file parameter)
    alignment = File(input_file)

    # Output files
    info = File('%s.info' % alignment.name)
    self._info_files.append(info.name)

    # Arguments
    parse_clipped_alignment.addArguments(alignment)
    parse_clipped_alignment.setStdout(info)

    # Uses
    parse_clipped_alignment.uses(alignment, link=Link.INPUT)
    parse_clipped_alignment.uses(info, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(parse_clipped_alignment)
def _merge_stats(self):
    """Add a job that merges the per-split adaptor stats files into one."""
    merge_stats = Job(name='merge-stats')
    merge_stats.invoke('all', self._state_update % 'Merging adaptor stats files')

    # Outputs
    adaptor_stats = File('%s.adaptor.stats' % self._prefix)

    # Arguments
    # The merge tool receives a glob pattern; each per-split stats file is
    # still declared individually below so it gets staged in.
    merge_stats.addArguments('reads*.stats', adaptor_stats)

    for i in range(self._splits):
        # Inputs
        stats_i = File('reads%d.stats' % i)

        # Uses
        merge_stats.uses(stats_i, link=Link.INPUT)

    # Outputs
    merge_stats.uses(adaptor_stats, link=Link.OUTPUT, transfer=True, register=False)

    self.adag.addJob(merge_stats)
def _parse_alignment(self, input_file, tag):
    """Add a job that parses one alignment file into a .vis file using the
    configured strand rule."""
    parse_alignment = Job(name='parse_alignment')
    parse_alignment.invoke('all', self._state_update % 'Parse alignment')

    # Input files (renamed to avoid shadowing the input_file parameter)
    alignment = File(input_file)

    # Output files
    vis = File('%s.vis' % alignment.name)
    self._vis_files.append(vis.name)

    # Arguments
    parse_alignment.addArguments(alignment, '--strandRule', self._strand_rule, '--tag', tag)
    parse_alignment.setStdout(vis)

    # Uses
    parse_alignment.uses(alignment, link=Link.INPUT)
    parse_alignment.uses(vis, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(parse_alignment)
def _merge_info(self, info_files, gtf_file):
    """Add a job that merges the .info files into a splice-candidates GTF."""
    merge_info = Job(name='merge-info')
    merge_info.invoke('all', self._state_update % 'Merging info files to generate GTF file')

    # Outputs (renamed to avoid shadowing the gtf_file parameter)
    gtf = File(gtf_file)

    for info_file in info_files:
        # Inputs
        info_i = File(info_file)

        # Arguments
        merge_info.addArguments(info_i)

        # Uses
        merge_info.uses(info_i, link=Link.INPUT)

    # Arguments
    merge_info.addArguments(gtf)

    # Outputs
    merge_info.uses(gtf, link=Link.OUTPUT, transfer=True, register=False)

    self.adag.addJob(merge_info)
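# A minimal, hypothetical sketch (not part of the original class) of how the
# split/filter/merge builders above might be chained when constructing the
# DAX; the call order and parameters are assumptions, and the real driver may
# wire things differently or add explicit dependency edges elsewhere.
def _build_filter_stage_example(self):
    suffix_len = 2
    self._fastq_split(splits=self._splits, suffix_len=suffix_len)  # reads -> x00, x01, ...
    for i in range(self._splits):
        self._pre_filter_fastq(i, suffix_len)                      # x0i -> reads{i}_*.fastq + reads{i}.stats
    self._merge_stats()                                            # reads*.stats -> <prefix>.adaptor.stats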