def _clipr(self, clip_to, reads, tag):
    anchor = self._compute_clip_seed(self._read_length)

    clip_reads = Job(name='clipR')
    clip_reads.invoke('all', self._state_update % 'Generate new splice candidates')

    seed = 'F%s' % self._clip_seed
    mismatches = self._clip_mismatches

    # Input files
    prefix = self._get_index_hash(self._read_length)
    fa = File('h%s/%s.fa' % (prefix, clip_to.upper()))
    reads_txt = File('%s_%s_reads.txt' % (tag, clip_to.lower()))

    for i in self._range():
        # Input files
        reads_i = File(reads % i)

        # Output files
        file_type = 'sam'
        path, file_name, ext = GTFAR._get_filename_parts(reads_i.name)

        sam_mapping = File('%s_A_%d_%d_%d_%s.%s' % (clip_to.upper(), self._clip_seed, mismatches,
                                                     anchor, file_name, file_type))
        fastq_out = File('%s_miss_%s%s' % (file_name, clip_to, ext))

        # Uses
        clip_reads.uses(reads_i, link=Link.INPUT)
        clip_reads.uses(fastq_out, link=Link.OUTPUT, transfer=False, register=False)
        clip_reads.uses(sam_mapping, link=Link.OUTPUT, transfer=False, register=False)

    # Output files
    log = File('%s_%s.log' % (tag, clip_to.lower()))

    # Arguments
    clip_reads.addArguments(fa, reads_txt, '--seed %s' % seed, '--anchorL %d' % anchor, '-e',
                            '-v %d' % mismatches)
    clip_reads.addArguments('-s', '-u', '--noSamHeader', '--ignoreDummyR %d' % 40,
                            '--ignoreRepeatR %d' % 15)
    clip_reads.setStdout(log)

    # Uses
    clip_reads.uses(fa, link=Link.INPUT)
    clip_reads.uses(reads_txt, link=Link.INPUT)
    clip_reads.uses(log, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(clip_reads)
def _perm(self, index_type, map_to, reads, tag, output_sam=False):
    perm = Job(name='perm')
    perm.invoke('all', self._state_update % ('Map reads to %s' % map_to.capitalize()))

    # Input files
    hash_v = self._get_index_hash(self._read_length, 'F%d' % self._seed)
    index = File('h%d_%s_F%d_%d.index' % (hash_v, map_to, self._seed, self._read_length))
    reads_txt = File('%s_%s_reads.txt' % (tag, map_to.lower()))

    for i in self._range():
        # Input files
        reads_i = File(reads % i)

        # Output files
        file_type = 'sam' if output_sam else 'mapping'
        path, file_name, ext = GTFAR._get_filename_parts(reads_i.name)

        sam_mapping = File('%s_B_%d_%d_%s.%s' % (map_to.upper(), self._seed, self._mismatches,
                                                 file_name, file_type))
        fastq_out = File('%s_miss_%s%s' % (file_name, map_to, ext))

        # Uses
        perm.uses(reads_i, link=Link.INPUT)
        perm.uses(fastq_out, link=Link.OUTPUT, transfer=False, register=False)
        perm.uses(sam_mapping, link=Link.OUTPUT, transfer=False, register=False)

    # Output files
    log = File('%s_%s.log' % (tag, map_to.upper()))

    # Arguments
    perm.addArguments(index, reads_txt, '--seed F%d' % self._seed, '-v %d' % self._mismatches,
                      '-B', '--printNM')
    perm.addArguments('-u', '-s', '-T %d' % self._read_length)

    if output_sam:
        perm.addArguments('--noSamHeader', '--outputFormat', 'sam')

    perm.setStdout(log)

    # Uses
    perm.uses(index, link=Link.INPUT)
    perm.uses(reads_txt, link=Link.INPUT)
    perm.uses(log, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(perm)
def cat(inputs, output, o_link=Link.OUTPUT, o_transfer=False, o_register=False):
    cat = Job(name='merge')

    # Outputs
    output = File(output)

    for input_file in inputs:
        # Inputs
        input_file = File(input_file)

        # Arguments
        cat.addArguments(input_file)

        # Uses
        cat.uses(input_file, link=Link.INPUT)

    cat.setStdout(output)
    cat.uses(output, link=o_link, transfer=o_transfer, register=o_register)

    return cat
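# A minimal usage sketch (not from the original source): wiring the merge job
# returned by cat() into a workflow. The ".vis" file names and the DAX name are
# hypothetical, and cat() is assumed to be reachable as a plain function here
# (in the original it may well be a static method on the workflow class); the
# job simply concatenates its inputs into the file set as its stdout.
from Pegasus.DAX3 import ADAG, Link

adag = ADAG('gtfar-example')
vis_files = ['sample.0.vis', 'sample.1.vis', 'sample.2.vis']  # hypothetical per-chunk inputs
merge = cat(vis_files, 'sample.vis', o_link=Link.OUTPUT, o_transfer=True, o_register=False)
adag.addJob(merge)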
def _transcript_prediction(self):
    transcript_prediction = Job(name='transcript_prediction')
    transcript_prediction.invoke('all', self._state_update % 'Transcript Prediction')

    # Input files
    features_counts = File('%s.feature.cnts' % self._prefix)
    gtf = File('%s.splice_candidates.gtf' % self._prefix)

    # Output files
    transcript_counts = File('%s.transcripts.cnts' % self._prefix)

    # Arguments
    transcript_prediction.addArguments(features_counts, '-g', gtf)
    transcript_prediction.setStdout(transcript_counts)

    # Uses
    transcript_prediction.uses(features_counts, link=Link.INPUT)
    transcript_prediction.uses(gtf, link=Link.INPUT)
    transcript_prediction.uses(transcript_counts, link=Link.OUTPUT, transfer=True, register=False)

    self.adag.addJob(transcript_prediction)
def _parse_clipped_alignment(self, input_file):
    parse_clipped_alignment = Job(name='parse_clipped_alignment')
    parse_clipped_alignment.invoke('all', self._state_update % 'Parse clipped alignment')

    # Input files
    input_file = File(input_file)

    # Output files
    info = File('%s.info' % input_file.name)
    self._info_files.append(info.name)

    # Arguments
    parse_clipped_alignment.addArguments(input_file)
    parse_clipped_alignment.setStdout(info)

    # Uses
    parse_clipped_alignment.uses(input_file, link=Link.INPUT)
    parse_clipped_alignment.uses(info, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(parse_clipped_alignment)
def _parse_alignment(self, input_file, tag):
    parse_alignment = Job(name='parse_alignment')
    parse_alignment.invoke('all', self._state_update % 'Parse alignment')

    # Input files
    input_file = File(input_file)

    # Output files
    vis = File('%s.vis' % input_file.name)
    self._vis_files.append(vis.name)

    # Arguments
    parse_alignment.addArguments(input_file, '--strandRule', self._strand_rule, '--tag', tag)
    parse_alignment.setStdout(vis)

    # Uses
    parse_alignment.uses(input_file, link=Link.INPUT)
    parse_alignment.uses(vis, link=Link.OUTPUT, transfer=False, register=False)

    self.adag.addJob(parse_alignment)
def write(self, filename, name='dax'):
    """Generate Pegasus abstract workflow (DAX).

    Parameters
    ----------
    filename : `str`
        File to write the DAX to.
    name : `str`, optional
        Name of the DAX.

    Returns
    -------
    `Pegasus.ADAG`
        Abstract workflow used by Pegasus' planner.
    """
    dax = ADAG(name)

    # Add files to DAX-level replica catalog.
    catalog = {}
    for file_id in self.files:
        attrs = self.graph.node[file_id]
        f = File(attrs['lfn'])

        # Add physical file names, if any.
        urls = attrs.get('urls')
        if urls is not None:
            urls = urls.split(',')
            sites = attrs.get('sites')
            if sites is None:
                sites = len(urls) * ['local']
            else:
                sites = sites.split(',')
            for url, site in zip(urls, sites):
                f.addPFN(PFN(url, site))

        catalog[attrs['lfn']] = f
        dax.addFile(f)

    # Add jobs to the DAX.
    for task_id in self.tasks:
        attrs = self.graph.node[task_id]
        job = Job(name=attrs['name'], id=task_id)

        # Add job command line arguments replacing any file name with
        # respective Pegasus file object.
        args = attrs.get('args')
        if args is not None and args:
            args = args.split()
            lfns = list(set(catalog) & set(args))
            if lfns:
                indices = [args.index(lfn) for lfn in lfns]
                for idx, lfn in zip(indices, lfns):
                    args[idx] = catalog[lfn]
            job.addArguments(*args)

        # Specify job's inputs.
        inputs = [file_id for file_id in self.graph.predecessors(task_id)]
        for file_id in inputs:
            attrs = self.graph.node[file_id]
            f = catalog[attrs['lfn']]
            job.uses(f, link=Link.INPUT)

        # Specify job's outputs.
        outputs = [file_id for file_id in self.graph.successors(task_id)]
        for file_id in outputs:
            attrs = self.graph.node[file_id]
            f = catalog[attrs['lfn']]
            job.uses(f, link=Link.OUTPUT)

            streams = attrs.get('streams')
            if streams is not None:
                if streams & 1 != 0:
                    job.setStdout(f)
                if streams & 2 != 0:
                    job.setStderr(f)

        dax.addJob(job)

    # Add job dependencies to the DAX.
    for task_id in self.tasks:
        parents = set()
        for file_id in self.graph.predecessors(task_id):
            parents.update(self.graph.predecessors(file_id))
        for parent_id in parents:
            dax.depends(parent=dax.getJob(parent_id), child=dax.getJob(task_id))

    # Finally, write down the workflow in DAX format.
    with open(filename, 'w') as f:
        dax.writeXML(f)
    return dax
def write_dax(self, filename='workflow.dax', name='workflow'):
    """Generate Pegasus abstract workflow (DAX).

    Parameters
    ----------
    filename : `str`
        File to write the DAX to.
    name : `str`, optional
        Name of the DAX.

    Returns
    -------
    `Pegasus.ADAG`
        Abstract workflow used by Pegasus' planner.

    Raises
    ------
    `AttributeError`
        If either a task or a file node is missing a mandatory attribute.
    """
    dax = ADAG(name)

    # Process file nodes.
    for file_id in self.files:
        attrs = self.graph.node[file_id]
        try:
            name = attrs['lfn']
        except KeyError:
            msg = 'Mandatory attribute "{}" is missing.'
            raise AttributeError(msg.format('lfn'))
        file_ = File(name)

        # Add physical file names, if any.
        urls = attrs.get('pfn')
        if urls is not None:
            urls = urls.split(',')
            sites = attrs.get('sites')
            if sites is None:
                sites = len(urls) * ['condorpool']
            else:
                sites = sites.split(',')
            for url, site in zip(urls, sites):
                file_.addPFN(PFN(url, site))

        self.catalog[attrs['lfn']] = file_

    # Add jobs to the DAX.
    for task_id in self.tasks:
        attrs = self.graph.node[task_id]
        try:
            name = attrs['exec_name']
        except KeyError:
            msg = 'Mandatory attribute "{}" is missing.'
            raise AttributeError(msg.format('exec_name'))
        label = '{name}_{id}'.format(name=name, id=task_id)
        job = Job(name, id=task_id, node_label=label)

        # Add job command line arguments replacing any file name with
        # respective Pegasus file object.
        args = attrs.get('exec_args', [])
        if args:
            args = args.split()
            lfns = list(set(self.catalog) & set(args))
            if lfns:
                indices = [args.index(lfn) for lfn in lfns]
                for idx, lfn in zip(indices, lfns):
                    args[idx] = self.catalog[lfn]
            job.addArguments(*args)

        # Specify job's inputs.
        inputs = [file_id for file_id in self.graph.predecessors(task_id)]
        for file_id in inputs:
            attrs = self.graph.node[file_id]
            is_ignored = attrs.get('ignore', False)
            if not is_ignored:
                file_ = self.catalog[attrs['lfn']]
                job.uses(file_, link=Link.INPUT)

        # Specify job's outputs.
        outputs = [file_id for file_id in self.graph.successors(task_id)]
        for file_id in outputs:
            attrs = self.graph.node[file_id]
            is_ignored = attrs.get('ignore', False)
            if not is_ignored:
                file_ = self.catalog[attrs['lfn']]
                job.uses(file_, link=Link.OUTPUT)

                streams = attrs.get('streams')
                if streams is not None:
                    if streams & 1 != 0:
                        job.setStdout(file_)
                    if streams & 2 != 0:
                        job.setStderr(file_)

        # Provide default files to store stdout and stderr, if not
        # specified explicitly.
        if job.stdout is None:
            file_ = File('{name}.out'.format(name=label))
            job.uses(file_, link=Link.OUTPUT)
            job.setStdout(file_)
        if job.stderr is None:
            file_ = File('{name}.err'.format(name=label))
            job.uses(file_, link=Link.OUTPUT)
            job.setStderr(file_)

        dax.addJob(job)

    # Add job dependencies to the DAX.
    for task_id in self.tasks:
        parents = set()
        for file_id in self.graph.predecessors(task_id):
            parents.update(self.graph.predecessors(file_id))
        for parent_id in parents:
            dax.depends(parent=dax.getJob(parent_id), child=dax.getJob(task_id))

    # Finally, write down the workflow in DAX format.
    with open(filename, 'w') as f:
        dax.writeXML(f)
    return dax
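# Sketch of the graph-node schema write_dax() expects (illustrative only; the
# concrete task and file names are made up). File nodes carry 'lfn' plus the
# optional 'pfn', 'sites', 'ignore', and 'streams' attributes; task nodes carry
# 'exec_name' plus the optional 'exec_args'. Edges run file -> task for job
# inputs and task -> file for job outputs. How this graph is attached to the
# object providing write_dax() (its graph/files/tasks/catalog attributes) is
# outside the scope of this sketch.
import networkx as nx

graph = nx.DiGraph()
graph.add_node('raw.fits', lfn='raw.fits', pfn='file:///data/raw.fits')    # input file node
graph.add_node(1, exec_name='process', exec_args='raw.fits result.fits')   # task node
graph.add_node('result.fits', lfn='result.fits', streams=3)                # output file; bit 1 = stdout, bit 2 = stderr
graph.add_edge('raw.fits', 1)      # file -> task: job input
graph.add_edge(1, 'result.fits')   # task -> file: job output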