Example #1
    def _clipr(self, clip_to, reads, tag):
        anchor = self._compute_clip_seed(self._read_length)

        clip_reads = Job(name='clipR')
        clip_reads.invoke('all', self._state_update % 'Generate new splice candidates')

        seed = 'F%s' % self._clip_seed
        mismatches = self._clip_mismatches

        # Input files
        prefix = self._get_index_hash(self._read_length)
        fa = File('h%s/%s.fa' % (prefix, clip_to.upper()))
        reads_txt = File('%s_%s_reads.txt' % (tag, clip_to.lower()))

        for i in self._range():
            # Input files
            reads_i = File(reads % i)

            # Output files
            file_type = 'sam'
            path, file_name, ext = GTFAR._get_filename_parts(reads_i.name)
            sam_mapping = File('%s_A_%d_%d_%d_%s.%s' % (clip_to.upper(), self._clip_seed, mismatches, anchor, file_name, file_type))
            fastq_out = File('%s_miss_%s%s' % (file_name, clip_to, ext))

            # Uses
            clip_reads.uses(reads_i, link=Link.INPUT)
            clip_reads.uses(fastq_out, link=Link.OUTPUT, transfer=False, register=False)
            clip_reads.uses(sam_mapping, link=Link.OUTPUT, transfer=False, register=False)

        # Output files
        log = File('%s_%s.log' % (tag, clip_to.lower()))

        # Arguments
        clip_reads.addArguments(fa, reads_txt, '--seed %s' % seed, '--anchorL %d' % anchor, '-e', '-v %d' % mismatches)
        clip_reads.addArguments('-s', '-u', '--noSamHeader', '--ignoreDummyR 40', '--ignoreRepeatR 15')

        clip_reads.setStdout(log)

        # Uses
        clip_reads.uses(fa, link=Link.INPUT)
        clip_reads.uses(reads_txt, link=Link.INPUT)
        clip_reads.uses(log, link=Link.OUTPUT, transfer=False, register=False)

        self.adag.addJob(clip_reads)
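
Both this example and the next call GTFAR._get_filename_parts, which is not shown here. The sketch below is a hypothetical reconstruction of that helper, inferred from how its results are used; its actual implementation may differ. The extension must keep its leading dot, since fastq_out concatenates it directly.

    import os

    def _get_filename_parts(name):
        # Hypothetical reconstruction: split a file name into its directory,
        # bare stem, and extension. os.path.splitext keeps the leading dot on
        # ext, matching names built as '%s_miss_%s%s' % (file_name, tag, ext).
        path, base = os.path.split(name)
        file_name, ext = os.path.splitext(base)
        return path, file_name, ext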
Example #2
    def _perm(self, index_type, map_to, reads, tag, output_sam=False):
        perm = Job(name='perm')
        perm.invoke('all', self._state_update % ('Map reads to %s' % map_to.capitalize()))

        # Input files
        hash_v = self._get_index_hash(self._read_length, 'F%d' % self._seed)
        index = File('h%d_%s_F%d_%d.index' % (hash_v, map_to, self._seed, self._read_length))
        reads_txt = File('%s_%s_reads.txt' % (tag, map_to.lower()))

        for i in self._range():
            # Input files
            reads_i = File(reads % i)

            # Output files
            file_type = 'sam' if output_sam else 'mapping'
            path, file_name, ext = GTFAR._get_filename_parts(reads_i.name)
            sam_mapping = File('%s_B_%d_%d_%s.%s' % (map_to.upper(), self._seed, self._mismatches, file_name, file_type))
            fastq_out = File('%s_miss_%s%s' % (file_name, map_to, ext))

            # Uses
            perm.uses(reads_i, link=Link.INPUT)
            perm.uses(fastq_out, link=Link.OUTPUT, transfer=False, register=False)
            perm.uses(sam_mapping, link=Link.OUTPUT, transfer=False, register=False)

        # Output files
        log = File('%s_%s.log' % (tag, map_to.upper()))

        # Arguments
        perm.addArguments(index, reads_txt, '--seed F%d' % self._seed, '-v %d' % self._mismatches, '-B', '--printNM')
        perm.addArguments('-u', '-s', '-T %d' % self._read_length)

        if output_sam:
            perm.addArguments('--noSamHeader', '--outputFormat', 'sam')

        perm.setStdout(log)

        # Uses
        perm.uses(index, link=Link.INPUT)
        perm.uses(reads_txt, link=Link.INPUT)
        perm.uses(log, link=Link.OUTPUT, transfer=False, register=False)

        self.adag.addJob(perm)
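
A hypothetical invocation of _perm, assuming reads is a printf-style template that the self._range() loop indexes into per-chunk FASTQ names; every value below is invented for illustration. Note that the index_type parameter is not referenced in the body shown.

    # All values here are assumptions; 'sample_%d.fastq' expands to
    # sample_0.fastq, sample_1.fastq, ... inside the self._range() loop.
    self._perm('gene', 'genome', 'sample_%d.fastq', 'iter0', output_sam=True)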
Example #3
    @staticmethod
    def cat(inputs, output, o_link=Link.OUTPUT, o_transfer=False, o_register=False):
        cat = Job(name='merge')

        # Outputs
        output = File(output)

        for input_file in inputs:
            # Inputs
            input_file = File(input_file)

            # Arguments
            cat.addArguments(input_file)

            # Uses
            cat.uses(input_file, link=Link.INPUT)

        cat.setStdout(output)
        cat.uses(output, link=o_link, transfer=o_transfer, register=o_register)

        return cat
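
A hypothetical way this helper could be wired up, merging the per-chunk .vis files that _parse_alignment collects in self._vis_files (see Example #6); the output name and transfer setting are assumptions.

    # Hypothetical usage: concatenate all collected .vis files into a
    # single output that is transferred back to the submit host.
    merge = GTFAR.cat(self._vis_files, '%s.vis' % self._prefix, o_transfer=True)
    self.adag.addJob(merge)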
Example #4
    def _transcript_prediction(self):
        transcript_prediction = Job(name='transcript_prediction')
        transcript_prediction.invoke('all', self._state_update % 'Transcript Prediction')

        # Input files
        features_counts = File('%s.feature.cnts' % self._prefix)
        gtf = File('%s.splice_candidates.gtf' % self._prefix)

        # Output files
        transcript_counts = File('%s.transcripts.cnts' % self._prefix)

        # Arguments
        transcript_prediction.addArguments(features_counts, '-g', gtf)
        transcript_prediction.setStdout(transcript_counts)

        # Uses
        transcript_prediction.uses(features_counts, link=Link.INPUT)
        transcript_prediction.uses(gtf, link=Link.INPUT)
        transcript_prediction.uses(transcript_counts, link=Link.OUTPUT, transfer=True, register=False)

        self.adag.addJob(transcript_prediction)
Example #5
    def _parse_clipped_alignment(self, input_file):
        parse_clipped_alignment = Job(name='parse_clipped_alignment')
        parse_clipped_alignment.invoke('all', self._state_update % 'Parse clipped alignment')

        # Input files
        input_file = File(input_file)

        # Output files
        info = File('%s.info' % input_file.name)

        self._info_files.append(info.name)

        # Arguments
        parse_clipped_alignment.addArguments(input_file)
        parse_clipped_alignment.setStdout(info)

        # Uses
        parse_clipped_alignment.uses(input_file, link=Link.INPUT)
        parse_clipped_alignment.uses(info, link=Link.OUTPUT, transfer=False, register=False)

        self.adag.addJob(parse_clipped_alignment)
Example #6
    def _parse_alignment(self, input_file, tag):
        parse_alignment = Job(name='parse_alignment')
        parse_alignment.invoke('all', self._state_update % 'Parse alignment')

        # Input files
        input_file = File(input_file)

        # Output files
        vis = File('%s.vis' % input_file.name)

        self._vis_files.append(vis.name)

        # Arguments
        parse_alignment.addArguments(input_file, '--strandRule', self._strand_rule, '--tag', tag)
        parse_alignment.setStdout(vis)

        # Uses
        parse_alignment.uses(input_file, link=Link.INPUT)
        parse_alignment.uses(vis, link=Link.OUTPUT, transfer=False, register=False)

        self.adag.addJob(parse_alignment)
Example #7
    def write(self, filename, name='dax'):
        """Generate Pegasus abstract workflow (DAX).

        Parameters
        ----------
        filename : `str`
            File to write the DAX to.
        name : `str`, optional
            Name of the DAX.

        Returns
        -------
        `Pegasus.ADAG`
            Abstract workflow used by Pegasus' planner.
        """
        dax = ADAG(name)

        # Add files to DAX-level replica catalog.
        catalog = {}
        for file_id in self.files:
            attrs = self.graph.node[file_id]
            f = File(attrs['lfn'])

            # Add physical file names, if any.
            urls = attrs.get('urls')
            if urls is not None:
                urls = urls.split(',')
                sites = attrs.get('sites')
                if sites is None:
                    # Default to one 'local' site per URL.
                    sites = len(urls) * ['local']
                else:
                    sites = sites.split(',')
                for url, site in zip(urls, sites):
                    f.addPFN(PFN(url, site))

            catalog[attrs['lfn']] = f
            dax.addFile(f)

        # Add jobs to the DAX.
        for task_id in self.tasks:
            attrs = self.graph.node[task_id]
            job = Job(name=attrs['name'], id=task_id)

            # Add job command line arguments replacing any file name with
            # respective Pegasus file object.
            args = attrs.get('args')
            if args:
                # Replace each argument that names a cataloged file with the
                # corresponding Pegasus file object; repeated LFNs are all
                # substituted, not just their first occurrence.
                args = [catalog.get(arg, arg) for arg in args.split()]
                job.addArguments(*args)

            # Specify job's inputs.
            for file_id in self.graph.predecessors(task_id):
                attrs = self.graph.node[file_id]
                f = catalog[attrs['lfn']]
                job.uses(f, link=Link.INPUT)

            # Specify job's outputs
            for file_id in self.graph.successors(task_id):
                attrs = self.graph.node[file_id]
                f = catalog[attrs['lfn']]
                job.uses(f, link=Link.OUTPUT)

                streams = attrs.get('streams')
                if streams is not None:
                    if streams & 1 != 0:
                        job.setStdout(f)
                    if streams & 2 != 0:
                        job.setStderr(f)

            dax.addJob(job)

        # Add job dependencies to the DAX.
        for task_id in self.tasks:
            parents = set()
            for file_id in self.graph.predecessors(task_id):
                parents.update(self.graph.predecessors(file_id))
            for parent_id in parents:
                dax.depends(parent=dax.getJob(parent_id),
                            child=dax.getJob(task_id))

        # Finally, write down the workflow in DAX format.
        with open(filename, 'w') as f:
            dax.writeXML(f)

        return dax
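
The streams attribute is read as a bitmask: bit 0 routes the job's stdout to the file, bit 1 its stderr. A minimal sketch of tagging a file node this way, assuming self.graph is a networkx DiGraph; the node id and LFN are invented.

    import networkx as nx

    graph = nx.DiGraph()
    # streams=3 sets both bits, so this file captures stdout and stderr.
    graph.add_node('task.log', lfn='task.log', streams=1 | 2)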
Example #8
    def write_dax(self, filename='workflow.dax', name='workflow'):
        """Generate Pegasus abstract workflow (DAX).

        Parameters
        ----------
        filename : `str`
            File to write the DAX to.
        name : `str`, optional
            Name of the DAX.

        Returns
        -------
        `Pegasus.ADAG`
            Abstract workflow used by Pegasus' planner.

        Raises
        ------
        `ValueError`
            If either task or file node is missing mandatory attribute.
        """
        dax = ADAG(name)

        # Process file nodes.
        for file_id in self.files:
            attrs = self.graph.node[file_id]
            try:
                lfn = attrs['lfn']
            except KeyError:
                msg = 'Mandatory attribute "{}" is missing.'
                raise ValueError(msg.format('lfn'))
            file_ = File(lfn)

            # Add physical file names, if any.
            urls = attrs.get('pfn')
            if urls is not None:
                urls = urls.split(',')
                sites = attrs.get('sites')
                if sites is None:
                    sites = len(urls) * ['condorpool']
                else:
                    # 'sites' is a comma-separated string, like 'pfn' above.
                    sites = sites.split(',')
                for url, site in zip(urls, sites):
                    file_.addPFN(PFN(url, site))

            self.catalog[lfn] = file_

        # Add jobs to the DAX.
        for task_id in self.tasks:
            attrs = self.graph.node[task_id]
            try:
                exec_name = attrs['exec_name']
            except KeyError:
                msg = 'Mandatory attribute "{}" is missing.'
                raise ValueError(msg.format('exec_name'))
            label = '{name}_{id}'.format(name=exec_name, id=task_id)
            job = Job(exec_name, id=task_id, node_label=label)

            # Add job command line arguments replacing any file name with
            # respective Pegasus file object.
            args = attrs.get('exec_args', '')
            if args:
                # Replace each argument that names a cataloged file with the
                # corresponding Pegasus file object.
                args = [self.catalog.get(arg, arg) for arg in args.split()]
                job.addArguments(*args)

            # Specify job's inputs.
            for file_id in self.graph.predecessors(task_id):
                attrs = self.graph.node[file_id]
                is_ignored = attrs.get('ignore', False)
                if not is_ignored:
                    file_ = self.catalog[attrs['lfn']]
                    job.uses(file_, link=Link.INPUT)

            # Specify job's outputs
            for file_id in self.graph.successors(task_id):
                attrs = self.graph.node[file_id]
                is_ignored = attrs.get('ignore', False)
                if not is_ignored:
                    file_ = self.catalog[attrs['lfn']]
                    job.uses(file_, link=Link.OUTPUT)

                    streams = attrs.get('streams')
                    if streams is not None:
                        if streams & 1 != 0:
                            job.setStdout(file_)
                        if streams & 2 != 0:
                            job.setStderr(file_)

            # Provide default files to store stderr and stdout, if not
            # specified explicitly.
            if job.stderr is None:
                file_ = File('{name}.err'.format(name=label))
                job.uses(file_, link=Link.OUTPUT)
                job.setStderr(file_)
            if job.stdout is None:
                file_ = File('{name}.out'.format(name=label))
                job.uses(file_, link=Link.OUTPUT)
                job.setStdout(file_)

            dax.addJob(job)

        # Add job dependencies to the DAX.
        for task_id in self.tasks:
            parents = set()
            for file_id in self.graph.predecessors(task_id):
                parents.update(self.graph.predecessors(file_id))
            for parent_id in parents:
                dax.depends(parent=dax.getJob(parent_id),
                            child=dax.getJob(task_id))

        # Finally, write down the workflow in DAX format.
        with open(filename, 'w') as f:
            dax.writeXML(f)

        return dax
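
The final loop derives job-to-job edges from the bipartite task/file graph: a task's parents are the producers of the files it consumes. A standalone sketch of that traversal, with invented node names:

    import networkx as nx

    g = nx.DiGraph()
    # t1 writes f1; t2 reads f1, so t1 must run before t2.
    g.add_edges_from([('t1', 'f1'), ('f1', 't2')])

    parents = set()
    for file_id in g.predecessors('t2'):
        parents.update(g.predecessors(file_id))
    assert parents == {'t1'}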