Beispiel #1
0
Datei: s2c.py Projekt: yigbt/uap
    def runs(self, run_ids_connections_files):
        """Declare one run per run ID that streams an alignments file through s2c.

        For every run the single file on 'in/alignments' is piped through
        ``cat | pigz --decompress | s2c | fix_s2c | pigz`` and the compressed
        result is registered on the 'alignments' connection as
        '<run_id>-cufflinks-compatible.sam.gz'.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises Exception if a run does not have exactly one alignments file
        and StepError if a configured 'tmp_dir' is missing or not writable.
        """
        for run_id, connections in run_ids_connections_files.items():
            with self.declare_run(run_id) as run:
                input_paths = connections['in/alignments']
                # Exactly one alignments file is supported per run.
                if len(input_paths) != 1:
                    raise Exception(
                        "Expected exactly one alignments file., but got this %s" %
                        input_paths)

                # Validate the temporary directory only if the user set one.
                if self.is_option_set_in_config('tmp_dir'):
                    tmp_dir = self.get_option('tmp_dir')
                    if not os.path.isdir(tmp_dir):
                        raise StepError(
                            self, "Directory %s not found" % tmp_dir)
                    if not os.access(tmp_dir, os.W_OK):
                        raise StepError(
                            self, "Directory %s not accessible." % tmp_dir)

                # Assemble the individual pipeline stages.
                read_cmd = [self.get_tool('cat'), input_paths[0]]
                decompress_cmd = [
                    self.get_tool('pigz'), '--decompress',
                    '--processes', str(self.get_cores()), '--stdout']
                s2c_cmd = [
                    self.get_tool('s2c'), '-s', '/dev/stdin',
                    '-o', self.get_option('tmp_dir')]
                if self.is_option_set_in_config('maxDist'):
                    s2c_cmd += ['-d', str(self.get_option('maxDist'))]
                # fix_s2c writes the .sam to stdout
                fix_cmd = [self.get_tool('fix_s2c')]
                compress_cmd = [
                    self.get_tool('pigz'),
                    '--processes', str(self.get_cores()), '--stdout']

                output_name = '%s-cufflinks-compatible.sam.gz' % run_id
                with run.new_exec_group() as exec_group:
                    with exec_group.add_pipeline() as pipeline:
                        for stage in (read_cmd, decompress_cmd,
                                      s2c_cmd, fix_cmd):
                            pipeline.add_command(stage)
                        # Only the final compressor writes to a file.
                        pipeline.add_command(
                            compress_cmd,
                            stdout_path=run.add_output_file(
                                'alignments', output_name, input_paths))
Beispiel #2
0
    def runs(self, run_ids_connections_files):
        """Declare runs that link, index and summarise one BAM file each.

        For every run ID the single file on 'in/alignments' is symlinked
        into the run's 'alignments' output, indexed with ``samtools index``
        (flag chosen by the 'index_type' option: 'bai' -> -b, 'csi' -> -c)
        and summarised with ``samtools idxstats`` into the 'index_stats'
        output.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if a run has more than one input file or the file
        does not end in '.bam'.
        """
        for run_id in run_ids_connections_files.keys():
            with self.declare_run(run_id) as run:
                input_paths = run_ids_connections_files[run_id][
                    "in/alignments"]
                # Add empty out connection if we have an empty in connection
                if input_paths == [None]:
                    run.add_empty_output_connection("alignments")
                    run.add_empty_output_connection("indices")
                # Fail if we haven't exactly one input file
                elif len(input_paths) != 1:
                    raise StepError(self,
                                    "Expected exactly one alignments file.")
                # Fail if the input is not a bam file
                elif os.path.splitext(input_paths[0])[1] not in ['.bam']:
                    raise StepError(
                        self, "The file %s seems not to be a BAM file. At "
                        "least the suffix is wrong." % input_paths[0])
                # Everything seems fine, lets start
                else:
                    input_bam = input_paths[0]
                    base = os.path.basename(input_bam)
                    # Read the option once; the original looked up the
                    # misspelled key 'index_tpye' in the csi branch.
                    index_type = self.get_option('index_type')

                    # 1. command: symlink the original BAM into the output.
                    # NOTE(review): the original comment asks for an
                    # absolute path; input_bam is used exactly as received.
                    with run.new_exec_group() as link_exgr:
                        bam_link = run.add_output_file('alignments', base,
                                                       input_paths)
                        ln = [self.get_tool('ln'), '-s', input_bam, bam_link]
                        link_exgr.add_command(ln)

                    # 2. command: index the linked BAM file
                    with run.new_exec_group() as index_exgr:
                        samtools_index = [self.get_tool('samtools'), 'index']
                        if index_type == 'bai':
                            samtools_index.append('-b')
                            run.add_output_file('indices', '%s.bai' % base,
                                                input_paths)
                        elif index_type == 'csi':
                            samtools_index.append('-c')
                            run.add_output_file('indices', '%s.csi' % base,
                                                input_paths)
                        samtools_index.append(bam_link)
                        index_exgr.add_command(samtools_index)

                    # 3. command: samtools idxstats on the linked BAM file
                    with run.new_exec_group() as idxstats_exgr:
                        samtools_idxstats = [
                            self.get_tool('samtools'), 'idxstats', bam_link
                        ]
                        idxstats_exgr.add_command(
                            samtools_idxstats,
                            stdout_path=run.add_output_file(
                                'index_stats', '%s_idxstats.txt' % base,
                                input_paths))
Beispiel #3
0
    def runs(self, run_ids_connections_files):
        """Declare runs converting one BAM file into BedGraph and bigWig.

        Each run pipes ``bedtools genomecov -bg -ibam <bam>`` through
        ``sort -k1,1 -k2,2n`` into '<run_id>.bg' (connection 'bedgraph') and
        then calls ``bedGraphToBigWig`` with the 'chromosome-sizes' option to
        write '<run_id>.bw' (connection 'bigwig').

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if 'chromosome-sizes' is not a file, if a set
        'temp-sort-dir' is not a directory, if a run does not have exactly
        one input file, or if that file does not end in '.bam'.
        """
        # Check if chromosome sizes points to a real file
        chromosome_sizes = self.get_option('chromosome-sizes')
        if not os.path.isfile(chromosome_sizes):
            raise StepError(
                self, "Value for option 'chromosome-sizes' is not a "
                "file: %s" % chromosome_sizes)
        # NOTE(review): 'temp-sort-dir' is validated here but is not handed
        # to the sort command below -- confirm whether that is intended.
        temp_sort_dir = self.get_option('temp-sort-dir')
        if temp_sort_dir and not os.path.isdir(temp_sort_dir):
            raise StepError(
                self, "Value for option 'temp-sort-dir' is not a "
                "directory: %s" % temp_sort_dir)

        for run_id in run_ids_connections_files.keys():
            with self.declare_run(run_id) as run:
                # Collect input paths
                input_paths = run_ids_connections_files[run_id][
                    "in/alignments"]
                # No input file: register empty outputs and stop here. The
                # original fell through and crashed on basename(None).
                # NOTE(review): the original registered "alignments"; this
                # method only writes 'bedgraph' and 'bigwig' -- confirm
                # against the step's declared out connections.
                if input_paths == [None]:
                    run.add_empty_output_connection("bedgraph")
                    run.add_empty_output_connection("bigwig")
                    continue
                if len(input_paths) != 1:
                    raise StepError(self,
                                    "Expected exactly one alignments file.")

                ext = os.path.splitext(os.path.basename(input_paths[0]))[1]
                if ext != '.bam':
                    raise StepError(
                        self, "The file %s does not appear to be any "
                        "of bam.gz, bam.gzip, or bam" % input_paths[0])

                bedgraph_file = run.add_output_file('bedgraph',
                                                    '%s.bg' % run_id,
                                                    input_paths)
                bigwig_file = run.add_output_file('bigwig', '%s.bw' % run_id,
                                                  input_paths)
                # Start creation of BedGraph files
                with run.new_exec_group() as bedgraph_group:
                    with bedgraph_group.add_pipeline() as pipe:
                        # BAM -> BedGraph
                        # (necessary for bedGraph, bigWig)
                        genomecov = [self.get_tool('bedtools'), 'genomecov',
                                     '-bg', '-ibam']
                        genomecov.extend(input_paths)
                        pipe.add_command(genomecov)

                        sort = [self.get_tool('sort'), '-k1,1', '-k2,2n']
                        pipe.add_command(sort, stdout_path=bedgraph_file)
                # BedGraph -> bigWig, run after the BedGraph file exists
                with run.new_exec_group() as bigwig_group:
                    bedgraph_to_bigwig = [
                        self.get_tool('bedGraphToBigWig'), bedgraph_file,
                        chromosome_sizes, bigwig_file
                    ]
                    bigwig_group.add_command(bedgraph_to_bigwig)
Beispiel #4
0
    def runs(self, run_ids_connections_files):
        """Declare runs that reverse-complement one fastx file each.

        The input on 'in/fastx' is decompressed with pigz (suffix '.gz' or
        '.gzip') or read with cat, piped into ``fastx_reverse_complement -z``
        and written to the 'fastx' connection. The output name is the
        'name_sheme' option formatted with the run ID plus '.fastq.gz'; a
        set 'prefix' option overrides it with '<prefix>_%s_R1'.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if the name scheme cannot be formatted with a
        single string or if a run has more than one input file.
        """
        run_id_sheme = self.get_option('name_sheme')
        prefix = self.get_option('prefix')
        if prefix:
            run_id_sheme = '%s_%%s_R1' % prefix
            logger.warning("[%s] The 'prefix' option is deprecaded in favor "
                           "of the 'name_sheme' option. The set pefix '%s' is "
                           "converted to 'name_sheme: %s'" %
                           (self, prefix, run_id_sheme))
        # Probe the scheme once. %-formatting raises TypeError for a wrong
        # argument count/type and ValueError for a malformed format string.
        try:
            _ = run_id_sheme % ''
        except (TypeError, ValueError) as e:
            raise StepError(
                self,
                'Could not parse name_sheme "%s": %s' % (run_id_sheme, e))

        for run_id in run_ids_connections_files.keys():
            with self.declare_run(run_id) as run:
                input_paths = run_ids_connections_files[run_id]['in/fastx']
                # No input: register an empty output and skip the pipeline.
                # The original went on and used an unbound 'is_gzipped'.
                # NOTE(review): the original registered "alignments"; this
                # method only writes to 'fastx' -- confirm against the
                # step's declared out connections.
                if input_paths == [None]:
                    run.add_empty_output_connection("fastx")
                    continue
                if len(input_paths) != 1:
                    raise StepError(self,
                                    "Expected exactly one alignments file.")

                is_gzipped = os.path.splitext(input_paths[0])[1] \
                    in ['.gz', '.gzip']

                out = run.add_output_file("fastx",
                                          run_id_sheme % run_id + '.fastq.gz',
                                          input_paths)

                with run.new_exec_group() as exec_group:
                    with exec_group.add_pipeline() as pipe:
                        # 1. command: stream the (uncompressed) input
                        if is_gzipped:
                            reader = [
                                self.get_tool('pigz'), '--decompress',
                                '--processes', '1', '--stdout'
                            ]
                        else:
                            reader = [self.get_tool('cat')]
                        reader.extend(input_paths)
                        pipe.add_command(reader)

                        # 2. command: reverse complement, gzip output (-z)
                        fastx_revcom = [
                            self.get_tool('fastx_reverse_complement'), '-z'
                        ]
                        pipe.add_command(fastx_revcom, stdout_path=out)
    def runs(self, run_ids_connections_files):
        """Declare runs calling ``preseq gc_extrap`` on one BAM/BED file each.

        Options from the fixed list below that are set in the configuration
        are forwarded as '-<name> <value>'; boolean options are forwarded as
        a bare '-<name>' flag only when true. The result is written to
        '<run_id>_future_genome_coverage.txt' on the
        'future_genome_coverage' connection.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if a run has more than one input file or the file
        ends in neither '.bam' nor '.bed'.
        """
        options = [
            'max_width', 'bin_size', 'extrap', 'step', 'bootstraps', 'cval',
            'terms', 'quick'
        ]

        # Translate the configured options into command line arguments.
        option_list = []
        for option in options:
            if not self.is_option_set_in_config(option):
                continue
            value = self.get_option(option)
            if isinstance(value, bool):
                if value:
                    option_list.append('-%s' % option)
            else:
                option_list.extend(['-%s' % option, str(value)])

        for run_id in run_ids_connections_files.keys():

            with self.declare_run(run_id) as run:
                input_paths = run_ids_connections_files[run_id][
                    "in/alignments"]

                # Validate the input list before touching input_paths[0];
                # the original inspected the suffix first and crashed on
                # [None] or an empty list.
                # NOTE(review): the original registered "complexity_curve"
                # and "future_yield"; this method only writes to
                # 'future_genome_coverage' -- confirm against the step's
                # declared out connections.
                if input_paths == [None]:
                    run.add_empty_output_connection("future_genome_coverage")
                    continue
                if len(input_paths) != 1:
                    raise StepError(self,
                                    "Expected exactly one alignments file.")

                suffix = os.path.splitext(input_paths[0])[1]
                is_bam = suffix == '.bam'
                is_bed = suffix == '.bed'
                if not is_bam and not is_bed:
                    raise StepError(
                        self, "Input file %s is niether BAM nor BED." %
                        input_paths[0])

                with run.new_exec_group() as gc_group:
                    gc_extrap_out = run.add_output_file(
                        'future_genome_coverage',
                        '%s_future_genome_coverage.txt' % run_id,
                        input_paths)
                    gc_extrap = [self.get_tool('preseq'), 'gc_extrap']
                    gc_extrap.extend(option_list)
                    if is_bed:
                        gc_extrap.append('-bed')
                    gc_extrap.extend(['-o', gc_extrap_out, input_paths[0]])
                    gc_group.add_command(gc_extrap)
Beispiel #6
0
    def declare_runs(self):
        """Group files matching the 'pattern' glob into runs by sample ID.

        The base name of every globbed file is matched against the 'group'
        regex; the captured groups, optionally preceded by the
        'sample_id_prefix' option, are joined with '_' to form the sample
        ID. One run is declared per sample ID, carrying the 'paired_end'
        option as public info and each file on the 'raws' connection.

        Raises StepError if a globbed file name does not match the regex.
        """
        group_regex = re.compile(self.get_option('group'))
        files_by_sample = {}

        # Collect the files of every sample.
        for path in glob.glob(os.path.abspath(self.get_option('pattern'))):
            file_name = os.path.basename(path)
            hit = group_regex.match(file_name)
            if hit is None:
                raise StepError(
                    self, "Couldn't match regex /%s/ to file %s." %
                    (self.get_option('group'), file_name))

            id_parts = list(hit.groups())
            if self.is_option_set_in_config('sample_id_prefix'):
                id_parts.insert(0, self.get_option('sample_id_prefix'))

            files_by_sample.setdefault('_'.join(id_parts), []).append(path)

        # One run per sample, one output file per collected path.
        for run_id, sample_paths in files_by_sample.items():
            with self.declare_run(run_id) as run:
                run.add_public_info("paired_end",
                                    self.get_option("paired_end"))
                for sample_path in sample_paths:
                    run.add_output_file("raws", sample_path, [])
    def runs(self, run_ids_connections_files):
        """Declare runs that filter one alignments file each.

        The input is read with ``dd``, decompressed with pigz when its
        suffix is '.gz' or '.gzip', converted with ``samtools view -h`` and
        passed to ``discardLargeSplitsAndPairs``, which receives the
        'N_splits' and 'M_mates' options and writes
        '<run_id>.reduced.sam' ('alignments'), '<run_id>.discarded.sam'
        ('log') and '<run_id>.statistics.txt' ('stats').

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if a run has more than one input file.
        """
        for run_id in run_ids_connections_files.keys():

            with self.declare_run(run_id) as run:
                input_paths = run_ids_connections_files[run_id][
                    "in/alignments"]
                # No input: register an empty output and skip the pipeline.
                # The original continued with an unbound 'is_gzipped' and a
                # None input path.
                if input_paths == [None]:
                    run.add_empty_output_connection("alignments")
                    continue
                if len(input_paths) != 1:
                    raise StepError(self,
                                    "Expected exactly one alignments file.")

                is_gzipped = os.path.splitext(input_paths[0])[1] \
                    in ['.gz', '.gzip']

                with run.new_exec_group() as exec_group:

                    with exec_group.add_pipeline() as pipe:
                        # 1. command: Read file with dd (input block size 2M)
                        dd_in = [
                            self.get_tool('dd'), 'ibs=2M',
                            'if=%s' % input_paths[0]
                        ]
                        pipe.add_command(dd_in)

                        # 1.1 command: Uncompress the stream if necessary
                        if is_gzipped:
                            pigz = [
                                self.get_tool('pigz'), '--decompress',
                                '--processes',
                                str(self.get_cores()), '--stdout'
                            ]
                            pipe.add_command(pigz)

                        # 1.2 call samtools to handle also .bam files
                        samtools_view = [
                            self.get_tool('samtools'), 'view', '-h', '-'
                        ]
                        pipe.add_command(samtools_view)

                        # 2. command: Process sam file
                        # create the names of the out connections
                        logfile = run.add_output_file(
                            'log', '%s.discarded.sam' % run_id, input_paths)
                        statsfile = run.add_output_file(
                            'stats', '%s.statistics.txt' % run_id, input_paths)
                        outfile = run.add_output_file(
                            'alignments', '%s.reduced.sam' % run_id,
                            input_paths)
                        # Option values are converted with str() so the
                        # command list holds strings only, as in the other
                        # option-forwarding steps.
                        discard_cmd = [
                            self.get_tool('discardLargeSplitsAndPairs'),
                            '--N_splits',
                            str(self.get_option('N_splits')), '--M_mates',
                            str(self.get_option('M_mates')), '--statsfile',
                            statsfile, '--logfile', logfile, '-', outfile
                        ]
                        pipe.add_command(discard_cmd)
Beispiel #8
0
    def runs(self, run_ids_connections_files):
        """Declare runs computing ``samtools stats`` for one file each.

        The input on 'in/alignments' is read with ``dd`` (block size from
        the 'dd-blocksize' option) and piped into ``samtools stats``; the
        output goes to '<input base name>.bam.stats' on the 'stats'
        connection.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if a run has more than one input file.
        """
        for run_id in run_ids_connections_files.keys():
            # Get input alignments
            input_paths = run_ids_connections_files[run_id]['in/alignments']

            # The run must be declared before it is used; the original
            # referenced 'run' ahead of this 'with' statement.
            with self.declare_run(run_id) as run:
                # No input: register an empty output and skip the pipeline.
                # NOTE(review): the original registered "alignments"; this
                # method only writes to 'stats' -- confirm against the
                # step's declared out connections.
                if input_paths == [None]:
                    run.add_empty_output_connection("stats")
                    continue
                if len(input_paths) != 1:
                    raise StepError(
                        self, "Expected exactly one alignments file.")

                input_path = input_paths[0]
                basename = os.path.splitext(
                    os.path.basename(input_path))[0]

                with run.new_exec_group().add_pipeline() as pipe:
                    # Read input alignments
                    dd = [
                        self.get_tool('dd'),
                        'ibs=%s' % self.get_option('dd-blocksize'),
                        'if=%s' % input_path
                    ]
                    pipe.add_command(dd)
                    # Assemble samtools stats command; the input files are
                    # passed as a list, like every other add_output_file
                    # call in these steps.
                    samtools = [self.get_tool('samtools'), 'stats']
                    pipe.add_command(samtools,
                                     stdout_path=run.add_output_file(
                                         'stats', basename + '.bam.stats',
                                         input_paths))
Beispiel #9
0
    def runs(self, run_ids_connections_files):
        """Declare runs calling ``preseq c_curve`` on one BAM/BED file each.

        Options from the fixed list below that are set in the configuration
        are forwarded as '-<name> <value>'; boolean options are forwarded as
        a bare '-<name>' flag only when true. '-bam' is added for '.bam'
        input. The result is written to '<run_id>_complexity_output.txt' on
        the 'complexity_curve' connection.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if a run has more than one input file or the file
        ends in neither '.bam' nor '.bed'.
        """
        options = ['step', 'verbose', 'pe', 'hist', 'vals', 'seg_len']

        # Translate the configured options into command line arguments.
        option_list = []
        for option in options:
            if not self.is_option_set_in_config(option):
                continue
            value = self.get_option(option)
            if isinstance(value, bool):
                if value:
                    option_list.append('-%s' % option)
            else:
                option_list.extend(['-%s' % option, str(value)])

        for run_id in run_ids_connections_files.keys():

            with self.declare_run(run_id) as run:
                input_paths = run_ids_connections_files[run_id][
                    "in/alignments"]

                # Validate the input list before touching input_paths[0];
                # the original inspected the suffix first and crashed on
                # [None] or an empty list.
                if input_paths == [None]:
                    run.add_empty_output_connection("complexity_curve")
                    continue
                if len(input_paths) != 1:
                    raise StepError(self,
                                    "Expected exactly one alignments file.")

                suffix = os.path.splitext(input_paths[0])[1]
                is_bam = suffix == '.bam'
                is_bed = suffix == '.bed'
                if not is_bam and not is_bed:
                    raise StepError(
                        self, "Input file %s is niether BAM nor BED." %
                        input_paths[0])

                with run.new_exec_group() as cc_group:
                    c_curve_out = run.add_output_file(
                        'complexity_curve',
                        '%s_complexity_output.txt' % run_id, input_paths)
                    c_curve = [self.get_tool('preseq'), 'c_curve']
                    c_curve.extend(option_list)
                    if is_bam:
                        c_curve.append('-bam')
                    c_curve.extend(['-o', c_curve_out, input_paths[0]])
                    cc_group.add_command(c_curve)
Beispiel #10
0
    def runs(self, run_ids_connections_files):
        """Declare one run per sample and register its raw files.

        Samples come from one of two sources:
        * 'group' and 'pattern' both set: files matching the 'pattern' glob
          are grouped by the captures of the 'group' regex (optionally
          preceded by 'sample_id_prefix'), joined with '_'.
        * otherwise 'sample_to_files_map': a mapping of run ID to paths,
          each of which must be an existing file.

        Every path is added to the 'raw' connection of its run.

        Raises StepError if a file name does not match the regex, a mapped
        path is not a file, or neither option set is configured.
        """
        # Maps each run ID to the files that belong to it.
        files_by_run = {}

        if self.is_option_set_in_config('group') and \
           self.is_option_set_in_config('pattern'):
            group_regex = re.compile(self.get_option('group'))

            # Group every file found by the glob according to the regex
            # captures of its base name.
            for path in glob.glob(os.path.abspath(self.get_option('pattern'))):
                file_name = os.path.basename(path)
                hit = group_regex.match(file_name)
                if hit is None:
                    raise StepError(self, "Couldn't match regex /%s/ to file %s."
                                   % (self.get_option('group'), file_name))

                id_parts = list(hit.groups())
                if self.is_option_set_in_config('sample_id_prefix'):
                    id_parts.insert(0, self.get_option('sample_id_prefix'))

                files_by_run.setdefault('_'.join(id_parts), []).append(path)

        elif self.is_option_set_in_config('sample_to_files_map'):
            sample_map = self.get_option('sample_to_files_map')
            for sample, sample_paths in sample_map.items():
                # Every listed path has to exist as a regular file.
                for candidate in sample_paths:
                    if not os.path.isfile(candidate):
                        raise StepError(self, "[raw_file_source]: %s is no file. "
                                       "Please provide correct path." % candidate)
                files_by_run[sample] = sample_paths

        else:
            raise StepError(self,
                "[raw_file_source]: Either 'group' AND 'pattern'"
                " OR 'sample_to_files_map' options have to be set. ")

        # One run per sample, one output file per path.
        for sample, sample_paths in files_by_run.items():
            with self.declare_run(sample) as run:
                for raw_path in sample_paths:
                    run.add_output_file("raw", raw_path, [])
Beispiel #11
0
    def runs(self, run_ids_connections_files):
        """Declare runs that write a count report for one alignments file.

        The pipeline is ``samtools view -S`` (fed by ``pigz --decompress``
        for '.gz'/'.gzip' input, otherwise reading the file directly)
        followed by ``cut -f 2,3,4 | cut -f 1 -d '|' | grep -v '*' |
        cut -f 1 -d '_' | sort | uniq -c | sort``. The result is written to
        '<run_id>_R1-rRNA_count.txt' on the 'report_rRNA' connection.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if a run has more than one input file.
        """
        for run_id in run_ids_connections_files.keys():

            with self.declare_run(run_id) as run:
                input_paths = run_ids_connections_files[run_id][
                    "in/alignments"]
                # No input: register an empty output and skip the pipeline.
                # The original continued with an unbound 'is_gzipped'.
                # NOTE(review): the original registered "alignments"; this
                # method only writes to 'report_rRNA' -- confirm against
                # the step's declared out connections.
                if input_paths == [None]:
                    run.add_empty_output_connection("report_rRNA")
                    continue
                if len(input_paths) != 1:
                    raise StepError(self,
                                    "Expected exactly one alignments file.")

                is_gzipped = os.path.splitext(input_paths[0])[1] \
                    in ['.gz', '.gzip']

                out = run.add_output_file(
                    "report_rRNA", "%s_%s-rRNA_count.txt" % (run_id, 'R1'),
                    input_paths)

                samtools = [self.get_tool('samtools'), 'view', '-S']

                with run.new_exec_group() as exec_group:
                    with exec_group.add_pipeline() as pipe:
                        if is_gzipped:
                            # 1. command: decompress to stdout (no fifo)
                            pigz = [
                                self.get_tool('pigz'), '--decompress',
                                '--processes', '1', '--stdout'
                            ]
                            pigz.extend(input_paths)
                            pipe.add_command(pigz)
                            # samtools reads the decompressed stream
                            samtools.append('-')
                        else:
                            # samtools reads the file directly
                            samtools.extend(input_paths)
                        # 2. command: print the alignment records
                        pipe.add_command(samtools)

                        # 3. reduce each record to one field and count the
                        # distinct values
                        cuta = [self.get_tool('cut'), '-f', '2,3,4']
                        pipe.add_command(cuta)
                        cutb = [self.get_tool('cut'), '-f', '1', '-d', '|']
                        pipe.add_command(cutb)
                        # drop lines containing a literal '*'
                        grep = [self.get_tool('grep'), '-v', '*']
                        pipe.add_command(grep)

                        cutc = [self.get_tool('cut'), '-f', '1', '-d', '_']
                        pipe.add_command(cutc)

                        # uniq -c only merges adjacent lines, so sort first
                        sorta = [self.get_tool('sort')]
                        pipe.add_command(sorta)

                        uniq = [self.get_tool('uniq'), '-c']
                        pipe.add_command(uniq)
                        sortb = [self.get_tool('sort')]
                        pipe.add_command(sortb, stdout_path=out)
Beispiel #12
0
    def runs(self, run_ids_connections_files):
        """Declare runs converting one alignments file to gzipped FASTQ.

        ``samtools fastq`` reads either the stream produced by
        ``pigz --decompress`` (input suffix '.gz' or '.gzip') or the input
        file itself; its output is compressed with ``pigz --fast`` into
        '<run_id>_R1-samto.fastq.gz' on the 'first_read' connection. The
        options 'f', 'F' and 'addF' are forwarded as '-f', '-F' and a
        further '-F' when set in the configuration.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if a run has more than one input file.
        """
        for run_id in run_ids_connections_files.keys():

            with self.declare_run(run_id) as run:
                input_paths = run_ids_connections_files[run_id][
                    "in/alignments"]
                # No input: register an empty output and skip the pipeline.
                # The original continued with an unbound 'is_gzipped'.
                # NOTE(review): the original registered "alignments"; this
                # method only writes to 'first_read' -- confirm against the
                # step's declared out connections.
                if input_paths == [None]:
                    run.add_empty_output_connection("first_read")
                    continue
                if len(input_paths) != 1:
                    raise StepError(self,
                                    "Expected exactly one alignments file.")

                is_gzipped = os.path.splitext(input_paths[0])[1] \
                    in ['.gz', '.gzip']

                out = run.add_output_file(
                    "first_read", "%s_%s-samto.fastq.gz" % (run_id, 'R1'),
                    input_paths)

                with run.new_exec_group() as exec_group:
                    with exec_group.add_pipeline() as pipe:
                        # 1. command: decompress to stdout (no fifo)
                        if is_gzipped:
                            pigz = [
                                self.get_tool('pigz'), '--decompress',
                                '--processes', '1', '--stdout'
                            ]
                            pigz.extend(input_paths)
                            pipe.add_command(pigz)

                        # 2. command: Convert to fastq
                        samtools = [self.get_tool('samtools'), 'fastq']

                        if self.is_option_set_in_config('f'):
                            samtools.extend(
                                ['-f', str(self.get_option('f'))])

                        if self.is_option_set_in_config('F'):
                            samtools.extend(
                                ['-F', str(self.get_option('F'))])

                        if self.is_option_set_in_config('addF'):
                            samtools.extend(
                                ['-F', str(self.get_option('addF'))])

                        # The original only built the conversion for
                        # gzipped input, leaving the pipeline empty
                        # otherwise; uncompressed files are now read
                        # directly by samtools.
                        if is_gzipped:
                            samtools.append('-')
                        else:
                            samtools.extend(input_paths)
                        pipe.add_command(samtools)

                        # 3. command: compress and save the fastq file
                        pigzc = [
                            self.get_tool('pigz'), '--processes', '2',
                            '--fast', '-'
                        ]
                        pipe.add_command(pigzc, stdout_path=out)
Beispiel #13
0
    def runs(self, run_ids_connections_files):
        """Declare runs that build a BWA index from one reference sequence.

        The run (and the index prefix passed to ``bwa index -p``) is named
        '<index-basename option>-<run_id>'. The files '<prefix>.amb',
        '.ann', '.bwt', '.pac' and '.sa' are registered on the 'bwa_index'
        connection.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if no reference sequence was received or more
        than one file was given.
        """
        # File name templates of the index files registered per run.
        index_file_templates = ('%s.amb', '%s.ann', '%s.bwt', '%s.pac',
                                '%s.sa')

        for run_id in run_ids_connections_files.keys():
            # Name of the run and prefix of all index files
            index_basename = "%s-%s" % (
                self.get_option('index-basename'), run_id)

            with self.declare_run(index_basename) as run:
                with run.new_exec_group() as exec_group:
                    refseq = run_ids_connections_files[run_id][
                        'in/reference_sequence']

                    if refseq == [None]:
                        raise StepError(
                            self, "No reference sequence received.")
                    if len(refseq) != 1:
                        raise StepError(
                            self, "Reference sequence is not a single file.")

                    # bwa index -p <prefix> <reference sequence>
                    exec_group.add_command([
                        self.get_tool('bwa'), 'index',
                        '-p', index_basename,
                        refseq[0]])

                    for template in index_file_templates:
                        run.add_output_file(
                            'bwa_index', template % index_basename, refseq)
Beispiel #14
0
    def _getFastFormat(self, fast_file, is_gzipped):

        required_file_extensions = [
            '.fastq', '.fq', 'fnq', '.fasta', '.fa', '.fna'
        ]

        example_file = os.path.basename(fast_file)
        format_index = -2 if is_gzipped else -1
        fast_format = '.' + example_file.split('.')[format_index]

        if fast_format not in required_file_extensions:
            raise StepError(
                self, "File %s does not end with any "
                "expected suffix (%s). Please fix that issue." %
                (fast_file, ' | '.join(required_file_extensions)))

        fast_char = fast_format[-1]
        return fast_char
Beispiel #15
0
    def runs(self, run_ids_connections_files):
        """Declare runs calling ``multiBamSummary`` on the BAM files of a run.

        All files on 'in/alignments' are passed via '--bamfiles'; the result
        is written to '<run_id>.npz' on the 'read-coverage' connection. With
        a single input the label is the run ID, otherwise each file is
        labelled '<run_id>-<position in the input list>'. For the
        'BED-file' subcommand the 'bed-file' option is appended after
        '--BED'.

        run_ids_connections_files -- mapping of run ID to a mapping of
            connection name to a list of file paths.

        Raises StepError if an input is missing or does not end in '.bam'.
        """
        subcommand = self.get_option('subcommand')
        for run_id in run_ids_connections_files.keys():
            # Collect input_paths and labels for multiBamSummary
            input_paths = run_ids_connections_files[run_id]['in/alignments']
            multiple_inputs = len(input_paths) > 1
            labels = []
            # enumerate() gives each file its own position; the original
            # used list.index(), which is quadratic and repeats the label
            # when the same path occurs twice.
            for position, path in enumerate(input_paths):
                if path is None or not path.endswith(".bam"):
                    raise StepError(self, "Not a BAM file: %s" % path)
                if multiple_inputs:
                    labels.append("%s-%s" % (run_id, position))
                else:
                    labels.append(run_id)

            with self.declare_run(run_id) as run:
                # Let's compile the command
                with run.new_exec_group() as multi_bam_summary_eg:
                    # 1. multiBamSummary command
                    multi_bam_summary = [
                        self.get_tool('multiBamSummary'), subcommand
                    ]
                    # Append list of input BAM files
                    multi_bam_summary.append('--bamfiles')
                    multi_bam_summary.extend(input_paths)
                    # Append name of the output file
                    multi_bam_summary.append('--outFileName')
                    multi_bam_summary.append(
                        run.add_output_file('read-coverage', '%s.npz' % run_id,
                                            input_paths))
                    # Append list of BED files for BED-file subcommand
                    # NOTE(review): extend() assumes the 'bed-file' option
                    # is a list of paths -- a plain string would be split
                    # into characters; confirm the option's declared type.
                    if subcommand == "BED-file":
                        multi_bam_summary.append('--BED')
                        multi_bam_summary.extend(self.get_option('bed-file'))
                    # Append list of labels
                    multi_bam_summary.append('--labels')
                    multi_bam_summary.extend(labels)
                    # Append number of processors
                    multi_bam_summary.extend(
                        ['--numberOfProcessors',
                         str(self.get_cores())])

                    # Add multiBamSummary to execution group
                    multi_bam_summary_eg.add_command(multi_bam_summary)
Beispiel #16
0
    def runs(self, run_ids_connections_files):
        """Declare one tcount2gcount run per run that delivers counts.

        The annotation passed to the tool via '-m' is taken from the 'm'
        option if that is set in the configuration, otherwise from an
        'in/annotation' connection. Runs that carry an 'in/annotation'
        connection are not declared as runs of this step.

        :param run_ids_connections_files: mapping of run ID to a mapping of
            connection name to a list of file paths
        :raises StepError: if neither the 'm' option nor an annotation
            connection provides an annotation
        """
        self.set_cores(self.get_option('cores'))

        # Look for an annotation delivered via connection; if several runs
        # provide one, the last one encountered is used.
        annotation = None
        for candidate_id in run_ids_connections_files.keys():
            connections = run_ids_connections_files[candidate_id]
            if 'in/annotation' in connections:
                annotation = connections['in/annotation'][0]

        for run_id in run_ids_connections_files.keys():
            connections = run_ids_connections_files[run_id]
            if 'in/annotation' in connections:
                continue

            with self.declare_run(run_id) as run:
                counts = connections['in/counts'][0]

                tool_name = self.get_option('t')
                file_name = '%s-gene-abundance.tsv' % (run_id)
                run.add_output_file('counts', file_name, [counts])

                cmd = [self.get_tool('tcount2gcount')]

                # The configured option takes precedence over the connection
                if self.is_option_set_in_config('m'):
                    mapping_source = self.get_option('m')
                elif annotation:
                    mapping_source = annotation
                else:
                    raise StepError(
                        self,
                        "%s no annotation give via config or connection" %
                        run_id)
                cmd += ['-m', os.path.abspath(mapping_source)]

                if self.is_option_set_in_config('kallisto-extended'):
                    cmd += ['--kallisto-extended']

                cmd += ['-i', counts, '-t', tool_name, '-o', file_name]

                convert_exec_group = run.new_exec_group()
                convert_exec_group.add_command(cmd)
Beispiel #17
0
    def runs(self, cc):
        """Declare one fastx_quality_stats run per run ID.

        For each of the connections 'in/first_read' and 'in/second_read'
        that exists for a run, every input file is streamed into its own
        temporary fifo (decompressed with pigz if it ends with 'fastq.gz').
        All fifos are then concatenated with cat and piped into
        fastx_quality_stats, whose stdout becomes the
        '<read>_quality_stats' output file.

        :param cc: connections object, indexable as cc[run_id][connection]
            to obtain the list of input paths
        :raises StepError: if an input file ends with neither 'fastq.gz'
            nor 'fastq'
        """
        # Translate the configured options into command line arguments.
        # Boolean options are pure switches: emitted when True, omitted
        # when False. All other options are emitted with their value.
        options = {'new_output_format': '-N', 'quality': '-Q'}
        option_list = list()
        for option in options:
            if not self.is_option_set_in_config(option):
                continue
            value = self.get_option(option)
            if isinstance(value, bool):
                if value:
                    option_list.append(options[option])
            else:
                option_list.append(options[option])
                option_list.append(str(value))

        read_types = {'first_read': '_R1', 'second_read': '_R2'}
        for run_id in cc.keys():
            cc.switch_run_id(run_id)
            with self.declare_run(run_id) as run:
                for read in read_types:
                    connection = 'in/%s' % read
                    if not cc.exists_connection_for_run(connection):
                        continue
                    input_paths = cc[run_id][connection]

                    # Check for empty connections
                    if input_paths == [None]:
                        run.add_empty_output_connection("%s_quality_stats" %
                                                        read)
                    else:
                        temp_fifos = list()
                        exec_group = run.new_exec_group()
                        for input_path in input_paths:
                            # 1. Create temporary fifo for every input file
                            temp_fifo = run.add_temporary_file(
                                "fifo-%s" % os.path.basename(input_path))
                            temp_fifos.append(temp_fifo)
                            mkfifo = [self.get_tool('mkfifo'), temp_fifo]
                            exec_group.add_command(mkfifo)

                            # 2. Output files to fifo
                            if input_path.endswith('fastq.gz'):
                                with exec_group.add_pipeline() as unzip_pipe:
                                    # 2.1 command: Read file in 'dd-blocksize'
                                    # chunks
                                    dd_in = [
                                        self.get_tool('dd'),
                                        'ibs=%s' %
                                        self.get_option('dd-blocksize'),
                                        'if=%s' % input_path
                                    ]
                                    # 2.2 command: Uncompress file to fifo
                                    pigz = [
                                        self.get_tool('pigz'), '--decompress',
                                        '--processes',
                                        str(self.get_cores()), '--blocksize',
                                        self.get_option('pigz-blocksize'),
                                        '--stdout'
                                    ]
                                    # 2.3 Write file in 'dd-blocksize' chunks
                                    # to fifo
                                    dd_out = [
                                        self.get_tool('dd'),
                                        'obs=%s' %
                                        self.get_option('dd-blocksize'),
                                        'of=%s' % temp_fifo
                                    ]

                                    unzip_pipe.add_command(dd_in)
                                    unzip_pipe.add_command(pigz)
                                    unzip_pipe.add_command(dd_out)
                            elif input_path.endswith('fastq'):
                                # 2.1 command: Read file in 'dd-blocksize' chunks and
                                # write to fifo in 'dd-blocksize' chunks
                                dd_in = [
                                    self.get_tool('dd'),
                                    'bs=%s' % self.get_option('dd-blocksize'),
                                    'if=%s' % input_path,
                                    'of=%s' % temp_fifo
                                ]
                                exec_group.add_command(dd_in)
                            else:
                                # Name the offending file in the message
                                raise StepError(
                                    self, "File %s does not end with any "
                                    "expected suffix (fastq.gz or "
                                    "fastq). Please fix that issue." %
                                    input_path)
                        # 3. Read data from fifos and check quality stats
                        with exec_group.add_pipeline() as fastx_pipe:
                            # 3.1 command: Read from ALL fifos
                            cat = [self.get_tool('cat')]
                            cat.extend(temp_fifos)
                            # 3.2 command: Compute quality statistics
                            fastx_qs_file = run.add_output_file(
                                "%s_quality_stats" % read,
                                "%s%s.fastq.quality.tsv" %
                                (run_id, read_types[read]), input_paths)
                            fastx_qs = [self.get_tool('fastx_quality_stats')]
                            fastx_qs.extend(option_list)
                            fastx_pipe.add_command(cat)
                            fastx_pipe.add_command(fastx_qs,
                                                   stdout_path=fastx_qs_file)
Beispiel #18
0
    def runs(self, cc):
        """Declare one hisat2 alignment run per run ID.

        Validates the index and the read connections, resolves the library
        type for paired end data, and then builds a pipeline per run in
        which hisat2 writes SAM to stdout and pigz compresses it into
        '<run_id>-hisat2-results.sam.gz'. Log, summary and metrics files
        are always declared; files for unaligned/aligned reads only if the
        'un-gz'/'al-gz' options are True.

        :param cc: connections object, indexable as cc[run_id][connection]
            to obtain the list of input paths
        :raises StepError: if the index is missing, a run lacks a required
            read connection, or the library type settings conflict
        :raises Exception: if more than one of the flags 'fr', 'rf', 'ff'
            is set
        """
        # Boolean switches
        flags = [
            'q', 'qseq', 'skip', 'f', 'c', 'ignore-quals', 'nofw', 'dta',
            'norc', 'no-mixed', 'no-discordant', 'quiet', 'qc-filter',
            'non-deterministic', 'no-temp-splicesite', 'no-softclip',
            'no-spliced-alignment', 'tmo', 'no-head', 'no-sq', 'omit-sec-seq',
            'remove-chrname', 'add-chrname', 'new-summary'
        ]

        # Options that are passed along with their value
        strflags = [
            'n-ceil', 'ma', 'mp', 'sp', 'np', 'rdg', 'score-min', 'k', 'skip',
            'rfg', 'rg', 'pen-cansplice', 'pen-noncansplice',
            'pen-canintronlen', 'pen-noncanintronlen', 'min-intronlen',
            'max-intronlen', 'known-splicesite-infile', 'minins', 'maxins',
            'seed', 'trim5', 'trim3', 'novel-splicesite-outfile',
            'novel-splicesite-infile'
        ]

        self.set_cores(self.get_option('cores'))

        # Check if option values are valid
        if not os.path.exists(self.get_option('index') + '.1.ht2'):
            raise StepError(
                self,
                "Could not find index file: %s.*" % self.get_option('index'))

        paired_end = cc.connection_exists('in/second_read')

        if not cc.all_runs_have_connection('in/first_read'):
            read_name = '' if paired_end else ' first'
            run_ids = list(cc.get_runs_without_any('in/first_read'))
            if len(run_ids) > 5:
                run_ids = run_ids[0:5] + ['...']
            raise StepError(
                self, 'No%s read passed by runs '
                '%s.' % (read_name, list(run_ids)))

        if paired_end and not cc.all_runs_have_connection('in/second_read'):
            run_ids = list(cc.get_runs_without_any('in/second_read'))
            if len(run_ids) > 5:
                run_ids = run_ids[0:5] + ['...']
            raise StepError(self, 'No second read passed by runs '
                            '%s.' % run_ids)

        res = [(self.get_option('fr'), 'fr'), (self.get_option('rf'), 'rf'),
               (self.get_option('ff'), 'ff')]
        library_types = [flag for is_set, flag in res if is_set]
        if len(library_types) > 1:
            message = "too many stranded flags fr, rf, ff: %s"
            raise Exception(message % (res))

        library_type = self.get_option('library_type')

        if paired_end and library_types and library_type:
            raise StepError(
                self, 'Option "library_type: %s" and the flag %s '
                'are set. Please specify only one.' %
                (library_type, library_types[0]))
        elif paired_end and library_types:
            library_type = library_types[0]
        elif not paired_end and library_types:
            raise StepError(
                self, 'Library type %s is specified for single '
                'end reads.' % library_types[0])
        elif not paired_end and library_type:
            raise StepError(
                self, 'Library type %s is specified for single '
                'end reads.' % library_type)

        # Leave 2 cores available for pigz compressing the output, but
        # never ask hisat2 for fewer than one thread.
        hisat2_threads = max(1, self.get_option('cores') - 2)

        for run_id in cc.keys():
            with self.declare_run(run_id) as run:
                # Get list of files for first/second read
                fr_input = cc[run_id]['in/first_read'][0]
                input_paths = [fr_input]
                if paired_end:
                    sr_input = cc[run_id]['in/second_read'][0]
                    input_paths.append(sr_input)

                with run.new_exec_group() as exec_group:
                    with exec_group.add_pipeline() as hisat2_pipe:
                        # Assemble hisat2 command
                        hisat2 = [self.get_tool('hisat2')]

                        for flag in flags:
                            if self.get_option(flag) is True:
                                if flag in ['q', 'f', 'r', 'c']:
                                    hisat2.extend(['-' + flag])
                                else:
                                    hisat2.extend(['--' + flag])

                        # Only pass a library type that is actually set;
                        # otherwise the literal '--None' would be emitted.
                        if paired_end and library_type:
                            hisat2.append('--%s' % library_type)

                        for flag in strflags:
                            if self.is_option_set_in_config(flag):
                                hisat2.extend(
                                    ['--' + flag,
                                     str(self.get_option(flag))])

                        hisat2.extend([
                            '-p',
                            str(hisat2_threads), '-x',
                            os.path.abspath(self.get_option('index'))
                        ])

                        if paired_end:
                            # Paired end strandness is given per mate
                            if self.get_option('rna-strandness') == 'F':
                                hisat2.extend(['--rna-strandness', 'FR'])
                            elif self.get_option('rna-strandness') == 'R':
                                hisat2.extend(['--rna-strandness', 'RF'])

                            hisat2.extend(['-1', fr_input, '-2', sr_input])
                        else:
                            hisat2.extend(['-U', fr_input])
                            if not self.get_option('rna-strandness') == 'U':
                                hisat2.extend([
                                    '--rna-strandness',
                                    self.get_option('rna-strandness')
                                ])

                        log_stderr = run.add_output_file(
                            'log_stderr', '%s-hisat2-log_stderr.txt' % run_id,
                            input_paths)

                        summary = run.add_output_file(
                            'summary', '%s-hisat2-summary.txt' % run_id,
                            input_paths)
                        hisat2.extend(['--summary-file', summary])

                        metrics = run.add_output_file(
                            'metrics', '%s-hisat2-metrics.txt' % run_id,
                            input_paths)
                        hisat2.extend(['--met-file', metrics])

                        if self.get_option('un-gz') is True:
                            unaligned = run.add_output_file(
                                'unaligned',
                                '%s-hisat2-unaligned.fastq.gz' % run_id,
                                input_paths)
                            hisat2.extend(['--un-gz', unaligned])

                        if self.get_option('al-gz') is True:
                            aligned = run.add_output_file(
                                'aligned',
                                '%s-hisat2-aligned.fastq.gz' % run_id,
                                input_paths)
                            hisat2.extend(['--al-gz', aligned])

                        hisat2_pipe.add_command(hisat2, stderr_path=log_stderr)
                        alignments = run.add_output_file(
                            'alignments', '%s-hisat2-results.sam.gz' % run_id,
                            input_paths)

                        # Compress hisat2 output
                        pigz = [self.get_tool('pigz'), '--stdout']
                        hisat2_pipe.add_command(pigz, stdout_path=alignments)
Beispiel #19
0
    def runs(self, cc):
        """Declare one fastq_screen run per run ID.

        Either a user supplied config file (deprecated) or a config
        generated from the 'databases' option is used. The generated
        config is written with printf into an output file if the
        'keep config' option is set, otherwise into a temporary file.
        fastq_screen is then called once for every file on the
        'in/first_read' connection.

        :param cc: connections object, indexable as cc[run_id][connection]
            to obtain the list of input paths
        :raises StepError: if both or neither of the options 'config' and
            'databases' are given
        """
        self.set_cores(self.get_option('cores'))

        if self.get_option('config') and self.get_option('databases'):
            raise StepError(self, "A config file and databases are specified.")

        if not self.get_option('config') and not self.get_option('databases'):
            raise StepError(self, "No config file or databases are specified.")

        if self.get_option('config'):
            logger.warning(
                '[%s] Using a config file is deprecated. '
                'Please specify databases instead.' %
                self.get_step_name())
            config_file = os.path.abspath(self.get_option('config'))
        else:
            conf_data = [
                'BOWTIE2 %s' % self.get_tool('bowtie2'),
                'THREADS %d' % self.get_cores()
            ]
            # Each item is a (name, value) pair filling both placeholders
            for db in sorted(self.get_option('databases').items()):
                conf_data.append('DATABASE %s %s BOWTIE2' % db)

        # Suffixes removed from the input name to derive output names;
        # longer suffixes come first so '.fastq.gz' wins over '.gz'-less
        # variants.
        fastq_suffixes = ('.fastq.gz', '.fq.gz', '.fastq', '.fq')

        for run_id in cc.keys():
            run = self.declare_run(run_id)
            if not self.get_option('config'):
                if self.get_option('keep config'):
                    config_file = run.add_output_file('fastq_screen.conf',
                                                      'fastq_screen.conf', [])
                else:
                    config_file = run.add_temporary_file('fastq_screen.conf')
                write_conf = [self.get_tool('printf'), '\n'.join(conf_data)]
                execg = run.new_exec_group()
                execg.add_command(write_conf, stdout_path=config_file)
            for input_path in cc[run_id]['in/first_read']:
                # Remove the suffix as a whole. str.rstrip() would treat
                # it as a set of characters and also eat trailing letters
                # of the sample name (e.g. 'sample_qa' -> 'sample_').
                file_name = os.path.basename(input_path)
                for suffix in fastq_suffixes:
                    if file_name.endswith(suffix):
                        file_name = file_name[:-len(suffix)]
                        break
                # prepare output files
                file_pattern = "%s_screen.txt" % (file_name)
                run.add_output_file("fqc_report", file_pattern, [input_path])

                file_pattern = "%s_screen.png" % (file_name)
                run.add_output_file("fqc_image", file_pattern, [input_path])

                file_pattern = "%s_screen.html" % (file_name)
                run.add_output_file("fqc_html", file_pattern, [input_path])

                file_pattern = "%s-fastqscreen-log_stdout.txt" % (file_name)
                log_stdout = run.add_output_file("log_stdout",
                                                 file_pattern,
                                                 [input_path])

                file_pattern = "%s-fastqscreen-log_stderr.txt" % (file_name)
                log_stderr = run.add_output_file("log_stderr",
                                                 file_pattern,
                                                 [input_path])

                # build fastq_screen command
                fastq_screen_exec_group = run.new_exec_group()
                fastq_screen = [self.get_tool('fastq_screen'),
                                '-conf', config_file]

                if self.get_option('subset'):
                    fastq_screen.extend(['--subset',
                                         str(self.get_option('subset'))])

                if self.get_option('nohits'):
                    file_pattern = "%s.tagged.fastq.gz" % (file_name)
                    run.add_output_file("tagged", file_pattern, [input_path])

                    file_pattern = "%s.tagged_filter.fastq.gz" % (file_name)
                    run.add_output_file("tagged_filter", file_pattern,
                                        [input_path])

                    fastq_screen.extend(['--nohits'])

                fastq_screen.extend(['--outdir', '.', input_path])
                fastq_screen_exec_group.add_command(
                    fastq_screen, stdout_path=log_stdout, stderr_path=log_stderr)
Beispiel #20
0
    def runs(self, run_ids_connections_files):
        """Declare one bamCompare run per configured pair of samples.

        Each entry of the 'samples' option must name exactly two samples,
        each of which must provide exactly one BAM file on the
        'in/alignments' connection. The run is named
        '<sample1>-<sample2>' and writes its track to the 'ucsc-tracks'
        output connection.

        :param run_ids_connections_files: mapping of run ID to a mapping of
            connection name to a list of file paths
        :raises StepError: if a sample list does not hold two samples, a
            sample has no 'in/alignments' files, or a sample does not
            provide exactly one BAM file
        """
        # Options shared by the bin and BED-file subcommands
        candidate_options = [
            'outFileFormat', 'scaleFactorsMethod', 'sampleLength',
            'numberOfSamples', 'scaleFactors', 'ratio', 'pseudocount',
            'binSize', 'region', 'blackListFileName', 'normalizeTo1x',
            'normalizeUsingRPKM', 'ignoreForNormalization',
            'skipNonCoveredRegions', 'smoothLength', 'extendReads',
            'ignoreDuplicates', 'minMappingQuality', 'centerReads',
            'samFlagInclude', 'samFlagExclude', 'minFragmentLength',
            'maxFragmentLength'
        ]
        configured = [
            name for name in candidate_options
            if self.is_option_set_in_config(name)
        ]

        # Booleans become bare switches (only when True); everything else
        # is passed as '--name value'.
        option_list = []
        for name in configured:
            value = self.get_option(name)
            if isinstance(value, bool):
                if value:
                    option_list.append('--%s' % name)
                continue
            option_list += ['--%s' % name, str(value)]

        # Output file name template per supported output format
        name_templates = (('bigwig', '%s.bw'), ('bedgraph', '%s.bg'))

        for samples in self.get_option('samples'):
            # Validate the user input and the connection data
            if len(samples) != 2:
                raise StepError(
                    self, "Expected exactly two samples. Received %s (%s)" %
                    (len(samples), ", ".join(samples)))

            input_paths = []
            for sample in samples:
                try:
                    files = run_ids_connections_files[sample]['in/alignments']
                except KeyError as e:
                    raise StepError(
                        self, 'No files found for sample %s and connection '
                        '"in/alignments". Please check your configuration.' %
                        sample)
                if len(files) != 1 or not files[0].endswith('.bam'):
                    raise StepError(
                        self, "Expected exactly one BAM file, got %s" %
                        ", ".join(files))
                input_paths.append(files[0])

            # The run is named after both samples
            run_id = "%s-%s" % (samples[0], samples[1])

            with self.declare_run(run_id) as run:
                # Declare the output file for the requested format; any
                # other format leaves the output name empty.
                outfile = ''
                out_format = self.get_option('outFileFormat')
                for known_format, template in name_templates:
                    if out_format == known_format:
                        outfile = run.add_output_file('ucsc-tracks',
                                                      template % run_id,
                                                      input_paths)
                        break

                with run.new_exec_group() as bam_compare_eg:
                    bam_compare = [self.get_tool('bamCompare')]
                    bam_compare += ['--bamfile1', input_paths[0]]
                    bam_compare += ['--bamfile2', input_paths[1]]
                    bam_compare += ['--outFileName', outfile]
                    bam_compare += ['--numberOfProcessors',
                                    str(self.get_cores())]
                    bam_compare += option_list

                    bam_compare_eg.add_command(bam_compare)
Beispiel #21
0
    def runs(self, cc):

        read_types = {'first_read': 'R1'}

        paired_end = cc.connection_exists('in/second_read')
        if not cc.all_runs_have_connection('in/first_read'):
            read_name = '' if paired_end else ' first'
            run_ids = list(cc.get_runs_without_any('in/first_read'))
            if len(run_ids) > 5:
                run_ids = run_ids[0:5] + ['...']
            raise StepError(self, '[cutadapt] No%s read passed by runs '
                            '%s.' % (read_name, list(run_ids)))
        if paired_end:
            if not cc.all_runs_have_connection('in/second_read'):
                read_name = ' second'
                run_ids = list(cc.get_runs_without_any('in/second_read'))
                if len(run_ids) > 5:
                    run_ids = run_ids[0:5] + ['...']
                raise StepError(self, '[cutadapt] No%s read passed by runs '
                                '%s.' % (read_name, list(run_ids)))
            read_types['second_read'] = 'R2'

        options = [
            "error-rate",
            "no-indels",
            "times",
            "overlap",
            "match-read-wildcards",
            "discard-trimmed",
            "discard-untrimmed",
            "minimum-length",
            "maximum-length",
            "no-trim",
            "mask-adapter",
            "cut",
            "quality-cutoff",
            "quality-base",
            "prefix",
            "suffix",
            "strip-suffix",
            "colospace",
            "double-encode",
            "trim-primer",
            "strip-f3",
            "maq",
            "bwa",
            "length-tag",
            "no-zero-cap",
            "zero-cap"]

        set_options = [option for option in options if
                       self.is_option_set_in_config(option)]

        option_list = list()
        for option in set_options:
            if isinstance(self.get_option(option), bool):
                if self.get_option(option):
                    option_list.append('--%s' % option)
            else:
                option_list.append('--%s' % option)
                option_list.append(str(self.get_option(option)))

        for run_id in cc.keys():
            run = self.declare_run(run_id)
            for read in read_types:
                connection = 'in/%s' % read
                input_paths = cc[run_id][connection]

                # make sure that adapter-R1/adapter-R2 or adapter-file are
                # correctly set
                # this kind of mutual exclusive option checking is a bit
                # tedious, so we do it here.
                if read == 'second_read':
                    if (not self.is_option_set_in_config('adapter-R2') and
                            not self.is_option_set_in_config('adapter-file')):
                        raise StepError(
                            self, "Option 'adapter-R2' or 'adapter-file' "
                            "required because sample %s is paired end!" %
                            run_id)

                if (self.is_option_set_in_config('adapter-file') and
                        self.is_option_set_in_config('adapter-R1')):
                    raise StepError(self,
                                    "Option 'adapter-R1' and 'adapter-file' "
                                    "are both set but are mutually exclusive!")
                if (not self.is_option_set_in_config('adapter-file') and
                        not self.is_option_set_in_config('adapter-R1')):
                    raise StepError(self,
                                    "Option 'adapter-R1' or 'adapter-file' "
                                    "required to call cutadapt for sample %s!"
                                    % run_id)
                temp_fifos = list()
                exec_group = run.new_exec_group()
                for input_path in input_paths:
                    # 1. Create temporary fifo for every input file
                    temp_fifo = run.add_temporary_file(
                        "fifo-%s" % os.path.basename(input_path))
                    temp_fifos.append(temp_fifo)
                    mkfifo = [self.get_tool('mkfifo'), temp_fifo]
                    exec_group.add_command(mkfifo)
                    # 2. Output files to fifo
                    if input_path.endswith('fastq.gz'):
                        with exec_group.add_pipeline() as pigz_pipe:
                            # 2.1 command: Read file in 4MB chunks
                            dd_in = [
                                self.get_tool('dd'),
                                'ibs=%s' %
                                self.get_option('dd-blocksize'),
                                'if=%s' % input_path
                            ]
                            # 2.2 command: Uncompress file to fifo
                            pigz = [self.get_tool('pigz'),
                                    '--decompress',
                                    '--processes',
                                    str(self.get_cores()),
                                    '--blocksize',
                                    self.get_option('pigz-blocksize'),
                                    '--stdout']
                            # 2.3 command: Write file in 4MB chunks to
                            #              fifo
                            dd_out = [
                                self.get_tool('dd'),
                                'obs=%s' %
                                self.get_option('dd-blocksize'),
                                'of=%s' % temp_fifo
                            ]

                            pigz_pipe.add_command(dd_in)
                            pigz_pipe.add_command(pigz)
                            pigz_pipe.add_command(dd_out)

                    elif input_path.endswith('fastq'):
                        # 2.1 command: Read file in 4MB chunks and
                        #              write to fifo in 4MB chunks
                        dd_in = [
                            self.get_tool('dd'),
                            'bs=%s' % self.get_option('dd-blocksize'),
                            'if=%s' % input_path,
                            'of=%s' % temp_fifo
                        ]
                        exec_group.add_command(dd_in)
                    else:
                        raise StepError(self, "File %s does not end with any "
                                        "expected suffix (fastq.gz or "
                                        "fastq). Please fix that issue.")
                # 3. Read data from fifos
                with exec_group.add_pipeline() as cutadapt_pipe:
                    # 3.1 command: Read from ALL fifos
                    cat = [self.get_tool('cat')]
                    cat.extend(temp_fifos)
                    cutadapt_pipe.add_command(cat)

                    # 3.2 command: Fix qnames if user wants us to
                    if self.get_option('fix_qnames'):
                        fix_qnames = [self.get_tool('fix_qnames')]
                        cutadapt_pipe.add_command(fix_qnames)

                    # Let's get the correct adapter sequences or
                    # adapter sequence fasta file
                    adapter = None
                    # Do we have adapter sequences as input?
                    if self.is_option_set_in_config('adapter-%s'
                                                    % read_types[read]):
                        # Get adapter sequence
                        adapter = self.get_option(
                            'adapter-%s' % read_types[read])

                        # add index to adapter sequence if necessary
                        if '((INDEX))' in adapter:
                            index = self.find_upstream_info_for_input_paths(
                                input_paths,
                                'index-%s' % read_types[read])
                            adapter = adapter.replace('((INDEX))', index)

                        # create reverse complement if necessary
                        if self.get_option('use_reverse_complement'):
                            complements = adapter.maketrans('acgtACGT',
                                                            'tgcaTGCA')
                            adapter = adapter.translate(complements)[::-1]

                        # make sure the adapter is looking good
                        if re.search(r'^[ACGT]+$', adapter) is None:
                            raise StepError(self, "Unable to come up with a "
                                            "legit-looking adapter: %s"
                                            % adapter)
                    # Or do we have a adapter sequence fasta file?
                    elif self.is_option_set_in_config('adapter-file'):
                        adapter_file = os.path.abspath(self.get_option('adapter-file'))
                        adapter = "file:" + adapter_file
                        if not os.path.exists(adapter_file):
                            raise StepError(
                                self, "File %s containing adapter sequences "
                                "does not exist." %
                                self.get_option('adapter-file'))

                    # 3.3 command: Clip adapters
                    cutadapt = [self.get_tool('cutadapt'),
                                self.get_option('adapter-type'),
                                adapter, '-']
                    cutadapt.extend(option_list)

                    cutadapt_log_file = run.add_output_file(
                        'log_%s' % read,
                        '%s-cutadapt-%s-log.txt'
                        % (run_id, read_types[read]),
                        input_paths)

                    # 3.4 command: Compress output
                    pigz = [self.get_tool('pigz'),
                            '--processes', str(self.get_cores()),
                            '--blocksize', self.get_option('pigz-blocksize'),
                            '--stdout']
                    # 3.5 command: Write to output file in 4MB chunks
                    clipped_fastq_file = run.add_output_file(
                        "%s" % read,
                        "%s_%s.fastq.gz" %
                        (run_id, read_types[read]),
                        input_paths)

                    dd = [
                        self.get_tool('dd'),
                        'obs=%s' % self.get_option('dd-blocksize'),
                        'of=%s' % clipped_fastq_file
                    ]

                    cutadapt_pipe.add_command(cutadapt,
                                              stderr_path=cutadapt_log_file)
                    cutadapt_pipe.add_command(pigz)
                    cutadapt_pipe.add_command(dd)
Beispiel #22
0
Datei: pepr.py Projekt: yigbt/uap
    def runs(self, run_ids_connections_files):
        """Declare one PePr run for every entry of the 'chip_vs_input' option.

        Each run creates a temporary output directory, calls PePr on the
        ChIP/input alignment files collected from the referenced upstream
        runs, moves the result files to their final locations and finally
        packs the temporary directory into a tar.gz log archive.

        Args:
            run_ids_connections_files: Mapping of upstream run ID to a
                mapping of connection name to list of files. Only the
                'in/alignments' connection is read.

        Raises:
            StepError: If 'chip_vs_input' lacks a required key for a run, if
                a referenced upstream run has no 'in/alignments' connection,
                or if that connection is empty.
        """
        # Compile the list of options
        options = [
            'file-format', 'normalization', 'peaktype', 'shiftsize',
            'threshold', 'windowsize'
        ]

        # NOTE(review): 'file-format' is also passed explicitly when the PePr
        # command is assembled below, so it shows up twice on the command
        # line whenever it is set in the config. Kept as in the original.
        option_list = list()
        for option in options:
            if not self.is_option_set_in_config(option):
                continue
            value = self.get_option(option)
            if isinstance(value, bool):
                # Boolean options are plain flags, only passed if True
                if value:
                    option_list.append('--%s' % option)
            else:
                option_list.extend(['--%s' % option, str(value)])

        # Are we going to perform differential peak calling yes or no?
        diff = self.get_option('diff')
        if diff:
            # We require chip1+input1 and chip2+input2
            config_to_option = {
                'rep1': 'chip1',
                'inputs1': 'input1',
                'rep2': 'chip2',
                'inputs2': 'input2'
            }
        else:
            # We only use chip1 and input1
            config_to_option = {'rep1': 'chip1', 'inputs1': 'input1'}

        # Get the essential dictionary with information about the relationship
        # between Input and ChIP samples; the highest level keys of the dict
        # are the new run IDs
        chip_vs_input = self.get_option('chip_vs_input')
        for run_id, experiment in chip_vs_input.items():

            # Maps PePr option name (chip1, input1, ...) to alignment files
            in_files = dict()
            for key, opt in config_to_option.items():
                # Keep the two lookups apart so that each failure is reported
                # with a message naming what is really missing.
                try:
                    in_run_ids = experiment[key]
                except KeyError as e:
                    raise StepError(
                        self, "Required key %s missing in 'chip_vs_input' "
                        "for run %s" % (key, run_id)) from e

                in_files[opt] = list()
                # in_run_id: run ID whose in/alignments files
                #            are used for pepr's --[chip[12]|input[12]]
                for in_run_id in in_run_ids:
                    try:
                        alignments = run_ids_connections_files[in_run_id][
                            'in/alignments']
                    except KeyError as e:
                        raise StepError(
                            self, "No 'in/alignments' files found for "
                            "upstream run %s required by run %s"
                            % (in_run_id, run_id)) from e
                    if alignments == [None]:
                        raise StepError(
                            self, "Upstream run %s provides no "
                            "alignments for run %s" % (in_run_id, run_id))
                    in_files[opt].extend(alignments)

            # Create a new run named run_id
            with self.declare_run(run_id) as run:
                # Assemble list of all input files
                input_paths = [f for k in in_files for f in in_files[k]]

                # result_files dict:
                # keys = file names inside the temporary directory
                # values = final file names
                result_files = dict()

                if diff:
                    # Differential peak calling yields no normal peaks
                    run.add_empty_output_connection("peaks")
                    # but two peak lists with differential peaks
                    for chip in ('chip1', 'chip2'):
                        chip_file = '%s__PePr_%s_peaks.bed' % (run_id, chip)
                        result_files[chip_file] = run.add_output_file(
                            'differential_peaks', chip_file, input_paths)
                else:
                    # Normal peak calling yields no differential_peaks
                    run.add_empty_output_connection("differential_peaks")
                    # but a single peak file
                    peaks_file = '%s__PePr_peaks.bed' % run_id
                    result_files[peaks_file] = run.add_output_file(
                        'peaks', peaks_file, input_paths)

                # parameter file used to run PePr with
                parameter_file = '%s__PePr_parameters.txt' % run_id
                result_files[parameter_file] = run.add_output_file(
                    'parameter', parameter_file, input_paths)

                # Directory PePr writes its output into
                temp_dir = 'pepr-out'
                with run.new_exec_group() as pepr_exec_group:
                    # 1. Create temporary directory for PePr output
                    mkdir = [self.get_tool('mkdir'), temp_dir]
                    pepr_exec_group.add_command(mkdir)

                    # 2. Compile the PePr command
                    pepr = [
                        self.get_tool('pepr'), '--output-directory', temp_dir,
                        '--file-format',
                        self.get_option('file-format'), '--name', run_id
                    ]
                    # Add '--[chip[12]|input[12]]' and comma separated list of
                    # alignment files
                    for opt, files in in_files.items():
                        pepr.append('--%s' % opt)
                        pepr.append(','.join(files))

                    if diff:
                        pepr.append('--diff')
                    # Add additional options
                    pepr.extend(option_list)
                    pepr_exec_group.add_command(pepr)

                with run.new_exec_group() as mv_exec_group:
                    for orig, dest_path in result_files.items():
                        # 3. Move file from temp directory to expected
                        #    position
                        orig_path = os.path.join(temp_dir, orig)
                        mv = [self.get_tool('mv'), orig_path, dest_path]
                        mv_exec_group.add_command(mv)

                with run.new_exec_group() as tar_exec_group:
                    log_file = run.add_output_file(
                        'log', '%s__PePr_debug_log.tar.gz' % run_id,
                        input_paths)
                    # 4. Compress the temp directory (which should only
                    #    contain the log file by now) and delete all files
                    #    in there
                    tar = [
                        self.get_tool('tar'), '--create', '--gzip',
                        '--verbose', '--remove-files',
                        '--file=%s' % log_file, temp_dir
                    ]
                    tar_exec_group.add_command(tar)
Beispiel #23
0
    def runs(self, run_ids_connections_files):
        """Declare one Picard AddOrReplaceReadGroups run per input run.

        Every option set in the configuration is rendered as a Picard style
        ``NAME=value`` argument. A run with an empty 'in/alignments'
        connection gets an empty 'alignments' output connection; otherwise
        exactly one SAM/BAM file is expected, and the output file keeps the
        base name of the input file. The run ID is used as sample name
        (RGSM).

        Args:
            run_ids_connections_files: Mapping of run ID to a mapping of
                connection name to list of files.

        Raises:
            StepError: If a run provides more or less than one alignments
                file, or if the file suffix is neither '.sam' nor '.bam'.
        """
        picard_options = (
            # Standard Picard options
            'TMP_DIR', 'VERBOSITY', 'QUIET', 'VALIDATION_STRINGENCY',
            'COMPRESSION_LEVEL', 'MAX_RECORDS_IN_RAM', 'CREATE_INDEX',
            'CREATE_MD5_FILE', 'REFERENCE_SEQUENCE', 'GA4GH_CLIENT_SECRETS',
            # Picard AddOrReplaceReadGroups options
            'SORT_ORDER', 'RGID', 'RGLB', 'RGPL', 'RGPU', 'RGCN', 'RGDS',
            'RGDT', 'RGPI', 'RGPG', 'RGPM',
        )
        # Options whose value is a path that is passed on as absolute path
        path_valued = ('TMP_DIR', 'REFERENCE_SEQUENCE')

        option_list = []
        for name in picard_options:
            if not self.is_option_set_in_config(name):
                continue
            setting = self.get_option(name)
            if isinstance(setting, bool):
                # Picard expects booleans spelled out as true/false
                template = '%s=true' if setting else '%s=false'
                option_list.append(template % name)
                continue
            text = str(setting)
            if name in path_valued:
                text = os.path.abspath(text)
            option_list.append('%s=%s' % (name, text))

        for run_id, connections in run_ids_connections_files.items():

            with self.declare_run(run_id) as run:
                input_paths = connections['in/alignments']

                # Nothing came in, so nothing goes out
                if input_paths == [None]:
                    run.add_empty_output_connection("alignments")
                    continue

                if len(input_paths) != 1:
                    raise StepError(self,
                                    "Expected exactly one alignments file.")

                source = input_paths[0]
                suffix = os.path.splitext(source)[1]
                if suffix not in ['.sam', '.bam']:
                    raise StepError(
                        self,
                        "The file %s seems not to be a SAM or BAM file. At "
                        "least the suffix is wrong." % source)

                with run.new_exec_group() as exec_group:
                    target = run.add_output_file(
                        'alignments', os.path.basename(source), input_paths)
                    command = [
                        self.get_tool('picard-tools'),
                        'AddOrReplaceReadGroups',
                        'INPUT=%s' % source,
                        'OUTPUT=%s' % target,
                        'RGSM=%s' % run_id,
                    ]
                    command += option_list
                    exec_group.add_command(command)
    def runs(self, run_ids_connections_files):
        """Declare deepTools bamPEFragmentSize runs.

        If the 'samples' option is set, it maps each new run ID to a list of
        upstream run IDs whose BAM files are analysed together. Otherwise one
        run is declared per upstream run ID, analysing all of its
        'in/alignments' files together.

        Every run writes the tool's stdout to a 'fragment_size_stats' file
        and, if the 'histogram' option is enabled, a PNG to
        'fragment_size_plots'.

        Args:
            run_ids_connections_files: Mapping of upstream run ID to a
                mapping of connection name to list of files. Only the
                'in/alignments' connection is read.

        Raises:
            StepError: If 'samples' is malformed, names an unknown upstream
                run, references a file that is not a BAM file, or if an
                upstream run has no 'in/alignments' connection.
        """
        # Compile the list of pass-through options. 'histogram' is left out
        # on purpose: it takes the generated plot file as its argument and is
        # added when the command is assembled. Passing it generically as well
        # would emit it a second time (without a file for a boolean value).
        options = [
            'maxFragmentLength', 'logScale', 'binSize',
            'distanceBetweenBins', 'blackListFileName'
        ]
        option_list = list()
        for option in options:
            if not self.is_option_set_in_config(option):
                continue
            value = self.get_option(option)
            if isinstance(value, bool):
                # Boolean options are plain flags, only passed if True
                if value:
                    option_list.append('--%s' % option)
            else:
                option_list.extend(['--%s' % option, str(value)])

        # Only plot if the option is set AND not disabled with a false value
        plot_histogram = bool(
            self.is_option_set_in_config('histogram')
            and self.get_option('histogram'))

        def declare_bamPEFragmentSize(run_id, input_paths, labels):
            """Declare a single run analysing all files in input_paths."""
            with self.declare_run(run_id) as run:
                # Let's compile the command
                with run.new_exec_group() as bamPEFragmentSize_eg:
                    # 1. bamPEFragmentSize command
                    bamPEFragmentSize = [
                        self.get_tool('bamPEFragmentSize'),
                        '--numberOfProcessors',
                        # command elements have to be strings
                        str(self.get_cores()), '--bamfiles'
                    ]
                    bamPEFragmentSize.extend(input_paths)

                    # Set options for plot creation
                    if plot_histogram:
                        bamPEFragmentSize.append('--histogram')
                        bamPEFragmentSize.append(
                            run.add_output_file('fragment_size_plots',
                                                '%s.png' % run_id,
                                                input_paths))
                        bamPEFragmentSize.append('--plotTitle')
                        bamPEFragmentSize.append(run_id)
                        bamPEFragmentSize.append('--samplesLabel')
                        bamPEFragmentSize.extend(labels)

                    # Append list of options ('--logScale' is part of it and
                    # therefore only passed when the option is true)
                    bamPEFragmentSize.extend(option_list)

                    bamPEFragmentSize_eg.add_command(
                        bamPEFragmentSize,
                        stdout_path=run.add_output_file(
                            'fragment_size_stats',
                            '%s-PEFragmentSize.stats' % run_id, input_paths))

        if self.is_option_set_in_config('samples'):
            runIds_samples = self.get_option('samples')

            for run_id, samples in runIds_samples.items():
                # Check the container type first so that the messages below
                # can safely iterate over it
                if not isinstance(samples, list):
                    raise StepError(
                        self, "Not a list of samples. Type: %s, Value: %s" %
                        (type(samples), samples))
                if not isinstance(run_id, str):
                    raise StepError(
                        self, "Not a string run ID (%s) for samples (%s)" %
                        (run_id, ", ".join(str(s) for s in samples)))

                # Start with fresh lists for every run ID; otherwise files
                # and labels of earlier runs leak into later ones
                input_paths = list()
                labels = list()
                for sample in samples:
                    try:
                        bam_files = run_ids_connections_files[sample][
                            'in/alignments']
                    except KeyError:
                        raise StepError(self,
                                        "No input sample named %s" % sample)

                    for i, bam_file in enumerate(bam_files):
                        if bam_file is None or not bam_file.endswith(".bam"):
                            raise StepError(
                                self, "Not a BAM file: %s" % bam_file)
                        input_paths.append(bam_file)
                        # Number the additional files of a sample
                        if i > 0:
                            labels.append("%s-%s" % (sample, i))
                        else:
                            labels.append(sample)
                # Start declaring the command
                declare_bamPEFragmentSize(run_id, input_paths, labels)

        else:
            for run_id, connections in run_ids_connections_files.items():
                try:
                    input_paths = connections['in/alignments']
                except KeyError:
                    raise StepError(
                        self, 'No files found for run-id %s and connection '
                        '"in/alignments". Please check your configuration.' %
                        run_id)
                if not input_paths:
                    # No files, hence nothing to declare
                    continue
                # A run ID can be declared only once, so all of its files go
                # into a single command (as a list, never as a bare string,
                # which extend() would split into characters)
                if len(input_paths) > 1:
                    labels = [
                        "%s-%s" % (run_id, i)
                        for i in range(len(input_paths))
                    ]
                else:
                    labels = [run_id]
                declare_bamPEFragmentSize(run_id, input_paths, labels)
Beispiel #25
0
    def runs(self, run_ids_connections_files):
        """Declare one Picard MergeSamFiles run per input run.

        Depending on the number of 'in/alignments' files of a run:

        * none: an empty 'alignments' output connection is declared,
        * one: the file is only symlinked to the output location,
        * several: the files are merged into '<run_id>-merged.bam'.

        Every option set in the configuration is rendered as a Picard style
        ``NAME=value`` argument for the merge command.

        Args:
            run_ids_connections_files: Mapping of run ID to a mapping of
                connection name to list of files.

        Raises:
            StepError: If the first input file has neither a '.sam' nor a
                '.bam' suffix, or if the path given via 'INTERVALS' does not
                exist.
        """
        options = [
            # Standard Picard Options:
            'TMP_DIR',
            'VERBOSITY',
            'QUIET',
            'VALIDATION_STRINGENCY',
            'COMPRESSION_LEVEL',
            'MAX_RECORDS_IN_RAM',
            'CREATE_INDEX',
            'CREATE_MD5_FILE',
            'REFERENCE_SEQUENCE',
            'GA4GH_CLIENT_SECRETS',
            # Picard MergeSamFiles Options:
            'SORT_ORDER',
            'ASSUME_SORTED',
            'MERGE_SEQUENCE_DICTIONARIES',
            'USE_THREADING',
            'COMMENT',
            'INTERVALS'
        ]
        # Options whose value is a path that is passed on as absolute path
        file_options = ['TMP_DIR', 'REFERENCE_SEQUENCE']

        option_list = list()
        for option in options:
            if not self.is_option_set_in_config(option):
                continue
            value = self.get_option(option)
            if isinstance(value, bool):
                # Picard expects booleans spelled out as true/false
                option_list.append(
                    '%s=%s' % (option, 'true' if value else 'false'))
            else:
                value = str(value)
                if option in file_options:
                    value = os.path.abspath(value)
                option_list.append('%s=%s' % (option, value))

        for run_id in run_ids_connections_files.keys():

            with self.declare_run(run_id) as run:
                input_paths = run_ids_connections_files[run_id][
                    'in/alignments']

                # The emptiness checks have to come first: everything below
                # looks at input_paths[0]
                if len(input_paths) == 0 or input_paths == [None]:
                    run.add_empty_output_connection("alignments")
                elif os.path.splitext(
                        input_paths[0])[1] not in ['.sam', '.bam']:
                    raise StepError(
                        self,
                        "The file %s seems not to be a SAM or BAM file. At "
                        "least the suffix is wrong." % input_paths[0])
                elif self.is_option_set_in_config("INTERVALS") and \
                        not os.path.exists(self.get_option("INTERVALS")):
                    raise StepError(
                        self, "The path %s given to option 'INTERVALS' is "
                        "not pointing to a file."
                        % self.get_option("INTERVALS"))
                elif len(input_paths) == 1:
                    # Nothing to merge
                    base = os.path.basename(input_paths[0])
                    with run.new_exec_group() as ln_alignment:
                        # 1. command: Create symbolic link to original file
                        # NOTE(review): the path is used as delivered by the
                        # upstream step; presumably it is absolute - confirm.
                        ln = [
                            self.get_tool('ln'), '-s', input_paths[0],
                            run.add_output_file('alignments', base,
                                                input_paths)
                        ]
                        ln_alignment.add_command(ln)

                else:
                    with run.new_exec_group() as exec_group:
                        alignments = run.add_output_file(
                            'alignments', '%s-merged.bam' % run_id,
                            input_paths)
                        merge_sam_files = [
                            self.get_tool('picard-tools'), 'MergeSamFiles'
                        ]
                        merge_sam_files.extend(
                            'INPUT=%s' % f for f in input_paths)
                        merge_sam_files.append('OUTPUT=%s' % alignments)
                        merge_sam_files.extend(option_list)
                        exec_group.add_command(merge_sam_files)
Beispiel #26
0
    def runs(self, run_ids_connections_files):
        """Declare one ChromHMM LearnModel run per input run.

        Each run consists of four execution groups: unpack the binarized
        input archive into a temporary directory, call ChromHMM LearnModel,
        pack the model files into a tar.gz output file and remove the
        unpacked input files again.

        Args:
            run_ids_connections_files: Mapping of run ID to a mapping of
                connection name to list of files. Only the
                'in/chromhmm_binarization' connection is read.

        Raises:
            StepError: If a run does not provide exactly one '.tar.gz' file.
        """
        flag_names = (
            'b', 'color', 'd', 'e', 'h', 'holdcolumnorder', 'init', 'l', 'm',
            'nobed', 'nobrowser', 'noenrich',
            # 'printposterior' and 'printstatebyline' are not passed on
            'r', 's', 'stateordering', 't', 'x', 'z',
        )
        # Options whose value is a path that is passed on as absolute path
        path_flags = ('assembly', 'l', 'm')

        option_list = []
        for flag in flag_names:
            if not self.is_option_set_in_config(flag):
                continue
            setting = self.get_option(flag)
            if isinstance(setting, bool):
                # Boolean options are plain switches, only passed if True
                if setting:
                    option_list.append('-%s' % flag)
                continue
            text = str(setting)
            if flag in path_flags:
                text = os.path.abspath(text)
            option_list += ['-%s' % flag, text]

        for run_id, connections in run_ids_connections_files.items():
            # The input should be a single tar.gz file; test that (at least
            # a bit)
            input_paths = connections['in/chromhmm_binarization']
            single_archive = (len(input_paths) == 1
                              and input_paths[0].endswith('.tar.gz'))
            if not single_archive:
                raise StepError(
                    self, "Expected single tar.gz file via "
                    "'in/chromhmm_binarization' for run %s, but got "
                    "this %s" % (run_id, ", ".join(input_paths)))
            archive = input_paths[0]

            with self.declare_run(run_id) as run:
                # 1. Unpack the binarized data and prepare the directories
                with run.new_exec_group() as pre_chromhmm:
                    # 1.1 Temporary directory receiving the binary files
                    input_dir = run.add_temporary_directory(
                        '%s_binary_files' % run_id)
                    pre_chromhmm.add_command(
                        [self.get_tool('mkdir'), input_dir])
                    # 1.2 Extract the archive into that directory
                    pre_chromhmm.add_command([
                        self.get_tool('tar'), '--extract', '--gzip',
                        '--verbose', '--directory', input_dir, '--file',
                        archive,
                    ])
                    # 1.3 Temporary directory receiving the model files
                    output_dir = run.add_temporary_directory(
                        '%s_chromhmm_model' % run_id)
                    pre_chromhmm.add_command(
                        [self.get_tool('mkdir'), output_dir])

                # 2. Assemble and run the ChromHMM LearnModel command
                with run.new_exec_group() as learnmodel:
                    chromhmm = [self.get_tool('ChromHMM'), 'LearnModel']
                    chromhmm += option_list
                    chromhmm += [
                        input_dir,
                        output_dir,
                        str(self.get_option('numstates')),
                        os.path.abspath(self.get_option('assembly')),
                    ]
                    learnmodel.add_command(chromhmm)

                # 3. Pack the output files of ChromHMM LearnModel
                with run.new_exec_group() as pack_model:
                    with pack_model.add_pipeline() as pack_model_pipe:
                        # 3.1 List content of the model directory ...
                        pack_model_pipe.add_command(
                            [self.get_tool('ls'), '-1', output_dir])
                        # 3.2 ... and hand the names via xargs to tar
                        # (circumventing a glob pattern)
                        model_archive = run.add_output_file(
                            'chromhmm_model',
                            '%s_model_files.tar.gz' % run_id, input_paths)
                        pack_model_pipe.add_command([
                            self.get_tool('xargs'), '--delimiter', '\n',
                            self.get_tool('tar'), '--create', '--directory',
                            output_dir, '--gzip', '--remove-files',
                            '--verbose', '--file', model_archive,
                        ])

                # 4. Remove the unpacked binary files
                with run.new_exec_group() as rm_binary_files:
                    with rm_binary_files.add_pipeline() as rm_binary_pipe:
                        # 4.1 List content of the input directory ...
                        rm_binary_pipe.add_command(
                            [self.get_tool('ls'), '-1', input_dir])
                        # 4.2 ... and hand the names via xargs to rm; '*' is
                        # the xargs replace string (-I), not a glob pattern
                        doomed = os.path.join(input_dir, '*')
                        rm_binary_pipe.add_command([
                            self.get_tool('xargs'), '--delimiter', '\n', '-I',
                            '*',
                            self.get_tool('rm'), '--verbose', doomed,
                        ])
Beispiel #27
0
    def runs(self, run_ids_connections_files):

        # Check if index is valid
        if not os.path.exists(self.get_option('index') + '.bwt'):
            raise StepError(
                self, "Could not find index: %s.*" % self.get_option('index'))

        # Compile the list of options
        options = [
            # [Algorithm options:]
            't',
            'k',
            'w',
            'd',
            'r',
            'y',
            'c',
            'D',
            'W',
            'm',
            'S',
            'P',
            'e',
            # [Scoring options:]
            'A',
            'B',
            'O',
            'E',
            'L',
            'U',
            'x',
            # [Input/output options:]
            'p',
            'R',
            'H',
            'j',
            'v',
            'T',
            'h',
            'a',
            'C',
            'V',
            'Y',
            'M'
        ]

        set_options = [
            option for option in options
            if self.is_option_set_in_config(option)
        ]

        option_list = list()
        for option in set_options:
            if isinstance(self.get_option(option), bool):
                if self.get_option(option):
                    option_list.append('-%s' % option)
            else:
                option_list.append('-%s' % option)
                option_list.append(str(self.get_option(option)))

        for run_id in run_ids_connections_files.keys():
            with self.declare_run(run_id) as run:
                # Get list of files for first/second read
                fr_input = run_ids_connections_files[run_id]['in/first_read']
                sr_input = run_ids_connections_files[run_id]['in/second_read']

                input_paths = [
                    y for x in [fr_input, sr_input] for y in x if y is not None
                ]

                # Do we have paired end data and is it exactly one ?
                is_paired_end = False if sr_input == [None] else True

                # Fail if we have don't have exactly one file or
                # an empty connection
                if len(fr_input) != 1 or fr_input == [None]:
                    raise StepError(
                        self, "Expected single input file for first read.")
                # Fail if we don't have exactly one file
                if is_paired_end and len(sr_input) != 1:
                    raise StepError(
                        self, "Expected single input file for second read.")
                input_paths = fr_input  # single element list
                if is_paired_end:
                    input_paths.extend(sr_input)

                # Check file endings for proper type
                for input_path in input_paths:
                    if len([
                            _ for _ in ['fastq', 'fq', 'fq.gz', 'fastq.gz']
                            if input_path.endswith(_)
                    ]) != 1:
                        raise StepError(
                            self, "%s possess unknown suffix. "
                            "(None of: fastq, fq, fq.gz, fastq.gz)")
                # BWA can handle only single files for first and second read
                # IMPORTANT: BWA handles gzipped as well as not gzipped files

                with run.new_exec_group() as exec_group:

                    def prepare_input(input_path, exec_group):
                        # Create temporary fifo
                        temp_fifo = run.add_temporary_file(
                            'in-fifo-%s' % os.path.basename(input_path))
                        mkfifo = [self.get_tool('mkfifo'), temp_fifo]
                        exec_group.add_command(mkfifo)
                        dd = [
                            self.get_tool('dd'),
                            'bs=%s' % self.get_option('dd-blocksize'),
                            'if=%s' % input_path,
                            'of=%s' % temp_fifo
                        ]
                        exec_group.add_command(dd)

                        return (exec_group, temp_fifo)

                    # Temporary fifos
                    temp_fr_fifo, temp_sr_fifo = (str, str)

                    exec_group, temp_fr_fifo = prepare_input(
                        fr_input[0], exec_group)
                    # And if we handle paired end data
                    if is_paired_end:
                        exec_group, temp_sr_fifo = prepare_input(
                            sr_input[0], exec_group)

                    # 3. Map reads using bwa mem
                    with exec_group.add_pipeline() as bwa_mem_pipe:
                        # Assemble bwa mem command
                        bwa_mem = [
                            self.get_tool('bwa'),
                            'mem',
                        ]
                        bwa_mem.extend(option_list)
                        bwa_mem.append(self.get_option('index'))
                        bwa_mem.append(temp_fr_fifo)

                        if is_paired_end:
                            bwa_mem.append(temp_sr_fifo)

                        bwa_mem_pipe.add_command(bwa_mem)
                        # Compress bwa mem output
                        pigz = [self.get_tool('pigz'), '--stdout']
                        bwa_mem_pipe.add_command(pigz)
                        # Write bwa mem output to file
                        dd = [
                            self.get_tool('dd'),
                            'obs=%s' % self.get_option('dd-blocksize'),
                            'of=%s' % run.add_output_file(
                                'alignments', '%s-bwa-mem.sam.gz' % run_id,
                                input_paths)
                        ]
                        bwa_mem_pipe.add_command(dd)
Beispiel #28
0
Datei: star.py Projekt: yigbt/uap
    def runs(self, run_ids_connections_files):
        """Declare one STAR run per run ID.

        For each run ID the STAR command line is assembled from the step
        options and the first file of each read connection, the output
        files are registered on the run, and the command is added to a new
        exec group with stdout/stderr redirected to log files.

        Args:
            run_ids_connections_files: mapping of run ID to a mapping of
                connection name to a list of file paths. 'in/first_read'
                is required; 'in/second_read' and 'in/genome_dir' are
                optional.

        Raises:
            StepError: if the 'genomeDir' option is not set in the config
                and the run has no 'in/genome_dir' connection.
        """
        self.set_cores(self.get_option('cores'))

        for run_id in run_ids_connections_files.keys():
            with self.declare_run(run_id) as run:
                connections = run_ids_connections_files[run_id]

                # Only the first file of each read connection is used.
                input_fileset = [connections['in/first_read'][0]]
                if 'in/second_read' in connections:
                    input_fileset.append(connections['in/second_read'][0])

                star = [self.get_tool('star')]

                # The genome directory comes from the config if set there,
                # otherwise from the 'in/genome_dir' connection.
                if self.is_option_set_in_config('genomeDir'):
                    genome_dir = os.path.abspath(
                        str(self.get_option('genomeDir')))
                elif 'in/genome_dir' in connections:
                    genome_dir = connections['in/genome_dir'][0]
                else:
                    raise StepError(
                        self, 'Required parameter "GenomDir" wasnt found!')

                star.extend(['--genomeDir', genome_dir])
                star.extend(['--outFileNamePrefix', './'])

                if self.is_option_set_in_config('readFilesCommand'):
                    star.extend([
                        '--readFilesCommand',
                        self.get_option('readFilesCommand')
                    ])

                # The flag used to be emitted only when 'cores' was set,
                # although its value is read from 'runThreadN'; a config
                # that set only 'runThreadN' was silently ignored. Either
                # option now triggers the flag, and a missing value is
                # skipped instead of being passed as the string 'None'.
                # NOTE(review): confirm whether 'runThreadN' should fall
                # back to the value of 'cores' when it is not set itself.
                if (self.is_option_set_in_config('runThreadN')
                        or self.is_option_set_in_config('cores')):
                    thread_count = self.get_option('runThreadN')
                    if thread_count is not None:
                        star.extend(['--runThreadN', str(thread_count)])

                star.append('--readFilesIn')
                star.extend(input_fileset)

                log_stderr = run.add_output_file(
                    "log_stderr", "%s-star-log_stderr.txt" % (run_id),
                    input_fileset)
                log_stdout = run.add_output_file(
                    "log_stdout", "%s-star-log_stdout.txt" % (run_id),
                    input_fileset)

                # Output files with fixed names. The connection tag
                # "log.progess" (sic) is kept unchanged since other code
                # may refer to it by that name.
                for tag, file_name in (
                        ("aligned", "Aligned.out.sam"),
                        ("log.final", "Log.final.out"),
                        ("log.out", "Log.out"),
                        ("log.progess", "Log.progress.out"),
                        ("sj.out", "SJ.out.tab")):
                    run.add_output_file(tag, file_name, input_fileset)

                star_eg = run.new_exec_group()
                star_eg.add_command(star,
                                    stdout_path=log_stdout,
                                    stderr_path=log_stderr)
Beispiel #29
0
    def runs(self, cc):
        """Declare, per run ID, the merge of each read's input files.

        For every read type that has an input connection, each input file
        is streamed into its own temporary fifo (gzipped inputs are
        decompressed with pigz on the way), then all fifos are
        concatenated, compressed with pigz and written to a single
        '<run_id>_R1.fastq.gz' / '<run_id>_R2.fastq.gz' output file.

        Args:
            cc: connections object providing ``keys()``,
                ``switch_run_id()``, ``exists_connection_for_run()`` and
                ``cc[run_id][connection]`` lookups that yield lists of
                input paths.

        Raises:
            StepError: if an input file ends in neither a gzip suffix
                ('.gz', '.gzip') nor a FASTQ suffix ('.fastq', '.fq').
        """
        suffix_by_read = {'first_read': '_R1', 'second_read': '_R2'}
        gzip_suffixes = ('.gz', '.gzip')
        fastq_suffixes = ('.fastq', '.fq')

        for run_id in cc.keys():
            cc.switch_run_id(run_id)
            with self.declare_run(run_id) as run:
                for read, read_suffix in suffix_by_read.items():
                    connection = f'in/{read}'
                    if not cc.exists_connection_for_run(connection):
                        continue
                    input_paths = cc[run_id][connection]

                    # A connection holding only None produces an empty
                    # output connection and nothing else.
                    if input_paths == [None]:
                        run.add_empty_output_connection(read)
                        continue

                    fifo_paths = []
                    exec_group = run.new_exec_group()
                    for input_path in input_paths:
                        # Step 1: one temporary fifo per input file.
                        fifo_path = run.add_temporary_file(
                            f"fifo-{os.path.basename(input_path)}")
                        fifo_paths.append(fifo_path)
                        exec_group.add_command(
                            [self.get_tool('mkfifo'), fifo_path])

                        extension = os.path.splitext(input_path)[1]

                        # Step 2: feed the file content into the fifo.
                        if extension in gzip_suffixes:
                            # Gzipped input: dd (read) | pigz -d | dd (write).
                            with exec_group.add_pipeline() as unzip_pipe:
                                read_chunks = [
                                    self.get_tool('dd'),
                                    f"ibs={self.get_option('dd-blocksize')}",
                                    f"if={input_path}",
                                ]
                                decompress = [
                                    self.get_tool('pigz'),
                                    '--processes',
                                    str(self.get_cores()),
                                    '--decompress',
                                    '--blocksize',
                                    self.get_option('pigz-blocksize'),
                                    '--stdout',
                                ]
                                write_chunks = [
                                    self.get_tool('dd'),
                                    f"obs={self.get_option('dd-blocksize')}",
                                    f"of={fifo_path}",
                                ]
                                for command in (read_chunks, decompress,
                                                write_chunks):
                                    unzip_pipe.add_command(command)
                        elif extension in fastq_suffixes:
                            # Plain FASTQ: a single dd copies it to the fifo.
                            copy_plain = [
                                self.get_tool('dd'),
                                f"bs={self.get_option('dd-blocksize')}",
                                f"if={input_path}",
                                f"of={fifo_path}",
                            ]
                            exec_group.add_command(copy_plain)
                        else:
                            raise StepError(
                                self, "File %s does not end with any "
                                "expected suffix (fastq.gz or "
                                "fastq). Please fix that issue." %
                                input_path)

                    # Step 3: cat all fifos | pigz | dd into the output.
                    with exec_group.add_pipeline() as merge_pipe:
                        merge_pipe.add_command(
                            [self.get_tool('cat'), *fifo_paths])

                        compress = [
                            self.get_tool('pigz'),
                            '--processes',
                            str(self.get_cores()),
                            '--blocksize',
                            self.get_option('pigz-blocksize'),
                            '--stdout',
                        ]
                        merge_pipe.add_command(compress)

                        merged_path = run.add_output_file(
                            read,
                            "%s%s.fastq.gz" % (run_id, read_suffix),
                            input_paths)
                        write_output = [
                            self.get_tool('dd'),
                            f"obs={self.get_option('dd-blocksize')}",
                            f"of={merged_path}",
                        ]
                        merge_pipe.add_command(write_output)
Beispiel #30
0
    def runs(self, run_ids_connections_files):
        """Declare one fastq-sample run per run ID.

        For every input file of each read connection the file is sampled
        with fastq-sample (gzipped inputs are first decompressed to a
        temporary file with pigz), the resulting 'sample.fastq' is
        compressed into the run's output file and then removed.

        Args:
            run_ids_connections_files: mapping of run ID to a mapping of
                connection name ('in/first_read', 'in/second_read') to a
                list of input paths.

        Raises:
            StepError: if both options 'n' and 'p' are set in the config.
        """
        isset_n = self.is_option_set_in_config('n')
        isset_p = self.is_option_set_in_config('p')

        if isset_n and isset_p:
            raise StepError(
                self, "Option n AND p are set in config.yaml. "
                "Only one is allowed.")

        config_options = self.get_options()

        # fastq-sample writes to '<outfile>.fastq'.
        outfile = "sample"

        # The option flags do not depend on the input file, so they are
        # built once. 'o' is skipped here because it is used as a run ID
        # prefix and the tool's own -o is set to `outfile` below.
        sample_flags = []
        for option, value in config_options.items():
            if option not in self.possible_options:
                continue
            if option == 'o' or value is None:
                continue
            sample_flags.extend(['-%s' % (option), str(value)])

        read_types = {'first_read': '_R1', 'second_read': '_R2'}
        for run_id in run_ids_connections_files.keys():
            # Prefix the run ID with option 'o' if it is set.
            new_run_id = run_id
            if self.is_option_set_in_config('o'):
                new_run_id = config_options['o'] + '_' + run_id

            with self.declare_run(new_run_id) as run:

                for read in read_types:
                    connection = 'in/%s' % read
                    input_paths = run_ids_connections_files[run_id].get(
                        connection)

                    if input_paths:
                        for input_path in input_paths:
                            # Plain files are sampled in place; gzipped
                            # ones are decompressed to a temp file first.
                            temp_file = input_path
                            file_ext = os.path.splitext(input_path)[1]

                            if file_ext in ('.gz', '.gzip'):
                                temp_file = run.add_temporary_file()
                                pigz_decompress_eg = run.new_exec_group()
                                pigz = [
                                    self.get_tool('pigz'), '--processes',
                                    str(self.get_cores()), '--decompress',
                                    '--keep', '--stdout', input_path
                                ]
                                pigz_decompress_eg.add_command(
                                    pigz, stdout_path=temp_file)

                            # 1. Run fastq-sample on the (plain) input
                            fastqsample_eg = run.new_exec_group()
                            fastqsample = [self.get_tool('fastq-sample')]
                            fastqsample.extend(sample_flags)
                            fastqsample.extend(['-o', outfile])
                            fastqsample.append(temp_file)
                            fastqsample_eg.add_command(fastqsample)

                            # 2. Compress the subsample into the output
                            filename_params = (new_run_id, read_types[read])
                            subsample_file = run.add_output_file(
                                "%s" % read, "%s%s.fastq.gz" % filename_params,
                                [input_path])

                            pigz_compress_eg = run.new_exec_group()
                            pigz_compress = [
                                self.get_tool('pigz'), '--processes',
                                str(self.get_cores()), '--best', '--stdout',
                                outfile + '.fastq'
                            ]
                            pigz_compress_eg.add_command(
                                pigz_compress, stdout_path=subsample_file)

                            # 3. Delete the uncompressed subsample
                            remove_eg = run.new_exec_group()
                            remove = [self.get_tool('rm'), outfile + '.fastq']
                            remove_eg.add_command(remove)