def runs(self, run_ids_connections_files):
    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_paths = run_ids_connections_files[run_id]['in/alignments']
            # check that only a single input file is provided
            if len(input_paths) != 1:
                raise StepError(
                    self,
                    "Expected exactly one alignments file, but got %s"
                    % input_paths)
            if self.is_option_set_in_config('tmp_dir'):
                if not os.path.isdir(self.get_option('tmp_dir')):
                    # dir not present
                    raise StepError(
                        self,
                        "Directory %s not found"
                        % self.get_option('tmp_dir'))
                if not os.access(self.get_option('tmp_dir'), os.W_OK):
                    # not writable
                    raise StepError(
                        self,
                        "Directory %s not accessible."
                        % self.get_option('tmp_dir'))

            alignments_path = input_paths[0]
            cat = [self.get_tool('cat'), alignments_path]
            pigz = [self.get_tool('pigz'),
                    '--decompress',
                    '--processes', str(self.get_cores()),
                    '--stdout']
            s2c = [self.get_tool('s2c'),
                   '-s', '/dev/stdin',
                   '-o', self.get_option('tmp_dir')]
            if self.is_option_set_in_config('maxDist'):
                s2c.extend(['-d', str(self.get_option('maxDist'))])
            # writes the resulting .sam to stdout
            fix_s2c = [self.get_tool('fix_s2c')]
            pigz2 = [self.get_tool('pigz'),
                     '--processes', str(self.get_cores()),
                     '--stdout']

            with run.new_exec_group() as exec_group:
                with exec_group.add_pipeline() as s2c_pipe:
                    s2c_pipe.add_command(cat)
                    s2c_pipe.add_command(pigz)
                    s2c_pipe.add_command(s2c)
                    s2c_pipe.add_command(fix_s2c)
                    s2c_pipe.add_command(
                        pigz2,
                        stdout_path=run.add_output_file(
                            'alignments',
                            '%s-cufflinks-compatible.sam.gz' % run_id,
                            input_paths))
def runs(self, run_ids_connections_files):
    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_paths = run_ids_connections_files[run_id][
                "in/alignments"]
            # Add empty out connections if we have an empty in connection
            if input_paths == [None]:
                run.add_empty_output_connection("alignments")
                run.add_empty_output_connection("indices")
            # Fail if we don't have exactly one input file
            elif len(input_paths) != 1:
                raise StepError(self,
                                "Expected exactly one alignments file.")
            # Fail if the input is not a BAM file
            elif os.path.splitext(input_paths[0])[1] not in ['.bam']:
                raise StepError(
                    self, "The file %s does not seem to be a BAM file. "
                    "At least the suffix is wrong." % input_paths[0])
            # Everything seems fine, let's start
            else:
                input_bam = input_paths[0]
                base = os.path.basename(input_bam)

                # First create a symlink to the original BAM, then index it
                with run.new_exec_group() as link_exgr:
                    # 1. command: Create symbolic link to original BAM file
                    #    (use absolute path)
                    ln = [self.get_tool('ln'), '-s', input_bam]
                    bam_link = run.add_output_file('alignments',
                                                   base,
                                                   input_paths)
                    ln.append(bam_link)
                    link_exgr.add_command(ln)
                with run.new_exec_group() as index_exgr:
                    # 2. command: Index the BAM file
                    samtools_index = [self.get_tool('samtools'), 'index']
                    if self.get_option('index_type') == 'bai':
                        samtools_index.append('-b')
                        run.add_output_file('indices',
                                            '%s.bai' % base,
                                            input_paths)
                    elif self.get_option('index_type') == 'csi':
                        samtools_index.append('-c')
                        run.add_output_file('indices',
                                            '%s.csi' % base,
                                            input_paths)
                    samtools_index.append(bam_link)
                    index_exgr.add_command(samtools_index)
                # Calculate samtools idxstats
                with run.new_exec_group() as idxstats_exgr:
                    samtools_idxstats = [self.get_tool('samtools'),
                                         'idxstats']
                    samtools_idxstats.append(bam_link)
                    idxstats_exgr.add_command(
                        samtools_idxstats,
                        stdout_path=run.add_output_file(
                            'index_stats',
                            '%s_idxstats.txt' % base,
                            input_paths))
def runs(self, run_ids_connections_files):
    # Check if 'chromosome-sizes' points to a real file
    if not os.path.isfile(self.get_option('chromosome-sizes')):
        raise StepError(
            self, "Value for option 'chromosome-sizes' is not a "
            "file: %s" % self.get_option('chromosome-sizes'))
    if self.get_option('temp-sort-dir') and \
       not os.path.isdir(self.get_option('temp-sort-dir')):
        raise StepError(
            self, "Value for option 'temp-sort-dir' is not a "
            "directory: %s" % self.get_option('temp-sort-dir'))

    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            # Collect input paths
            input_paths = run_ids_connections_files[run_id][
                "in/alignments"]
            # Handle special conditions, e.g. no input files
            if input_paths == [None]:
                run.add_empty_output_connection("alignments")
                continue
            # Complain if necessary
            elif len(input_paths) != 1:
                raise StepError(self,
                                "Expected exactly one alignments file.")

            root, ext = os.path.splitext(os.path.basename(input_paths[0]))
            # Complain if necessary
            if not ext == '.bam':
                raise StepError(
                    self, "The file %s does not appear to be a BAM "
                    "file." % input_paths[0])

            bedgraph_file = run.add_output_file('bedgraph',
                                                '%s.bg' % run_id,
                                                input_paths)
            bigwig_file = run.add_output_file('bigwig',
                                              '%s.bw' % run_id,
                                              input_paths)
            # Start creation of the BedGraph file
            with run.new_exec_group() as bedgraph_group:
                with bedgraph_group.add_pipeline() as pipe:
                    # BAM -> BedGraph
                    # (necessary for bedGraph, bigWig)
                    genomecov = [self.get_tool('bedtools'), 'genomecov']
                    genomecov.append('-bg')
                    genomecov.append('-ibam')
                    genomecov.extend(input_paths)
                    pipe.add_command(genomecov)
                    sort = [self.get_tool('sort'), '-k1,1', '-k2,2n']
                    pipe.add_command(sort, stdout_path=bedgraph_file)
            with run.new_exec_group() as bigwig_group:
                bedgraph_to_bigwig = [self.get_tool('bedGraphToBigWig'),
                                      bedgraph_file,
                                      self.get_option('chromosome-sizes'),
                                      bigwig_file]
                bigwig_group.add_command(bedgraph_to_bigwig)
def runs(self, run_ids_connections_files):
    run_id_sheme = self.get_option('name_sheme')
    prefix = self.get_option('prefix')
    if prefix:
        run_id_sheme = '%s_%%s_R1' % prefix
        logger.warning("[%s] The 'prefix' option is deprecated in favor "
                       "of the 'name_sheme' option. The set prefix '%s' "
                       "is converted to 'name_sheme: %s'" %
                       (self, prefix, run_id_sheme))
    try:
        _ = run_id_sheme % ''
    except TypeError as e:
        raise StepError(
            self, 'Could not parse name_sheme "%s": %s' %
            (run_id_sheme, e))
    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_paths = run_ids_connections_files[run_id]['in/fastx']
            if input_paths == [None]:
                run.add_empty_output_connection("fastx")
            elif len(input_paths) != 1:
                raise StepError(self, "Expected exactly one fastx file.")
            else:
                is_gzipped = os.path.splitext(input_paths[0])[1] \
                             in ['.gz', '.gzip']
                out = run.add_output_file(
                    "fastx",
                    run_id_sheme % run_id + '.fastq.gz',
                    input_paths)
                with run.new_exec_group() as exec_group:
                    with exec_group.add_pipeline() as pipe:
                        # 1.1 command: Uncompress file
                        if is_gzipped:
                            pigz = [self.get_tool('pigz'),
                                    '--decompress',
                                    '--processes', '1',
                                    '--stdout']
                            pigz.extend(input_paths)
                            pipe.add_command(pigz)
                        else:
                            cat = [self.get_tool('cat')]
                            cat.extend(input_paths)
                            pipe.add_command(cat)
                        # 1.2 command: Reverse-complement the reads
                        #     and gzip the output ('-z')
                        fastx_revcom = [
                            self.get_tool('fastx_reverse_complement'),
                            '-z']
                        pipe.add_command(fastx_revcom, stdout_path=out)
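# Illustration (not part of the step; values are made up): 'name_sheme'
# must contain exactly one '%s' placeholder that receives the run ID,
# which is exactly what the try/except above verifies.
#
#   scheme = 'myprefix_%s_R1'
#   scheme % 'sample1'        # -> 'myprefix_sample1_R1'
#   'no_placeholder' % ''     # raises TypeError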
def runs(self, run_ids_connections_files):
    options = ['max_width', 'bin_size', 'extrap', 'step', 'bootstraps',
               'cval', 'terms', 'quick']
    set_options = [option for option in options
                   if self.is_option_set_in_config(option)]
    option_list = list()
    for option in set_options:
        if isinstance(self.get_option(option), bool):
            if self.get_option(option):
                option_list.append('-%s' % option)
        else:
            option_list.append('-%s' % option)
            option_list.append(str(self.get_option(option)))

    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_paths = run_ids_connections_files[run_id][
                "in/alignments"]
            if input_paths == [None]:
                run.add_empty_output_connection("complexity_curve")
                run.add_empty_output_connection("future_yield")
            elif len(input_paths) != 1:
                raise StepError(self,
                                "Expected exactly one alignments file.")
            else:
                # determine the input type only after we know there is
                # a real input file
                ext = os.path.splitext(input_paths[0])[1]
                is_bam = ext == '.bam'
                is_bed = ext == '.bed'
                if not is_bam and not is_bed:
                    raise StepError(
                        self, "Input file %s is neither BAM nor BED." %
                        input_paths[0])
                with run.new_exec_group() as gc_group:
                    gc_extrap_out = run.add_output_file(
                        'future_genome_coverage',
                        '%s_future_genome_coverage.txt' % run_id,
                        input_paths)
                    gc_extrap = [self.get_tool('preseq'), 'gc_extrap']
                    gc_extrap.extend(option_list)
                    if is_bed:
                        gc_extrap.append('-bed')
                    gc_extrap.extend(['-o', gc_extrap_out,
                                      input_paths[0]])
                    gc_group.add_command(gc_extrap)
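# The bool/value option handling above recurs in many steps of this
# module. A possible shared helper (a sketch only; 'build_option_list'
# is a hypothetical name, not part of uap's API) could factor it out:
def build_option_list(step, options, prefix='-'):
    """Boolean options contribute a bare flag when True; all other set
    options contribute a flag/value pair. `step` is assumed to offer
    the usual is_option_set_in_config()/get_option() accessors."""
    option_list = []
    for option in options:
        if not step.is_option_set_in_config(option):
            continue
        value = step.get_option(option)
        if isinstance(value, bool):
            if value:
                option_list.append('%s%s' % (prefix, option))
        else:
            option_list.append('%s%s' % (prefix, option))
            option_list.append(str(value))
    return option_list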
def declare_runs(self):
    regex = re.compile(self.get_option('group'))

    found_files = dict()

    # find files
    for path in glob.glob(os.path.abspath(self.get_option('pattern'))):
        match = regex.match(os.path.basename(path))
        if match is None:
            raise StepError(
                self, "Couldn't match regex /%s/ to file %s." %
                (self.get_option('group'), os.path.basename(path)))

        sample_id_parts = []
        if self.is_option_set_in_config('sample_id_prefix'):
            sample_id_parts.append(self.get_option('sample_id_prefix'))
        sample_id_parts += list(match.groups())
        sample_id = '_'.join(sample_id_parts)
        if sample_id not in found_files:
            found_files[sample_id] = list()
        found_files[sample_id].append(path)

    # declare a run for every sample
    for run_id, paths in found_files.items():
        with self.declare_run(run_id) as run:
            run.add_public_info("paired_end",
                                self.get_option("paired_end"))
            for path in paths:
                run.add_output_file("raws", path, [])
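# Illustration (made-up option values): with
#   pattern: '/data/*.fastq.gz'
#   group:   '(Sample_\d+)_R[12]\.fastq\.gz'
# the file 'Sample_01_R1.fastq.gz' yields match.groups() ==
# ('Sample_01',), so both the R1 and R2 files of that sample are
# collected under the run ID 'Sample_01' (prefixed with
# 'sample_id_prefix' if that option is set).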
def runs(self, run_ids_connections_files):
    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_paths = run_ids_connections_files[run_id][
                "in/alignments"]
            if input_paths == [None]:
                run.add_empty_output_connection("alignments")
            elif len(input_paths) != 1:
                raise StepError(self,
                                "Expected exactly one alignments file.")
            else:
                is_gzipped = os.path.splitext(input_paths[0])[1] \
                             in ['.gz', '.gzip']
                with run.new_exec_group() as exec_group:
                    with exec_group.add_pipeline() as pipe:
                        # 1. command: Read file in 2MB chunks
                        dd_in = [self.get_tool('dd'),
                                 'ibs=2M',
                                 'if=%s' % input_paths[0]]
                        pipe.add_command(dd_in)
                        # 1.1 command: Uncompress file
                        if is_gzipped:
                            pigz = [self.get_tool('pigz'),
                                    '--decompress',
                                    '--processes',
                                    str(self.get_cores()),
                                    '--stdout']
                            pipe.add_command(pigz)
                        # 1.2 command: Call samtools so .bam files are
                        #     handled as well
                        samtools_view = [self.get_tool('samtools'),
                                         'view', '-h', '-']
                        pipe.add_command(samtools_view)
                        # 2. command: Process the SAM file
                        # create the names of the out connections
                        logfile = run.add_output_file(
                            'log',
                            '%s.discarded.sam' % run_id,
                            input_paths)
                        statsfile = run.add_output_file(
                            'stats',
                            '%s.statistics.txt' % run_id,
                            input_paths)
                        outfile = run.add_output_file(
                            'alignments',
                            '%s.reduced.sam' % run_id,
                            input_paths)
                        # construct the command
                        discard_cmd = [
                            self.get_tool('discardLargeSplitsAndPairs'),
                            '--N_splits',
                            str(self.get_option('N_splits')),
                            '--M_mates',
                            str(self.get_option('M_mates')),
                            '--statsfile', statsfile,
                            '--logfile', logfile,
                            '-', outfile]
                        # execute the command
                        pipe.add_command(discard_cmd)
def runs(self, run_ids_connections_files):
    for run_id in run_ids_connections_files.keys():
        # Get input alignments
        input_paths = run_ids_connections_files[run_id]['in/alignments']
        with self.declare_run(run_id) as run:
            # the 'run' object only exists inside declare_run(), so the
            # input checks have to happen here
            if input_paths == [None]:
                run.add_empty_output_connection("stats")
            elif len(input_paths) != 1:
                raise StepError(self,
                                "Expected exactly one alignments file.")
            else:
                for input_path in input_paths:
                    basename = os.path.splitext(
                        os.path.basename(input_path))[0]
                    with run.new_exec_group().add_pipeline() as pipe:
                        # Read input alignments
                        dd = [self.get_tool('dd'),
                              'ibs=%s' % self.get_option('dd-blocksize'),
                              'if=%s' % input_path]
                        pipe.add_command(dd)
                        # Assemble samtools stats command
                        samtools = [self.get_tool('samtools'), 'stats']
                        pipe.add_command(
                            samtools,
                            stdout_path=run.add_output_file(
                                'stats',
                                basename + '.bam.stats',
                                [input_path]))
def runs(self, run_ids_connections_files): options = ['step', 'verbose', 'pe', 'hist', 'vals', 'seg_len'] set_options = [ option for option in options if self.is_option_set_in_config(option) ] option_list = list() for option in set_options: if isinstance(self.get_option(option), bool): if self.get_option(option): option_list.append('-%s' % option) else: option_list.append('-%s' % option) option_list.append(str(self.get_option(option))) for run_id in run_ids_connections_files.keys(): with self.declare_run(run_id) as run: input_paths = run_ids_connections_files[run_id][ "in/alignments"] is_bam = True if os.path.splitext(input_paths[0])[1]\ in ['.bam'] else False is_bed = True if os.path.splitext(input_paths[0])[1]\ in ['.bed'] else False if input_paths == [None]: run.add_empty_output_connection("complexity_curve") elif len(input_paths) != 1: raise StepError(self, "Expected exactly one alignments file.") elif not is_bam and not is_bed: raise StepError( self, "Input file %s is niether BAM nor BED." % input_paths[0]) else: with run.new_exec_group() as cc_group: c_curve_out = run.add_output_file( 'complexity_curve', '%s_complexity_output.txt' % run_id, input_paths) c_curve = [self.get_tool('preseq'), 'c_curve'] c_curve.extend(option_list) if is_bam: c_curve.append('-bam') c_curve.extend(['-o', c_curve_out, input_paths[0]]) cc_group.add_command(c_curve)
def runs(self, run_ids_connections_files):
    # found_files holds the run IDs and their related files
    found_files = dict()

    if self.is_option_set_in_config('group') and \
       self.is_option_set_in_config('pattern'):
        regex = re.compile(self.get_option('group'))
        # find files matching the 'group' regex in all files matching
        # 'pattern'
        for path in glob.glob(
                os.path.abspath(self.get_option('pattern'))):
            match = regex.match(os.path.basename(path))
            if match is None:
                raise StepError(
                    self, "Couldn't match regex /%s/ to file %s." %
                    (self.get_option('group'), os.path.basename(path)))

            sample_id_parts = []
            if self.is_option_set_in_config('sample_id_prefix'):
                sample_id_parts.append(
                    self.get_option('sample_id_prefix'))
            sample_id_parts += list(match.groups())
            sample_id = '_'.join(sample_id_parts)
            if sample_id not in found_files:
                found_files[sample_id] = list()
            found_files[sample_id].append(path)
    elif self.is_option_set_in_config('sample_to_files_map'):
        for run_id, paths in self.get_option(
                'sample_to_files_map').items():
            for path in paths:
                if not os.path.isfile(path):
                    raise StepError(
                        self, "[raw_file_source]: %s is not a file. "
                        "Please provide a correct path." % path)
            found_files[run_id] = paths
    else:
        raise StepError(
            self, "[raw_file_source]: Either 'group' AND 'pattern' "
            "OR 'sample_to_files_map' options have to be set.")

    # declare a run for every sample
    for run_id, paths in found_files.items():
        with self.declare_run(run_id) as run:
            for path in paths:
                run.add_output_file("raw", path, [])
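# Illustration (made-up paths): the 'sample_to_files_map' alternative
# expects a mapping of run IDs to file lists in the config, roughly
# equivalent to this Python dict:
#
#   {'sample1': ['/data/sample1_R1.fastq.gz',
#                '/data/sample1_R2.fastq.gz']}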
def runs(self, run_ids_connections_files):
    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_paths = run_ids_connections_files[run_id][
                "in/alignments"]
            if input_paths == [None]:
                run.add_empty_output_connection("alignments")
            elif len(input_paths) != 1:
                raise StepError(self,
                                "Expected exactly one alignments file.")
            else:
                is_gzipped = os.path.splitext(input_paths[0])[1] \
                             in ['.gz', '.gzip']
                out = run.add_output_file(
                    "report_rRNA",
                    "%s_%s-rRNA_count.txt" % (run_id, 'R1'),
                    input_paths)
                samtools = [self.get_tool('samtools'), 'view', '-S']
                with run.new_exec_group() as exec_group:
                    with exec_group.add_pipeline() as pipe:
                        # 1. command: Uncompress the input file
                        if is_gzipped:
                            pigz = [self.get_tool('pigz'),
                                    '--decompress',
                                    '--processes', '1',
                                    '--stdout']
                            pigz.extend(input_paths)
                            pipe.add_command(pigz)
                            # 2. command: Convert to SAM, reading
                            #    from stdin
                            samtools.append('-')
                        else:
                            samtools.extend(input_paths)
                        pipe.add_command(samtools)
                        # 3. Count reads per reference
                        cuta = [self.get_tool('cut'), '-f', '2,3,4']
                        pipe.add_command(cuta)
                        cutb = [self.get_tool('cut'),
                                '-f', '1', '-d', '|']
                        pipe.add_command(cutb)
                        grep = [self.get_tool('grep'), '-v', '*']
                        pipe.add_command(grep)
                        cutc = [self.get_tool('cut'),
                                '-f', '1', '-d', '_']
                        pipe.add_command(cutc)
                        sorta = [self.get_tool('sort')]
                        pipe.add_command(sorta)
                        uniq = [self.get_tool('uniq'), '-c']
                        pipe.add_command(uniq)
                        sortb = [self.get_tool('sort')]
                        pipe.add_command(sortb, stdout_path=out)
def runs(self, run_ids_connections_files):
    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_paths = run_ids_connections_files[run_id][
                "in/alignments"]
            if input_paths == [None]:
                run.add_empty_output_connection("alignments")
            elif len(input_paths) != 1:
                raise StepError(self,
                                "Expected exactly one alignments file.")
            else:
                is_gzipped = os.path.splitext(input_paths[0])[1] \
                             in ['.gz', '.gzip']
                out = run.add_output_file(
                    "first_read",
                    "%s_%s-samto.fastq.gz" % (run_id, 'R1'),
                    input_paths)
                with run.new_exec_group() as exec_group:
                    with exec_group.add_pipeline() as pipe:
                        # 1. command: Uncompress the input file
                        if is_gzipped:
                            pigz = [self.get_tool('pigz'),
                                    '--decompress',
                                    '--processes', '1',
                                    '--stdout']
                            pigz.extend(input_paths)
                            pipe.add_command(pigz)
                        # 2. command: Convert to fastq, reading either
                        #    from stdin (gzipped case) or directly from
                        #    the input file
                        samtools = [self.get_tool('samtools'), 'fastq']
                        if self.is_option_set_in_config('f'):
                            samtools.extend(
                                ['-f', str(self.get_option('f'))])
                        if self.is_option_set_in_config('F'):
                            samtools.extend(
                                ['-F', str(self.get_option('F'))])
                        if self.is_option_set_in_config('addF'):
                            # 'addF' adds a second exclude filter (-F)
                            samtools.extend(
                                ['-F', str(self.get_option('addF'))])
                        if is_gzipped:
                            samtools.append('-')
                        else:
                            samtools.extend(input_paths)
                        pipe.add_command(samtools)
                        # 3. command: Compress the fastq output
                        pigzc = [self.get_tool('pigz'),
                                 '--processes', '2',
                                 '--fast', '-']
                        pipe.add_command(pigzc, stdout_path=out)
def runs(self, run_ids_connections_files):
    for run_id in run_ids_connections_files.keys():
        # Get the basename
        index_basename = "%s-%s" % (self.get_option('index-basename'),
                                    run_id)
        with self.declare_run(index_basename) as run:
            with run.new_exec_group() as exec_group:
                refseq = run_ids_connections_files[run_id][
                    'in/reference_sequence']
                if refseq == [None]:
                    raise StepError(self,
                                    "No reference sequence received.")
                if len(refseq) != 1:
                    raise StepError(
                        self,
                        "Reference sequence is not a single file.")
                bwa_index = [self.get_tool('bwa'), 'index']
                # Add index_basename
                bwa_index.extend(['-p', index_basename])
                # Add reference sequence (a single file)
                bwa_index.append(refseq[0])
                exec_group.add_command(bwa_index)
                run.add_output_file(
                    'bwa_index', '%s.amb' % index_basename, refseq)
                run.add_output_file(
                    'bwa_index', '%s.ann' % index_basename, refseq)
                run.add_output_file(
                    'bwa_index', '%s.bwt' % index_basename, refseq)
                run.add_output_file(
                    'bwa_index', '%s.pac' % index_basename, refseq)
                run.add_output_file(
                    'bwa_index', '%s.sa' % index_basename, refseq)
def _getFastFormat(self, fast_file, is_gzipped):
    required_file_extensions = ['.fastq', '.fq', '.fnq',
                                '.fasta', '.fa', '.fna']
    example_file = os.path.basename(fast_file)
    # for gzipped files the fastq/fasta suffix is the second to last
    format_index = -2 if is_gzipped else -1
    fast_format = '.' + example_file.split('.')[format_index]
    if fast_format not in required_file_extensions:
        raise StepError(
            self, "File %s does not end with any "
            "expected suffix (%s). Please fix that issue." %
            (fast_file, ' | '.join(required_file_extensions)))
    # 'q' for fastq-style suffixes, 'a' for fasta-style suffixes
    fast_char = fast_format[-1]
    return fast_char
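# Illustration (made-up file names) of _getFastFormat():
#   self._getFastFormat('reads.fastq.gz', is_gzipped=True)   # -> 'q'
#   self._getFastFormat('genome.fa', is_gzipped=False)       # -> 'a'
# The returned character distinguishes fastq ('q') from fasta ('a')
# input, e.g. for tools whose flags follow a -q/-a convention.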
def runs(self, run_ids_connections_files):
    subcommand = self.get_option('subcommand')

    for run_id in run_ids_connections_files.keys():
        # Collect input paths and labels for multiBamSummary
        input_paths = run_ids_connections_files[run_id]['in/alignments']
        labels = list()
        for i, f in enumerate(input_paths):
            if not f.endswith(".bam"):
                raise StepError(self, "Not a BAM file: %s" % f)
            if len(input_paths) > 1:
                labels.append("%s-%s" % (run_id, i))
            else:
                labels.append(run_id)

        with self.declare_run(run_id) as run:
            # Let's compile the command
            with run.new_exec_group() as multi_bam_summary_eg:
                # 1. multiBamSummary command
                multi_bam_summary = [
                    self.get_tool('multiBamSummary'), subcommand]
                # Append list of input BAM files
                multi_bam_summary.append('--bamfiles')
                multi_bam_summary.extend(input_paths)
                # Append name of the output file
                multi_bam_summary.append('--outFileName')
                multi_bam_summary.append(
                    run.add_output_file('read-coverage',
                                        '%s.npz' % run_id,
                                        input_paths))
                # Append list of BED files for the BED-file subcommand
                if subcommand == "BED-file":
                    multi_bam_summary.append('--BED')
                    multi_bam_summary.extend(
                        self.get_option('bed-file'))
                # Append list of labels
                multi_bam_summary.append('--labels')
                multi_bam_summary.extend(labels)
                # Append number of processors
                multi_bam_summary.extend(
                    ['--numberOfProcessors', str(self.get_cores())])
                # Add multiBamSummary to execution group
                multi_bam_summary_eg.add_command(multi_bam_summary)
def runs(self, run_ids_connections_files):
    self.set_cores(self.get_option('cores'))
    annotation = None
    # look for an annotation file passed via the in/annotation connection
    for run_id in run_ids_connections_files.keys():
        if 'in/annotation' in run_ids_connections_files[run_id]:
            annotation = run_ids_connections_files[run_id][
                'in/annotation'][0]
    for run_id in run_ids_connections_files.keys():
        if 'in/annotation' in run_ids_connections_files[run_id]:
            continue
        with self.declare_run(run_id) as run:
            counts = run_ids_connections_files[run_id]['in/counts'][0]
            tool_name = self.get_option('t')
            file_name = '%s-gene-abundance.tsv' % run_id
            run.add_output_file('counts', file_name, [counts])
            cmd = [self.get_tool('tcount2gcount')]
            if self.is_option_set_in_config('m'):
                cmd.extend(['-m',
                            os.path.abspath(self.get_option('m'))])
            elif annotation:
                cmd.extend(['-m', os.path.abspath(annotation)])
            else:
                raise StepError(
                    self, "%s: no annotation given via config or "
                    "connection" % run_id)
            if self.is_option_set_in_config('kallisto-extended'):
                cmd.append('--kallisto-extended')
            cmd.extend(['-i', counts, '-t', tool_name, '-o', file_name])
            convert_exec_group = run.new_exec_group()
            convert_exec_group.add_command(cmd)
def runs(self, cc):
    options = {'new_output_format': '-N', 'quality': '-Q'}
    option_list = list()
    for option in [o for o in options.keys()
                   if self.is_option_set_in_config(o)]:
        value = self.get_option(option)
        if isinstance(value, bool):
            # boolean options only contribute their flag when True
            if value:
                option_list.append(options[option])
        else:
            option_list.append(options[option])
            option_list.append(str(value))

    read_types = {'first_read': '_R1', 'second_read': '_R2'}
    for run_id in cc.keys():
        cc.switch_run_id(run_id)
        with self.declare_run(run_id) as run:
            for read in read_types:
                if not cc.exists_connection_for_run(f"in/{read}"):
                    continue
                connection = 'in/%s' % read
                input_paths = cc[run_id][connection]

                # Check for empty connections
                if input_paths == [None]:
                    run.add_empty_output_connection(
                        "%s_quality_stats" % read)
                else:
                    temp_fifos = list()
                    exec_group = run.new_exec_group()
                    for input_path in input_paths:
                        # 1. Create a temporary fifo per input file
                        temp_fifo = run.add_temporary_file(
                            "fifo-%s" % os.path.basename(input_path))
                        temp_fifos.append(temp_fifo)
                        mkfifo = [self.get_tool('mkfifo'), temp_fifo]
                        exec_group.add_command(mkfifo)
                        # 2. Output files to fifo
                        if input_path.endswith('fastq.gz'):
                            with exec_group.add_pipeline() as unzip_pipe:
                                # 2.1 command: Read file in
                                #     'dd-blocksize' chunks
                                dd_in = [
                                    self.get_tool('dd'),
                                    'ibs=%s' %
                                    self.get_option('dd-blocksize'),
                                    'if=%s' % input_path]
                                # 2.2 command: Uncompress file to fifo
                                pigz = [
                                    self.get_tool('pigz'),
                                    '--decompress',
                                    '--processes',
                                    str(self.get_cores()),
                                    '--blocksize',
                                    self.get_option('pigz-blocksize'),
                                    '--stdout']
                                # 2.3 command: Write file in
                                #     'dd-blocksize' chunks to fifo
                                dd_out = [
                                    self.get_tool('dd'),
                                    'obs=%s' %
                                    self.get_option('dd-blocksize'),
                                    'of=%s' % temp_fifo]
                                unzip_pipe.add_command(dd_in)
                                unzip_pipe.add_command(pigz)
                                unzip_pipe.add_command(dd_out)
                        elif input_path.endswith('fastq'):
                            # 2.1 command: Read file in 'dd-blocksize'
                            #     chunks and write to fifo
                            dd_in = [
                                self.get_tool('dd'),
                                'bs=%s' %
                                self.get_option('dd-blocksize'),
                                'if=%s' % input_path,
                                'of=%s' % temp_fifo]
                            exec_group.add_command(dd_in)
                        else:
                            raise StepError(
                                self, "File %s does not end with any "
                                "expected suffix (fastq.gz or fastq). "
                                "Please fix that issue." % input_path)
                    # 3. Read data from fifos and compute quality stats
                    with exec_group.add_pipeline() as fastx_pipe:
                        # 3.1 command: Read from ALL fifos
                        cat = [self.get_tool('cat')]
                        cat.extend(temp_fifos)
                        # 3.2 command: Compute quality statistics
                        fastx_qs_file = run.add_output_file(
                            "%s_quality_stats" % read,
                            "%s%s.fastq.quality.tsv" %
                            (run_id, read_types[read]),
                            input_paths)
                        fastx_qs = [
                            self.get_tool('fastx_quality_stats')]
                        fastx_qs.extend(option_list)
                        fastx_pipe.add_command(cat)
                        fastx_pipe.add_command(
                            fastx_qs, stdout_path=fastx_qs_file)
def runs(self, cc):
    flags = ['q', 'qseq', 'skip', 'f', 'c', 'ignore-quals', 'nofw',
             'dta', 'norc', 'no-mixed', 'no-discordant', 'quiet',
             'qc-filter', 'non-deterministic', 'no-temp-splicesite',
             'no-softclip', 'no-spliced-alignment', 'tmo', 'no-head',
             'no-sq', 'omit-sec-seq', 'remove-chrname', 'add-chrname',
             'new-summary']

    strflags = ['n-ceil', 'ma', 'mp', 'sp', 'np', 'rdg', 'score-min',
                'k', 'skip', 'rfg', 'rg', 'pen-cansplice',
                'pen-noncansplice', 'pen-canintronlen',
                'pen-noncanintronlen', 'min-intronlen', 'max-intronlen',
                'known-splicesite-infile', 'minins', 'maxins', 'seed',
                'trim5', 'trim3', 'novel-splicesite-outfile',
                'novel-splicesite-infile']

    self.set_cores(self.get_option('cores'))

    # Check if option values are valid
    if not os.path.exists(self.get_option('index') + '.1.ht2'):
        raise StepError(self, "Could not find index file: %s.*" %
                        self.get_option('index'))

    paired_end = cc.connection_exists('in/second_read')
    if not cc.all_runs_have_connection('in/first_read'):
        read_name = '' if paired_end else ' first'
        run_ids = list(cc.get_runs_without_any('in/first_read'))
        if len(run_ids) > 5:
            run_ids = run_ids[0:5] + ['...']
        raise StepError(self, 'No%s read passed by runs %s.' %
                        (read_name, list(run_ids)))
    if paired_end and not cc.all_runs_have_connection('in/second_read'):
        run_ids = list(cc.get_runs_without_any('in/second_read'))
        if len(run_ids) > 5:
            run_ids = run_ids[0:5] + ['...']
        raise StepError(self, 'No second read passed by runs %s.' %
                        run_ids)

    res = [(self.get_option('fr'), 'fr'),
           (self.get_option('rf'), 'rf'),
           (self.get_option('ff'), 'ff')]
    library_types = [flag for is_set, flag in res if is_set]
    if len(library_types) > 1:
        raise StepError(
            self, "Too many strandedness flags fr, rf, ff: %s" % res)
    library_type = self.get_option('library_type')
    if paired_end and library_types and library_type:
        raise StepError(
            self, 'Option "library_type: %s" and the flag %s are set. '
            'Please specify only one.' %
            (library_type, library_types[0]))
    elif paired_end and library_types:
        library_type = library_types[0]
    elif not paired_end and library_types:
        raise StepError(self, 'Library type %s is specified for single '
                        'end reads.' % library_types[0])
    elif not paired_end and library_type:
        raise StepError(self, 'Library type %s is specified for single '
                        'end reads.' % library_type)

    for run_id in cc.keys():
        with self.declare_run(run_id) as run:
            # Get list of files for first/second read
            fr_input = cc[run_id]['in/first_read'][0]
            input_paths = [fr_input]
            if paired_end:
                sr_input = cc[run_id]['in/second_read'][0]
                input_paths.append(sr_input)

            with run.new_exec_group() as exec_group:
                with exec_group.add_pipeline() as hisat2_pipe:
                    # Assemble hisat2 command
                    hisat2 = [self.get_tool('hisat2')]
                    for flag in flags:
                        if self.get_option(flag) is True:
                            if flag in ['q', 'f', 'r', 'c']:
                                hisat2.extend(['-' + flag])
                            else:
                                hisat2.extend(['--' + flag])

                    if paired_end and library_type:
                        hisat2.append('--%s' % library_type)

                    for flag in strflags:
                        if self.is_option_set_in_config(flag):
                            hisat2.extend(['--' + flag,
                                           str(self.get_option(flag))])

                    # Leave 2 cores available for pigz compressing the
                    # output.
                    hisat2.extend(['-p',
                                   str(self.get_option('cores') - 2),
                                   '-x',
                                   os.path.abspath(
                                       self.get_option('index'))])

                    if paired_end:
                        if self.get_option('rna-strandness') == 'F':
                            hisat2.extend(['--rna-strandness', 'FR'])
                        elif self.get_option('rna-strandness') == 'R':
                            hisat2.extend(['--rna-strandness', 'RF'])
                        hisat2.extend(['-1', fr_input,
                                       '-2', sr_input])
                    else:
                        hisat2.extend(['-U', fr_input])
                        if self.get_option('rna-strandness') != 'U':
                            hisat2.extend([
                                '--rna-strandness',
                                self.get_option('rna-strandness')])

                    log_stderr = run.add_output_file(
                        'log_stderr',
                        '%s-hisat2-log_stderr.txt' % run_id,
                        input_paths)
                    summary = run.add_output_file(
                        'summary',
                        '%s-hisat2-summary.txt' % run_id,
                        input_paths)
                    hisat2.extend(['--summary-file', summary])
                    metrics = run.add_output_file(
                        'metrics',
                        '%s-hisat2-metrics.txt' % run_id,
                        input_paths)
                    hisat2.extend(['--met-file', metrics])

                    if self.get_option('un-gz') is True:
                        unaligned = run.add_output_file(
                            'unaligned',
                            '%s-hisat2-unaligned.fastq.gz' % run_id,
                            input_paths)
                        hisat2.extend(['--un-gz', unaligned])
                    if self.get_option('al-gz') is True:
                        aligned = run.add_output_file(
                            'aligned',
                            '%s-hisat2-aligned.fastq.gz' % run_id,
                            input_paths)
                        hisat2.extend(['--al-gz', aligned])

                    hisat2_pipe.add_command(hisat2,
                                            stderr_path=log_stderr)
                    res = run.add_output_file(
                        'alignments',
                        '%s-hisat2-results.sam.gz' % run_id,
                        input_paths)
                    # Compress hisat2 output
                    pigz = [self.get_tool('pigz'), '--stdout']
                    hisat2_pipe.add_command(pigz, stdout_path=res)
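# Illustration (made-up values) of how the fr/rf/ff booleans above are
# collapsed into at most one library type:
#   res = [(True, 'fr'), (False, 'rf'), (False, 'ff')]
#   [flag for is_set, flag in res if is_set]   # -> ['fr']
# Two or more True values trigger the "too many strandedness flags"
# error.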
def runs(self, cc):
    self.set_cores(self.get_option('cores'))

    if self.get_option('config') and self.get_option('databases'):
        raise StepError(self,
                        "A config file and databases are specified.")
    if not self.get_option('config') and \
       not self.get_option('databases'):
        raise StepError(self,
                        "No config file or databases are specified.")
    if self.get_option('config'):
        logger.warning('[%s] Using a config file is deprecated. '
                       'Please specify databases instead.' %
                       self.get_step_name())
        config_file = os.path.abspath(self.get_option('config'))
    else:
        conf_data = ['BOWTIE2 %s' % self.get_tool('bowtie2'),
                     'THREADS %d' % self.get_cores()]
        for db in sorted(self.get_option('databases').items()):
            conf_data.append('DATABASE %s %s BOWTIE2' % db)

    for run_id in cc.keys():
        run = self.declare_run(run_id)
        if not self.get_option('config'):
            if self.get_option('keep config'):
                config_file = run.add_output_file('fastq_screen.conf',
                                                  'fastq_screen.conf',
                                                  [])
            else:
                config_file = run.add_temporary_file(
                    'fastq_screen.conf')
            write_conf = [self.get_tool('printf'),
                          '\n'.join(conf_data)]
            execg = run.new_exec_group()
            execg.add_command(write_conf, stdout_path=config_file)
        for input_path in cc[run_id]['in/first_read']:
            # strip the '.fastq.gz' suffix
            # (str.rstrip would strip characters, not the suffix)
            file_name = os.path.basename(input_path)
            if file_name.endswith('.fastq.gz'):
                file_name = file_name[:-len('.fastq.gz')]

            # prepare output files
            file_pattern = "%s_screen.txt" % file_name
            run.add_output_file("fqc_report", file_pattern,
                                [input_path])

            file_pattern = "%s_screen.png" % file_name
            run.add_output_file("fqc_image", file_pattern,
                                [input_path])

            file_pattern = "%s_screen.html" % file_name
            run.add_output_file("fqc_html", file_pattern,
                                [input_path])

            file_pattern = "%s-fastqscreen-log_stdout.txt" % file_name
            log_stdout = run.add_output_file("log_stdout",
                                             file_pattern,
                                             [input_path])

            file_pattern = "%s-fastqscreen-log_stderr.txt" % file_name
            log_stderr = run.add_output_file("log_stderr",
                                             file_pattern,
                                             [input_path])

            # build fastq_screen command
            fastq_screen_exec_group = run.new_exec_group()
            fastq_screen = [self.get_tool('fastq_screen'),
                            '-conf', config_file]
            if self.get_option('subset'):
                fastq_screen.extend(['--subset',
                                     str(self.get_option('subset'))])
            if self.get_option('nohits'):
                file_pattern = "%s.tagged.fastq.gz" % file_name
                run.add_output_file("tagged", file_pattern,
                                    [input_path])

                file_pattern = "%s.tagged_filter.fastq.gz" % file_name
                run.add_output_file("tagged_filter", file_pattern,
                                    [input_path])
                fastq_screen.extend(['--nohits'])
            fastq_screen.extend(['--outdir', '.', input_path])
            fastq_screen_exec_group.add_command(
                fastq_screen,
                stdout_path=log_stdout,
                stderr_path=log_stderr)
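# Why the earlier rstrip(".fastq.gz") had to go (illustration, made-up
# name): str.rstrip strips a trailing *character set*, not a literal
# suffix, so file names ending in those characters get mangled:
#   'sample_az.fastq.gz'.rstrip('.fastq.gz')   # -> 'sample_'
# The endswith()/slicing approach above removes exactly the suffix.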
def runs(self, run_ids_connections_files):
    # Compile the list of options
    # List of options common to the bin and BED-file subcommands
    options = ['outFileFormat', 'scaleFactorsMethod', 'sampleLength',
               'numberOfSamples', 'scaleFactors', 'ratio',
               'pseudocount', 'binSize', 'region', 'blackListFileName',
               'normalizeTo1x', 'normalizeUsingRPKM',
               'ignoreForNormalization', 'skipNonCoveredRegions',
               'smoothLength', 'extendReads', 'ignoreDuplicates',
               'minMappingQuality', 'centerReads', 'samFlagInclude',
               'samFlagExclude', 'minFragmentLength',
               'maxFragmentLength']
    set_options = [option for option in options
                   if self.is_option_set_in_config(option)]
    option_list = list()
    for option in set_options:
        if isinstance(self.get_option(option), bool):
            if self.get_option(option):
                option_list.append('--%s' % option)
        else:
            option_list.append('--%s' % option)
            option_list.append(str(self.get_option(option)))

    # List of sample lists = losl
    losl = self.get_option('samples')
    # Test the user input and connection data for validity
    for samples in losl:
        if len(samples) != 2:
            raise StepError(
                self, "Expected exactly two samples. Received %s (%s)" %
                (len(samples), ", ".join(samples)))
        input_paths = list()
        for sample in samples:
            try:
                files = run_ids_connections_files[sample][
                    'in/alignments']
            except KeyError:
                raise StepError(
                    self, 'No files found for sample %s and connection '
                    '"in/alignments". Please check your configuration.'
                    % sample)
            if not len(files) == 1 or not files[0].endswith('.bam'):
                raise StepError(
                    self, "Expected exactly one BAM file, got %s" %
                    ", ".join(files))
            # Add found BAM file to input paths
            input_paths.append(files[0])

        # Assemble new run name from input sample names
        run_id = "%s-%s" % (samples[0], samples[1])
        # Start defining the run here:
        with self.declare_run(run_id) as run:
            # Add output file here:
            outfile = str()
            if self.get_option('outFileFormat') == "bigwig":
                outfile = run.add_output_file('ucsc-tracks',
                                              '%s.bw' % run_id,
                                              input_paths)
            elif self.get_option('outFileFormat') == "bedgraph":
                outfile = run.add_output_file('ucsc-tracks',
                                              '%s.bg' % run_id,
                                              input_paths)
            # Let's compile the command
            with run.new_exec_group() as bam_compare_eg:
                # 1. bamCompare command
                bam_compare = [self.get_tool('bamCompare'),
                               '--bamfile1', input_paths[0],
                               '--bamfile2', input_paths[1],
                               '--outFileName', outfile]
                # Append number of processors
                bam_compare.extend(['--numberOfProcessors',
                                    str(self.get_cores())])
                # Append list of options
                bam_compare.extend(option_list)
                bam_compare_eg.add_command(bam_compare)
def runs(self, cc):
    read_types = {'first_read': 'R1'}

    paired_end = cc.connection_exists('in/second_read')
    if not cc.all_runs_have_connection('in/first_read'):
        read_name = '' if paired_end else ' first'
        run_ids = list(cc.get_runs_without_any('in/first_read'))
        if len(run_ids) > 5:
            run_ids = run_ids[0:5] + ['...']
        raise StepError(self, '[cutadapt] No%s read passed by runs '
                        '%s.' % (read_name, list(run_ids)))
    if paired_end:
        if not cc.all_runs_have_connection('in/second_read'):
            read_name = ' second'
            run_ids = list(cc.get_runs_without_any('in/second_read'))
            if len(run_ids) > 5:
                run_ids = run_ids[0:5] + ['...']
            raise StepError(self, '[cutadapt] No%s read passed by runs '
                            '%s.' % (read_name, list(run_ids)))
        read_types['second_read'] = 'R2'

    options = ["error-rate", "no-indels", "times", "overlap",
               "match-read-wildcards", "discard-trimmed",
               "discard-untrimmed", "minimum-length", "maximum-length",
               "no-trim", "mask-adapter", "cut", "quality-cutoff",
               "quality-base", "prefix", "suffix", "strip-suffix",
               "colorspace", "double-encode", "trim-primer", "strip-f3",
               "maq", "bwa", "length-tag", "no-zero-cap", "zero-cap"]
    set_options = [option for option in options
                   if self.is_option_set_in_config(option)]
    option_list = list()
    for option in set_options:
        if isinstance(self.get_option(option), bool):
            if self.get_option(option):
                option_list.append('--%s' % option)
        else:
            option_list.append('--%s' % option)
            option_list.append(str(self.get_option(option)))

    for run_id in cc.keys():
        run = self.declare_run(run_id)
        for read in read_types:
            connection = 'in/%s' % read
            input_paths = cc[run_id][connection]

            # Make sure that adapter-R1/adapter-R2 or adapter-file are
            # set correctly. This kind of mutually exclusive option
            # checking is a bit tedious, so we do it here.
            if read == 'second_read':
                if (not self.is_option_set_in_config('adapter-R2') and
                        not self.is_option_set_in_config(
                            'adapter-file')):
                    raise StepError(
                        self, "Option 'adapter-R2' or 'adapter-file' "
                        "required because sample %s is paired end!" %
                        run_id)
            if (self.is_option_set_in_config('adapter-file') and
                    self.is_option_set_in_config('adapter-R1')):
                raise StepError(
                    self, "Options 'adapter-R1' and 'adapter-file' "
                    "are both set but are mutually exclusive!")
            if (not self.is_option_set_in_config('adapter-file') and
                    not self.is_option_set_in_config('adapter-R1')):
                raise StepError(
                    self, "Option 'adapter-R1' or 'adapter-file' "
                    "required to call cutadapt for sample %s!" % run_id)

            temp_fifos = list()
            exec_group = run.new_exec_group()
            for input_path in input_paths:
                # 1. Create temporary fifo for every input file
                temp_fifo = run.add_temporary_file(
                    "fifo-%s" % os.path.basename(input_path))
                temp_fifos.append(temp_fifo)
                mkfifo = [self.get_tool('mkfifo'), temp_fifo]
                exec_group.add_command(mkfifo)
                # 2. Output files to fifo
                if input_path.endswith('fastq.gz'):
                    with exec_group.add_pipeline() as pigz_pipe:
                        # 2.1 command: Read file in 'dd-blocksize'
                        #     chunks
                        dd_in = [
                            self.get_tool('dd'),
                            'ibs=%s' % self.get_option('dd-blocksize'),
                            'if=%s' % input_path]
                        # 2.2 command: Uncompress file to fifo
                        pigz = [
                            self.get_tool('pigz'),
                            '--decompress',
                            '--processes', str(self.get_cores()),
                            '--blocksize',
                            self.get_option('pigz-blocksize'),
                            '--stdout']
                        # 2.3 command: Write file in 'dd-blocksize'
                        #     chunks to fifo
                        dd_out = [
                            self.get_tool('dd'),
                            'obs=%s' % self.get_option('dd-blocksize'),
                            'of=%s' % temp_fifo]
                        pigz_pipe.add_command(dd_in)
                        pigz_pipe.add_command(pigz)
                        pigz_pipe.add_command(dd_out)
                elif input_path.endswith('fastq'):
                    # 2.1 command: Read file in 'dd-blocksize' chunks
                    #     and write to fifo
                    dd_in = [
                        self.get_tool('dd'),
                        'bs=%s' % self.get_option('dd-blocksize'),
                        'if=%s' % input_path,
                        'of=%s' % temp_fifo]
                    exec_group.add_command(dd_in)
                else:
                    raise StepError(
                        self, "File %s does not end with any expected "
                        "suffix (fastq.gz or fastq). Please fix that "
                        "issue." % input_path)
            # 3. Read data from fifos
            with exec_group.add_pipeline() as cutadapt_pipe:
                # 3.1 command: Read from ALL fifos
                cat = [self.get_tool('cat')]
                cat.extend(temp_fifos)
                cutadapt_pipe.add_command(cat)
                # 3.2 command: Fix qnames if the user wants us to
                if self.get_option('fix_qnames'):
                    fix_qnames = [self.get_tool('fix_qnames')]
                    cutadapt_pipe.add_command(fix_qnames)

                # Let's get the correct adapter sequences or adapter
                # sequence fasta file
                adapter = None
                # Do we have adapter sequences as input?
                if self.is_option_set_in_config('adapter-%s' %
                                                read_types[read]):
                    # Get adapter sequence
                    adapter = self.get_option(
                        'adapter-%s' % read_types[read])
                    # add index to adapter sequence if necessary
                    if '((INDEX))' in adapter:
                        index = \
                            self.find_upstream_info_for_input_paths(
                                input_paths,
                                'index-%s' % read_types[read])
                        adapter = adapter.replace('((INDEX))', index)
                    # create reverse complement if necessary
                    if self.get_option('use_reverse_complement'):
                        complements = adapter.maketrans('acgtACGT',
                                                        'tgcaTGCA')
                        adapter = adapter.translate(complements)[::-1]
                    # make sure the adapter is looking good
                    if re.search(r'^[ACGT]+$', adapter) is None:
                        raise StepError(
                            self, "Unable to come up with a "
                            "legit-looking adapter: %s" % adapter)
                # Or do we have an adapter sequence fasta file?
                elif self.is_option_set_in_config('adapter-file'):
                    adapter_file = os.path.abspath(
                        self.get_option('adapter-file'))
                    if not os.path.exists(adapter_file):
                        raise StepError(
                            self, "File %s containing adapter "
                            "sequences does not exist." %
                            self.get_option('adapter-file'))
                    adapter = "file:" + adapter_file

                # 3.3 command: Clip adapters
                cutadapt = [self.get_tool('cutadapt'),
                            self.get_option('adapter-type'),
                            adapter, '-']
                cutadapt.extend(option_list)
                cutadapt_log_file = run.add_output_file(
                    'log_%s' % read,
                    '%s-cutadapt-%s-log.txt' %
                    (run_id, read_types[read]),
                    input_paths)
                # 3.4 command: Compress output
                pigz = [self.get_tool('pigz'),
                        '--processes', str(self.get_cores()),
                        '--blocksize',
                        self.get_option('pigz-blocksize'),
                        '--stdout']
                # 3.5 command: Write to output file in 'dd-blocksize'
                #     chunks
                clipped_fastq_file = run.add_output_file(
                    "%s" % read,
                    "%s_%s.fastq.gz" % (run_id, read_types[read]),
                    input_paths)
                dd = [self.get_tool('dd'),
                      'obs=%s' % self.get_option('dd-blocksize'),
                      'of=%s' % clipped_fastq_file]

                cutadapt_pipe.add_command(
                    cutadapt, stderr_path=cutadapt_log_file)
                cutadapt_pipe.add_command(pigz)
                cutadapt_pipe.add_command(dd)
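# Illustration (made-up adapter) of the reverse-complement step above:
#   adapter = 'ACGTTG'
#   complements = adapter.maketrans('acgtACGT', 'tgcaTGCA')
#   adapter.translate(complements)[::-1]   # -> 'CAACGT'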
def runs(self, run_ids_connections_files):
    # Compile the list of options
    options = ['file-format', 'normalization', 'peaktype', 'shiftsize',
               'threshold', 'windowsize']
    set_options = [option for option in options
                   if self.is_option_set_in_config(option)]
    option_list = list()
    for option in set_options:
        if isinstance(self.get_option(option), bool):
            if self.get_option(option):
                option_list.append('--%s' % option)
        else:
            option_list.append('--%s' % option)
            option_list.append(str(self.get_option(option)))

    # Get the essential dictionary with information about the
    # relationship between input and ChIP samples
    chip_vs_input = self.get_option('chip_vs_input')
    # the top-level keys of the dict are the new run IDs
    for run_id in chip_vs_input.keys():
        in_files = dict()
        config_to_option = dict()
        # Are we going to perform differential peak calling or not?
        if not self.get_option('diff'):
            # If not, we only use chip1 and input1
            config_to_option = {'rep1': 'chip1', 'inputs1': 'input1'}
        else:
            # Else we require chip1+input1 and chip2+input2
            config_to_option = {'rep1': 'chip1', 'inputs1': 'input1',
                                'rep2': 'chip2', 'inputs2': 'input2'}

        # Check the input from the chip_vs_input dict
        for key, opt in config_to_option.items():
            experiment = chip_vs_input[run_id]
            in_files[opt] = list()
            try:
                # in_run_id: run ID whose in/alignments files are used
                # for PePr's --[chip[12]|input[12]]
                for in_run_id in experiment[key]:
                    in_files[opt].extend(
                        run_ids_connections_files[in_run_id]
                        ['in/alignments'])
                    if run_ids_connections_files[in_run_id][
                            'in/alignments'] == [None]:
                        raise StepError(
                            self, "Upstream run %s provides no "
                            "alignments for run %s" %
                            (in_run_id, run_id))
            except KeyError:
                raise StepError(
                    self, "Required key %s missing in 'chip_vs_input' "
                    "for run %s" % (key, run_id))

        # Create a new run named run_id
        with self.declare_run(run_id) as run:
            # Assemble list of all input files
            input_paths = [f for k in in_files for f in in_files[k]]
            # result_files dict:
            #   keys   = temporary file names
            #   values = final file names
            result_files = dict()
            # Is differential peak calling happening?
            if self.get_option('diff'):
                # If yes, we do not get any normal peaks ...
                run.add_empty_output_connection("peaks")
                # ... but we do get two peak lists with differential
                # peaks
                chip1_file = '%s__PePr_chip1_peaks.bed' % run_id
                result_files[chip1_file] = run.add_output_file(
                    'differential_peaks', chip1_file, input_paths)
                chip2_file = '%s__PePr_chip2_peaks.bed' % run_id
                result_files[chip2_file] = run.add_output_file(
                    'differential_peaks', chip2_file, input_paths)
            else:
                # If no, we do not get any differential peaks ...
                run.add_empty_output_connection("differential_peaks")
                # ... but we do get a peak file
                peaks_file = '%s__PePr_peaks.bed' % run_id
                result_files[peaks_file] = run.add_output_file(
                    'peaks', peaks_file, input_paths)

            # parameter file used to run PePr with
            parameter_file = '%s__PePr_parameters.txt' % run_id
            result_files[parameter_file] = run.add_output_file(
                'parameter', parameter_file, input_paths)

            with run.new_exec_group() as pepr_exec_group:
                # 1. Create temporary directory for PePr output
                temp_dir = 'pepr-out'
                mkdir = [self.get_tool('mkdir'), temp_dir]
                pepr_exec_group.add_command(mkdir)
                # 2. Compile the PePr command
                pepr = [self.get_tool('pepr'),
                        '--output-directory', temp_dir,
                        '--file-format',
                        self.get_option('file-format'),
                        '--name', run_id]
                # Add '--[chip[12]|input[12]]' and a comma separated
                # list of alignment files
                for opt in in_files.keys():
                    pepr.append('--%s' % opt)
                    pepr.append(','.join(in_files[opt]))
                if self.get_option('diff'):
                    pepr.append('--diff')
                # Add additional options
                pepr.extend(option_list)
                pepr_exec_group.add_command(pepr)
            with run.new_exec_group() as mv_exec_group:
                for orig, dest_path in result_files.items():
                    # 3. Move files from the temp directory to their
                    #    expected positions
                    orig_path = os.path.join(temp_dir, orig)
                    mv = [self.get_tool('mv'), orig_path, dest_path]
                    mv_exec_group.add_command(mv)
            with run.new_exec_group() as tar_exec_group:
                # 4. Compress the temp directory (which should only
                #    contain the log file by now) and delete all files
                #    in there
                log_file = run.add_output_file(
                    'log',
                    '%s__PePr_debug_log.tar.gz' % run_id,
                    input_paths)
                tar = [self.get_tool('tar'),
                       '--create', '--gzip', '--verbose',
                       '--remove-files',
                       '--file=%s' % log_file,
                       temp_dir]
                tar_exec_group.add_command(tar)
def runs(self, run_ids_connections_files):
    options = [
        # Standard Picard options:
        'TMP_DIR', 'VERBOSITY', 'QUIET', 'VALIDATION_STRINGENCY',
        'COMPRESSION_LEVEL', 'MAX_RECORDS_IN_RAM', 'CREATE_INDEX',
        'CREATE_MD5_FILE', 'REFERENCE_SEQUENCE',
        'GA4GH_CLIENT_SECRETS',
        # Picard AddOrReplaceReadGroups options:
        'SORT_ORDER', 'RGID', 'RGLB', 'RGPL', 'RGPU', 'RGCN', 'RGDS',
        'RGDT', 'RGPI', 'RGPG', 'RGPM']
    file_options = ['TMP_DIR', 'REFERENCE_SEQUENCE']
    set_options = [option for option in options
                   if self.is_option_set_in_config(option)]
    option_list = list()
    for option in set_options:
        if isinstance(self.get_option(option), bool):
            if self.get_option(option):
                option_list.append('%s=true' % option)
            else:
                option_list.append('%s=false' % option)
        else:
            value = str(self.get_option(option))
            if option in file_options:
                value = os.path.abspath(value)
            option_list.append('%s=%s' % (option, value))

    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_paths = run_ids_connections_files[run_id][
                'in/alignments']
            if input_paths == [None]:
                run.add_empty_output_connection("alignments")
            elif len(input_paths) != 1:
                raise StepError(self,
                                "Expected exactly one alignments file.")
            elif os.path.splitext(input_paths[0])[1] not in \
                    ['.sam', '.bam']:
                raise StepError(
                    self, "The file %s does not seem to be a SAM or "
                    "BAM file. At least the suffix is wrong." %
                    input_paths[0])
            else:
                with run.new_exec_group() as exec_group:
                    alignments = run.add_output_file(
                        'alignments',
                        os.path.basename(input_paths[0]),
                        input_paths)
                    add_replace_read_groups = [
                        self.get_tool('picard-tools'),
                        'AddOrReplaceReadGroups',
                        'INPUT=%s' % input_paths[0],
                        'OUTPUT=%s' % alignments,
                        'RGSM=%s' % run_id]
                    add_replace_read_groups.extend(option_list)
                    exec_group.add_command(add_replace_read_groups)
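# Illustration (made-up option values): in contrast to the GNU-style
# '--flag value' steps, the Picard steps render options as KEY=value
# tokens, e.g.
#   VALIDATION_STRINGENCY: SILENT  ->  'VALIDATION_STRINGENCY=SILENT'
#   CREATE_INDEX: True             ->  'CREATE_INDEX=true'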
def runs(self, run_ids_connections_files):
    # Compile the list of options
    # ('histogram' and 'logScale' are handled explicitly below, so they
    # are not part of the generic option list)
    options = ['maxFragmentLength', 'binSize', 'distanceBetweenBins',
               'blackListFileName']
    set_options = [option for option in options
                   if self.is_option_set_in_config(option)]
    option_list = list()
    for option in set_options:
        if isinstance(self.get_option(option), bool):
            if self.get_option(option):
                option_list.append('--%s' % option)
        else:
            option_list.append('--%s' % option)
            option_list.append(str(self.get_option(option)))

    def declare_bamPEFragmentSize(run_id, input_paths, labels):
        with self.declare_run(run_id) as run:
            # Let's compile the command
            with run.new_exec_group() as bamPEFragmentSize_eg:
                # 1. bamPEFragmentSize command
                bamPEFragmentSize = [
                    self.get_tool('bamPEFragmentSize'),
                    '--numberOfProcessors', str(self.get_cores()),
                    '--bamfiles']
                bamPEFragmentSize.extend(input_paths)
                # Set options for plot creation
                if self.is_option_set_in_config('histogram'):
                    bamPEFragmentSize.append('--histogram')
                    bamPEFragmentSize.append(
                        run.add_output_file('fragment_size_plots',
                                            '%s.png' % run_id,
                                            input_paths))
                    bamPEFragmentSize.append('--plotTitle')
                    bamPEFragmentSize.append(run_id)
                    bamPEFragmentSize.append('--samplesLabel')
                    bamPEFragmentSize.extend(labels)
                if self.is_option_set_in_config('logScale'):
                    bamPEFragmentSize.append('--logScale')
                # Append list of options
                bamPEFragmentSize.extend(option_list)
                bamPEFragmentSize_eg.add_command(
                    bamPEFragmentSize,
                    stdout_path=run.add_output_file(
                        'fragment_size_stats',
                        '%s-PEFragmentSize.stats' % run_id,
                        input_paths))

    if self.is_option_set_in_config('samples'):
        runIds_samples = self.get_option('samples')
        for run_id, samples in runIds_samples.items():
            if not isinstance(run_id, str):
                raise StepError(
                    self, "Not a string run ID (%s) for samples (%s)" %
                    (run_id, ", ".join(samples)))
            if not isinstance(samples, list):
                raise StepError(
                    self, "Not a list of samples. Type: %s, Value: %s" %
                    (type(samples), samples))
            # collect files and labels per run ID
            input_paths = list()
            labels = list()
            for sample in samples:
                try:
                    bam_files = run_ids_connections_files[sample][
                        'in/alignments']
                except KeyError:
                    raise StepError(self,
                                    "No input sample named %s" % sample)
                for i, bam_file in enumerate(bam_files):
                    if not bam_file.endswith(".bam"):
                        raise StepError(
                            self, "Not a BAM file: %s" % bam_file)
                    input_paths.append(bam_file)
                    if i > 0:
                        labels.append("%s-%s" % (sample, i))
                    else:
                        labels.append(sample)
            # Start declaring the command
            declare_bamPEFragmentSize(run_id, input_paths, labels)
    else:
        for run_id in run_ids_connections_files.keys():
            try:
                input_paths = run_ids_connections_files[run_id][
                    'in/alignments']
            except KeyError:
                raise StepError(
                    self, 'No files found for run ID %s and connection '
                    '"in/alignments". Please check your configuration.'
                    % run_id)
            for i, f in enumerate(input_paths):
                label = run_id
                if len(input_paths) > 1:
                    label = "%s-%s" % (run_id, i)
                declare_bamPEFragmentSize(run_id, [f], [label])
def runs(self, run_ids_connections_files):
    options = [
        # Standard Picard options:
        'TMP_DIR', 'VERBOSITY', 'QUIET', 'VALIDATION_STRINGENCY',
        'COMPRESSION_LEVEL', 'MAX_RECORDS_IN_RAM', 'CREATE_INDEX',
        'CREATE_MD5_FILE', 'REFERENCE_SEQUENCE',
        'GA4GH_CLIENT_SECRETS',
        # Picard MergeSamFiles options:
        'SORT_ORDER', 'ASSUME_SORTED', 'MERGE_SEQUENCE_DICTIONARIES',
        'USE_THREADING', 'COMMENT', 'INTERVALS']
    file_options = ['TMP_DIR', 'REFERENCE_SEQUENCE']
    set_options = [option for option in options
                   if self.is_option_set_in_config(option)]
    option_list = list()
    for option in set_options:
        if isinstance(self.get_option(option), bool):
            if self.get_option(option):
                option_list.append('%s=true' % option)
            else:
                option_list.append('%s=false' % option)
        else:
            value = str(self.get_option(option))
            if option in file_options:
                value = os.path.abspath(value)
            option_list.append('%s=%s' % (option, value))

    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_paths = run_ids_connections_files[run_id][
                'in/alignments']
            if input_paths == [None]:
                run.add_empty_output_connection("alignments")
            elif os.path.splitext(input_paths[0])[1] not in \
                    ['.sam', '.bam']:
                raise StepError(
                    self, "The file %s does not seem to be a SAM or "
                    "BAM file. At least the suffix is wrong." %
                    input_paths[0])
            elif self.is_option_set_in_config("INTERVALS") and \
                    not os.path.exists(self.get_option("INTERVALS")):
                raise StepError(
                    self, "The path %s given to option 'INTERVALS' is "
                    "not pointing to a file." %
                    self.get_option("INTERVALS"))
            elif len(input_paths) == 0:
                run.add_empty_output_connection("alignments")
            elif len(input_paths) == 1:
                base = os.path.basename(input_paths[0])
                with run.new_exec_group() as ln_alignment:
                    # 1. command: Create symbolic link to the original
                    #    BAM file (use absolute path)
                    ln = [self.get_tool('ln'), '-s',
                          input_paths[0],
                          run.add_output_file('alignments',
                                              base,
                                              input_paths)]
                    ln_alignment.add_command(ln)
            else:
                with run.new_exec_group() as exec_group:
                    alignments = run.add_output_file(
                        'alignments',
                        '%s-merged.bam' % run_id,
                        input_paths)
                    merge_sam_files = [self.get_tool('picard-tools'),
                                       'MergeSamFiles']
                    for f in input_paths:
                        merge_sam_files.append('INPUT=%s' % f)
                    merge_sam_files.append('OUTPUT=%s' % alignments)
                    merge_sam_files.extend(option_list)
                    exec_group.add_command(merge_sam_files)
def runs(self, run_ids_connections_files):
    options = ['b', 'color', 'd', 'e', 'h', 'holdcolumnorder', 'init',
               'l', 'm', 'nobed', 'nobrowser', 'noenrich',
               # 'printposterior', 'printstatebyline',
               'r', 's', 'stateordering', 't', 'x', 'z']
    file_options = ['assembly', 'l', 'm']
    set_options = [option for option in options
                   if self.is_option_set_in_config(option)]
    option_list = list()
    for option in set_options:
        if isinstance(self.get_option(option), bool):
            # Only set the option if it is True
            if self.get_option(option):
                option_list.append('-%s' % option)
        else:
            value = str(self.get_option(option))
            if option in file_options:
                value = os.path.abspath(value)
            option_list.append('-%s' % option)
            option_list.append(value)

    for run_id in run_ids_connections_files.keys():
        # The input_paths should be a single tar.gz file
        input_paths = run_ids_connections_files[run_id][
            'in/chromhmm_binarization']
        # Test the input_paths (at least a bit)
        if len(input_paths) != 1 or \
           not input_paths[0].endswith('.tar.gz'):
            raise StepError(
                self, "Expected a single tar.gz file via "
                "'in/chromhmm_binarization' for run %s, but got "
                "this: %s" % (run_id, ", ".join(input_paths)))

        with self.declare_run(run_id) as run:
            with run.new_exec_group() as pre_chromhmm:
                # 1. Extract the binary files into a directory
                # 1.1 Get name of temporary input directory
                input_dir = run.add_temporary_directory(
                    '%s_binary_files' % run_id)
                # 1.2 Create temporary input directory
                mkdir = [self.get_tool('mkdir'), input_dir]
                pre_chromhmm.add_command(mkdir)
                # 1.3 Extract the binary files into the temporary
                #     input directory
                tar = [self.get_tool('tar'),
                       '--extract',
                       '--gzip',
                       '--verbose',
                       '--directory', input_dir,
                       '--file', input_paths[0]]
                pre_chromhmm.add_command(tar)
                # 1.4 Get name of temporary output directory
                output_dir = run.add_temporary_directory(
                    '%s_chromhmm_model' % run_id)
                # 1.5 Create temporary output directory
                mkdir = [self.get_tool('mkdir'), output_dir]
                pre_chromhmm.add_command(mkdir)
            with run.new_exec_group() as learnmodel:
                # 2. Assemble the ChromHMM LearnModel command
                chromhmm = [self.get_tool('ChromHMM'), 'LearnModel']
                chromhmm.extend(option_list)
                chromhmm.append(input_dir)
                chromhmm.append(output_dir)
                chromhmm.append(str(self.get_option('numstates')))
                chromhmm.append(
                    os.path.abspath(self.get_option('assembly')))
                learnmodel.add_command(chromhmm)
            with run.new_exec_group() as pack_model:
                # 3. Pack the output files of ChromHMM LearnModel
                with pack_model.add_pipeline() as pack_model_pipe:
                    # 3.1 List the content of the output directory
                    ls = [self.get_tool('ls'), '-1', output_dir]
                    # 3.2 Pipe the ls output
                    pack_model_pipe.add_command(ls)
                    # 3.3 Use xargs to call tar (circumventing glob
                    #     pattern expansion)
                    xargs = [self.get_tool('xargs'),
                             '--delimiter', '\n',
                             self.get_tool('tar'),
                             '--create',
                             '--directory', output_dir,
                             '--gzip',
                             '--remove-files',
                             '--verbose',
                             '--file',
                             run.add_output_file(
                                 'chromhmm_model',
                                 '%s_model_files.tar.gz' % run_id,
                                 input_paths)]
                    pack_model_pipe.add_command(xargs)
            with run.new_exec_group() as rm_binary_files:
                # 4. Remove the unpacked binary files
                with rm_binary_files.add_pipeline() as rm_binary_pipe:
                    # 4.1 List the content of the input directory
                    ls = [self.get_tool('ls'), '-1', input_dir]
                    # 4.2 Pipe the ls output
                    rm_binary_pipe.add_command(ls)
                    # 4.3 Use xargs to call rm (circumventing glob
                    #     pattern expansion)
                    xargs = [self.get_tool('xargs'),
                             '--delimiter', '\n',
                             '-I', '*',
                             self.get_tool('rm'),
                             '--verbose',
                             os.path.join(input_dir, '*')]
                    rm_binary_pipe.add_command(xargs)
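# Rough shell equivalents of the two ls | xargs pipelines above
# (illustration only; <...> are placeholders):
#   ls -1 <output_dir> | xargs --delimiter '\n' tar --create --gzip \
#       --directory <output_dir> --remove-files --file <model.tar.gz>
#   ls -1 <input_dir> | xargs --delimiter '\n' -I '*' rm <input_dir>/'*'
# Piping the file list through xargs avoids relying on shell glob
# expansion inside the pipeline.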
def runs(self, run_ids_connections_files):
    # Check if the index is valid
    if not os.path.exists(self.get_option('index') + '.bwt'):
        raise StepError(self, "Could not find index: %s.*" %
                        self.get_option('index'))

    # Compile the list of options
    options = [
        # [Algorithm options:]
        't', 'k', 'w', 'd', 'r', 'y', 'c', 'D', 'W', 'm', 'S', 'P',
        'e',
        # [Scoring options:]
        'A', 'B', 'O', 'E', 'L', 'U', 'x',
        # [Input/output options:]
        'p', 'R', 'H', 'j', 'v', 'T', 'h', 'a', 'C', 'V', 'Y', 'M']
    set_options = [option for option in options
                   if self.is_option_set_in_config(option)]
    option_list = list()
    for option in set_options:
        if isinstance(self.get_option(option), bool):
            if self.get_option(option):
                option_list.append('-%s' % option)
        else:
            option_list.append('-%s' % option)
            option_list.append(str(self.get_option(option)))

    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            # Get list of files for first/second read
            fr_input = run_ids_connections_files[run_id][
                'in/first_read']
            sr_input = run_ids_connections_files[run_id][
                'in/second_read']
            # Do we have paired end data?
            is_paired_end = sr_input != [None]
            # Fail if we don't have exactly one file or an empty
            # connection for the first read
            if len(fr_input) != 1 or fr_input == [None]:
                raise StepError(
                    self, "Expected single input file for first read.")
            # Fail if we don't have exactly one file
            if is_paired_end and len(sr_input) != 1:
                raise StepError(
                    self, "Expected single input file for second read.")
            input_paths = fr_input[:]  # copy of single element list
            if is_paired_end:
                input_paths.extend(sr_input)

            # Check file endings for proper type
            for input_path in input_paths:
                if len([_ for _ in ['fastq', 'fq', 'fq.gz', 'fastq.gz']
                        if input_path.endswith(_)]) != 1:
                    raise StepError(
                        self, "%s has an unknown suffix. "
                        "(None of: fastq, fq, fq.gz, fastq.gz)" %
                        input_path)

            # BWA can only handle single files for first and second
            # read.
            # IMPORTANT: BWA handles gzipped as well as uncompressed
            # files.
            with run.new_exec_group() as exec_group:
                def prepare_input(input_path, exec_group):
                    # Create temporary fifo
                    temp_fifo = run.add_temporary_file(
                        'in-fifo-%s' % os.path.basename(input_path))
                    mkfifo = [self.get_tool('mkfifo'), temp_fifo]
                    exec_group.add_command(mkfifo)
                    dd = [self.get_tool('dd'),
                          'bs=%s' % self.get_option('dd-blocksize'),
                          'if=%s' % input_path,
                          'of=%s' % temp_fifo]
                    exec_group.add_command(dd)
                    return (exec_group, temp_fifo)

                # Temporary fifos
                temp_fr_fifo, temp_sr_fifo = (None, None)
                exec_group, temp_fr_fifo = prepare_input(
                    fr_input[0], exec_group)
                # And if we handle paired end data
                if is_paired_end:
                    exec_group, temp_sr_fifo = prepare_input(
                        sr_input[0], exec_group)
                # 3. Map reads using bwa mem
                with exec_group.add_pipeline() as bwa_mem_pipe:
                    # Assemble bwa mem command
                    bwa_mem = [self.get_tool('bwa'), 'mem']
                    bwa_mem.extend(option_list)
                    bwa_mem.append(self.get_option('index'))
                    bwa_mem.append(temp_fr_fifo)
                    if is_paired_end:
                        bwa_mem.append(temp_sr_fifo)
                    bwa_mem_pipe.add_command(bwa_mem)
                    # Compress bwa mem output
                    pigz = [self.get_tool('pigz'), '--stdout']
                    bwa_mem_pipe.add_command(pigz)
                    # Write bwa mem output to file
                    dd = [self.get_tool('dd'),
                          'obs=%s' % self.get_option('dd-blocksize'),
                          'of=%s' % run.add_output_file(
                              'alignments',
                              '%s-bwa-mem.sam.gz' % run_id,
                              input_paths)]
                    bwa_mem_pipe.add_command(dd)
def runs(self, run_ids_connections_files):
    self.set_cores(self.get_option('cores'))

    for run_id in run_ids_connections_files.keys():
        with self.declare_run(run_id) as run:
            input_fileset = []
            r1 = run_ids_connections_files[run_id]['in/first_read'][0]
            input_fileset.append(r1)

            r2 = None
            if 'in/second_read' in run_ids_connections_files[run_id]:
                r2 = run_ids_connections_files[run_id]['in/second_read'][0]
                input_fileset.append(r2)

            star = [self.get_tool('star')]

            # Get genomeDir from the config or from the input files
            if self.is_option_set_in_config('genomeDir'):
                genome_dir = os.path.abspath(
                    str(self.get_option('genomeDir')))
            else:
                if 'in/genome_dir' not in \
                        run_ids_connections_files[run_id]:
                    raise StepError(
                        self,
                        'Required option "genomeDir" was not found!')
                genome_dir = run_ids_connections_files[run_id][
                    'in/genome_dir'][0]
            star.extend(['--genomeDir', genome_dir])

            star.extend(['--outFileNamePrefix', './'])

            if self.is_option_set_in_config('readFilesCommand'):
                star.extend([
                    '--readFilesCommand',
                    self.get_option('readFilesCommand')
                ])

            # Pass the configured core count on as STAR's thread count
            if self.is_option_set_in_config('cores'):
                star.extend(
                    ['--runThreadN', str(self.get_option('cores'))])

            star.append('--readFilesIn')
            star.extend(input_fileset)

            stderr_file = "%s-star-log_stderr.txt" % run_id
            log_stderr = run.add_output_file(
                "log_stderr", stderr_file, input_fileset)
            stdout_file = "%s-star-log_stdout.txt" % run_id
            log_stdout = run.add_output_file(
                "log_stdout", stdout_file, input_fileset)

            run.add_output_file("aligned", "Aligned.out.sam",
                                input_fileset)
            run.add_output_file("log.final", "Log.final.out",
                                input_fileset)
            run.add_output_file("log.out", "Log.out", input_fileset)
            run.add_output_file("log.progess", "Log.progress.out",
                                input_fileset)
            run.add_output_file("sj.out", "SJ.out.tab", input_fileset)

            star_eg = run.new_exec_group()
            star_eg.add_command(star, stdout_path=log_stdout,
                                stderr_path=log_stderr)
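# For orientation, the argument list assembled above boils down to a STAR
# call of roughly this shape for a paired-end run with gzipped input (all
# values are made-up examples; '--readFilesCommand zcat' is only added
# when set in the config):
star_cmd = [
    'star',
    '--genomeDir', '/data/star_index',
    '--outFileNamePrefix', './',
    '--readFilesCommand', 'zcat',
    '--runThreadN', '8',  # taken from the 'cores' option
    '--readFilesIn', 'sample_R1.fastq.gz', 'sample_R2.fastq.gz',
]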
def runs(self, cc):
    read_types = {'first_read': '_R1', 'second_read': '_R2'}
    for run_id in cc.keys():
        cc.switch_run_id(run_id)
        with self.declare_run(run_id) as run:
            for read in read_types:
                connection = 'in/%s' % read
                if not cc.exists_connection_for_run(connection):
                    continue
                input_paths = cc[run_id][connection]

                if input_paths == [None]:
                    run.add_empty_output_connection("%s" % read)
                else:
                    temp_fifos = list()
                    exec_group = run.new_exec_group()
                    for input_path in input_paths:
                        # All input data is streamed through temporary
                        # fifos; gzipped files are uncompressed on the fly.
                        # 1. Create a temporary fifo
                        temp_fifo = run.add_temporary_file(
                            "fifo-%s" % os.path.basename(input_path))
                        temp_fifos.append(temp_fifo)
                        mkfifo = [self.get_tool('mkfifo'), temp_fifo]
                        exec_group.add_command(mkfifo)

                        is_gzipped = os.path.splitext(input_path)[1] \
                            in ['.gz', '.gzip']

                        # 2. Write the input file to the fifo
                        if is_gzipped:
                            with exec_group.add_pipeline() as unzip_pipe:
                                # 2.1 command: Read the file in
                                # 'dd-blocksize' chunks
                                dd_in = [
                                    self.get_tool('dd'),
                                    'ibs=%s' %
                                    self.get_option('dd-blocksize'),
                                    'if=%s' % input_path
                                ]
                                # 2.2 command: Uncompress the file to
                                # the fifo
                                pigz = [
                                    self.get_tool('pigz'),
                                    '--processes', str(self.get_cores()),
                                    '--decompress',
                                    '--blocksize',
                                    str(self.get_option('pigz-blocksize')),
                                    '--stdout'
                                ]
                                # 2.3 command: Write the file in
                                # 'dd-blocksize' chunks to the fifo
                                dd_out = [
                                    self.get_tool('dd'),
                                    'obs=%s' %
                                    self.get_option('dd-blocksize'),
                                    'of=%s' % temp_fifo
                                ]
                                unzip_pipe.add_command(dd_in)
                                unzip_pipe.add_command(pigz)
                                unzip_pipe.add_command(dd_out)
                        elif os.path.splitext(input_path)[1] \
                                in ['.fastq', '.fq']:
                            # 2.1 command: Read the file in 'dd-blocksize'
                            # chunks and write it to the fifo in
                            # 'dd-blocksize' chunks
                            dd_in = [
                                self.get_tool('dd'),
                                'bs=%s' % self.get_option('dd-blocksize'),
                                'if=%s' % input_path,
                                'of=%s' % temp_fifo
                            ]
                            exec_group.add_command(dd_in)
                        else:
                            raise StepError(
                                self,
                                "File %s does not end with any expected "
                                "suffix (fastq, fq, fastq.gz, or fq.gz). "
                                "Please fix that issue." % input_path)

                    # 3. Read the data from the fifos
                    with exec_group.add_pipeline() as pigz_pipe:
                        # 3.1 command: Read from ALL fifos
                        cat = [self.get_tool('cat')]
                        cat.extend(temp_fifos)
                        pigz_pipe.add_command(cat)

                        # 3.2 command: Gzip the output
                        pigz = [
                            self.get_tool('pigz'),
                            '--processes', str(self.get_cores()),
                            '--blocksize',
                            str(self.get_option('pigz-blocksize')),
                            '--stdout'
                        ]
                        pigz_pipe.add_command(pigz)

                        # 3.3 command: Write to the output file in
                        # 'dd-blocksize' chunks
                        stdout_path = run.add_output_file(
                            "%s" % read,
                            "%s%s.fastq.gz" % (run_id, read_types[read]),
                            input_paths)
                        dd = [
                            self.get_tool('dd'),
                            'obs=%s' % self.get_option('dd-blocksize'),
                            'of=%s' % stdout_path
                        ]
                        pigz_pipe.add_command(dd)
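# A minimal sketch of the final merge pipeline above (cat ... | pigz | dd)
# using plain subprocess, assuming the fifos are already being filled by
# the unzip pipelines; all names and block sizes are made-up examples.
import subprocess

temp_fifos = ['fifo-a.fastq.gz', 'fifo-b.fastq.gz']  # hypothetical fifos
cat = subprocess.Popen(['cat'] + temp_fifos, stdout=subprocess.PIPE)
pigz = subprocess.Popen(['pigz', '--processes', '4', '--stdout'],
                        stdin=cat.stdout, stdout=subprocess.PIPE)
cat.stdout.close()
with open('run1_R1.fastq.gz', 'wb') as out:
    dd = subprocess.Popen(['dd', 'obs=2M'], stdin=pigz.stdout, stdout=out)
    pigz.stdout.close()
    dd.wait()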
def runs(self, run_ids_connections_files):
    isset_n = self.is_option_set_in_config('n')
    isset_p = self.is_option_set_in_config('p')
    if isset_n and isset_p:
        raise StepError(
            self, "Options 'n' AND 'p' are set in config.yaml. "
            "Only one of them is allowed.")

    config_options = self.get_options()
    read_types = {'first_read': '_R1', 'second_read': '_R2'}
    for run_id in run_ids_connections_files.keys():
        new_run_id = run_id
        # Create a new run id if option 'o' is set
        if self.is_option_set_in_config('o'):
            new_run_id = config_options['o'] + '_' + run_id

        with self.declare_run(new_run_id) as run:
            for read in read_types:
                connection = 'in/%s' % read
                input_paths = run_ids_connections_files[run_id].get(
                    connection)
                if input_paths:
                    for input_path in input_paths:
                        # Get the base name of the input file
                        root, ext = os.path.splitext(
                            os.path.basename(input_path))
                        temp_file = input_path

                        file_ext = os.path.splitext(input_path)[1]
                        is_gzipped = file_ext in ['.gz', '.gzip']
                        if is_gzipped:
                            parts = os.path.basename(input_path).split('.')
                            root = '.'.join(parts[:-2])
                            # Unzip the fastq file into a temporary file
                            temp_file = run.add_temporary_file()
                            pigz_decompress_eg = run.new_exec_group()
                            pigz = [
                                self.get_tool('pigz'),
                                '--processes', str(self.get_cores()),
                                '--decompress',
                                '--keep',
                                '--stdout',
                                input_path
                            ]
                            pigz_decompress_eg.add_command(
                                pigz, stdout_path=temp_file)

                        # 1. Run fastq-sample on the (uncompressed) input
                        fastqsample_eg = run.new_exec_group()
                        outfile = "sample"
                        fastqsample = [self.get_tool('fastq-sample')]
                        for option, value in config_options.items():
                            if option in self.possible_options:
                                if option == 'o' or value is None:
                                    continue
                                fastqsample.extend(
                                    ['-%s' % option, str(value)])
                        fastqsample.extend(['-o', outfile])
                        fastqsample.append(temp_file)
                        fastqsample_eg.add_command(fastqsample)

                        # 2. Compress the subsample into the output file
                        filename_params = (new_run_id, read_types[read])
                        subsample_file = run.add_output_file(
                            "%s" % read,
                            "%s%s.fastq.gz" % filename_params,
                            [input_path])
                        pigz_compress_eg = run.new_exec_group()
                        pigz_compress = [
                            self.get_tool('pigz'),
                            '--processes', str(self.get_cores()),
                            '--best',
                            '--stdout',
                            outfile + '.fastq'
                        ]
                        pigz_compress_eg.add_command(
                            pigz_compress, stdout_path=subsample_file)

                        # 3. Remove the uncompressed subsample
                        remove_eg = run.new_exec_group()
                        remove = [self.get_tool('rm'), outfile + '.fastq']
                        remove_eg.add_command(remove)
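# For reference, the fastq-sample call built above reduces to something
# like the following (option values are made-up examples; assuming the
# fastq-tools fastq-sample, '-o sample' writes the subsample to
# sample.fastq, which is then recompressed and removed above):
fastqsample_cmd = ['fastq-sample', '-n', '10000', '-s', '42',
                   '-o', 'sample', '/tmp/uncompressed_input.fastq']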