def _get_samples_to_process(fn, out_dir, config, force_single):
    """parse csv file with one line per file. It will merge
    all files that have the same description name"""
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            cols = l.strip().split(",")
            if len(cols) > 0:
                if len(cols) < 2:
                    raise ValueError("Line needs 2 values: file and name.")
                if utils.file_exists(cols[0]) or is_gsm(cols[0]):
                    if cols[0].find(" ") > -1:
                        new_name = os.path.abspath(cols[0].replace(" ", "_"))
                        logger.warning("Space found in %s. Linked to %s." % (cols[0], new_name))
                        logger.warning("Please avoid names with spaces in the future.")
                        utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                        cols[0] = new_name
                    samples[cols[1]].append(cols)
                else:
                    logger.info("Skipping %s, file doesn't exist." % cols[0])
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        files = [os.path.abspath(fn_file[0]) if not is_gsm(fn_file[0]) else fn_file[0]
                 for fn_file in items]
        samples[sample] = [{'files': _check_paired(files, force_single),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
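# Hedged illustration (not part of the original source): the CSV layout the
# parser above expects is one file per line, with the file path (or GSM
# accession) first, the sample/description name second, and any extra columns
# carried along as annotations. The paths and names below are made up.
#
#   /data/run1/sampleA_R1.fastq.gz,sampleA,tumor
#   /data/run1/sampleA_R2.fastq.gz,sampleA,tumor
#   /data/run1/sampleB.bam,sampleB,normal
#
# Rows sharing the same second column are merged into a single sample entry.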
def filter_multimappers(align_file, data):
    """
    It does not seem like bowtie2 has a corollary to the -m 1 flag in bowtie,
    there are some options that are close but don't do the same thing. Bowtie2
    sets the XS flag for reads mapping in more than one place, so we can just
    filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    if file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}"'
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    return out_file
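# Sketch (assumed values, not from the source) of the sambamba command the
# bowtie2-oriented version above assembles for a paired-end BAM on 4 cores;
# the input/output file names are hypothetical:
#
#   sambamba view -h --nthreads 4 -f bam \
#     -F "[XS] == null and not unmapped and paired and proper_pair" \
#     sample.bam > sample.unique.bam
#
# As the docstring notes, bowtie2 only sets XS when a read has more than one
# plausible alignment, so "[XS] == null" keeps the uniquely mapped reads.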
def _get_samples_to_process(fn, out_dir, config):
    """parse csv file with one line per file. It will merge
    all files that have the same description name"""
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            cols = l.strip().split(",")
            if len(cols) < 2:
                raise ValueError("Line needs 2 values: file and name.")
            if utils.file_exists(cols[0]):
                samples[cols[1]].append(cols)
            else:
                logger.info("skipping %s, File doesn't exist." % cols[0])
    for sample, items in samples.iteritems():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{'files': _check_paired(files),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
def filter_multimappers(align_file, data):
    """
    It does not seem like bowtie2 has a corollary to the -m 1 flag in bowtie,
    there are some options that are close but don't do the same thing. Bowtie2
    sets the XS flag for reads mapping in more than one place, so we can just
    filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and not duplicate and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
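# For comparison, a hedged sketch (hypothetical file names and core count) of
# the command the BWA-oriented variants above build for a paired-end BAM.
# XA marks alternative hits and SA marks split/chimeric alignments, so
# requiring both to be null keeps only uniquely mapped primary reads; the -L
# restriction is added when a variant-regions or callable BED file exists:
#
#   sambamba view -h --nthreads 4 -f bam -L callable.bed \
#     -F "not unmapped and paired and proper_pair and [XA] == null and [SA] == null and not supplementary " \
#     sample.bam > sample.unique.bam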
def _get_samples_to_process(fn, out_dir, config):
    """parse csv file with one line per file. It will merge
    all files that have the same description name"""
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            if not l.startswith("samplename"):
                cols = l.strip().split(",")
                samples[cols[1]].append(cols)
    for sample, items in samples.iteritems():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{'files': _check_paired(files),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
def htseq_reader(align_file):
    """
    returns a read-by-read sequence reader for a BAM or SAM file
    """
    if bam.is_sam(align_file):
        read_seq = HTSeq.SAM_Reader(align_file)
    elif bam.is_bam(align_file):
        read_seq = HTSeq.BAM_Reader(align_file)
    else:
        logger.error("%s is not a SAM or BAM file" % (align_file))
        sys.exit(1)
    return read_seq
def process_lane(item):
    """Prepare lanes, potentially splitting based on barcodes and
    reducing the number of reads for a test run
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % item["rgnames"]["lane"])
    file1, file2 = get_fastq_files(item)
    if item.get("test_run", False):
        if bam.is_bam(file1):
            file1 = bam.downsample(file1, item, NUM_DOWNSAMPLE)
        else:
            file1, file2 = fastq.downsample(file1, file2, item,
                                            NUM_DOWNSAMPLE, quick=True)
    item["files"] = (file1, file2)
    return [item]
def prepare_sample(data):
    """Prepare a sample to be run, potentially converting from BAM to FASTQ
    and/or downsampling the number of reads for a test run
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % data["rgnames"]["sample"])
    file1, file2 = get_fastq_files(data)
    if data.get("test_run", False):
        if bam.is_bam(file1):
            file1 = bam.downsample(file1, data, NUM_DOWNSAMPLE)
            file2 = None
        else:
            file1, file2 = fastq.downsample(file1, file2, data,
                                            NUM_DOWNSAMPLE, quick=True)
    data["files"] = [file1, file2]
    return [[data]]
def process_lane(item):
    """Prepare lanes, potentially splitting based on barcodes and
    reducing the number of reads for a test run
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % item["rgnames"]["lane"])
    file1, file2 = get_fastq_files(item)
    if item.get("test_run", False):
        if bam.is_bam(file1):
            file1 = bam.downsample(file1, item, NUM_DOWNSAMPLE)
            file2 = None
        else:
            file1, file2 = fastq.downsample(file1, file2, item,
                                            NUM_DOWNSAMPLE, quick=True)
    item["files"] = [file1, file2]
    return [[item]]
def pick_kmersize(fq):
    """
    pick an appropriate kmer size based off of https://www.biostars.org/p/201474/
    tl;dr version: pick 31 unless the reads are very small, if not then guess
    that readlength / 2 is about right.
    """
    if bam.is_bam(fq):
        readlength = bam.estimate_read_length(fq)
    else:
        readlength = fastq.estimate_read_length(fq)
    halfread = int(round(readlength / 2))
    if halfread >= 31:
        kmersize = 31
    else:
        kmersize = halfread
    if kmersize % 2 == 0:
        kmersize += 1
    return kmersize
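# A few worked examples (hypothetical read lengths, not from the source) of
# the heuristic above:
#   read length 150 -> half is 75, capped at 31           -> kmersize 31
#   read length 50  -> half is 25, already odd            -> kmersize 25
#   read length 36  -> half is 18, bumped to the next odd -> kmersize 19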
def _get_samples_to_process(fn, out_dir, config):
    """parse csv file with one line per file. It will merge
    all files that have the same description name"""
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            if not l.startswith("samplename"):
                cols = l.strip().split(",")
                samples[cols[1]].append(cols)
    for sample, items in samples.iteritems():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{'files': _check_paired(files),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    return [[data]]
def get_files(target_files, config):
    out = []
    for fname_in in target_files.keys():
        if isinstance(fname_in, (list, tuple)):
            fnames = fname_in
        else:
            fnames = fname_in.split(";")
        for fname in fnames:
            if os.path.exists(fname):
                out.append(fname)
            else:
                added = False
                for dirname in config["inputs"]:
                    for f in glob.glob(os.path.join(dirname, fname) + "*"):
                        if bam.is_bam(f) or fastq.is_fastq(f):
                            if os.path.exists(f):
                                out.append(f)
                                added = True
                assert added, "Did not find files %s in directories %s" % \
                    (fname, config["inputs"])
    return out
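# Hedged example (made-up paths, not from the source) of the lookup above: a
# target name that is not an existing path is globbed against each configured
# input directory, keeping only BAM/FASTQ matches:
#   target "sampleA" with config["inputs"] = ["/data/run1"]
#   -> glob "/data/run1/sampleA*"
#   -> ["/data/run1/sampleA_R1.fastq.gz", "/data/run1/sampleA_R2.fastq.gz"]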
def filter_multimappers(align_file):
    """
    It does not seem like bowtie2 has a corollary to the -m 1 flag in bowtie,
    there are some options that are close but don't do the same thing. Bowtie2
    sets the XS flag for reads mapping in more than one place, so we can just
    filter on that. This will not work for other aligners.
    """
    type_flag = "b" if bam.is_bam(align_file) else ""
    base, ext = os.path.splitext(align_file)
    align_handle = pysam.Samfile(align_file, "r" + type_flag)
    tmp_out_file = os.path.splitext(align_file)[0] + ".tmp"

    def keep_fn(read):
        return _is_properly_mapped(read) and _is_unique(read)

    keep = ifilter(keep_fn, align_handle)
    with pysam.Samfile(tmp_out_file, "w" + type_flag,
                       template=align_handle) as out_handle:
        for read in keep:
            out_handle.write(read)
    align_handle.close()
    out_handle.close()
    os.rename(tmp_out_file, align_file)
    return align_file
def run_salmon_decoy(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    index = salmon_decoy_index(gtf_file, data, os.path.dirname(salmon_dir))
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, data, index)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
def _get_samples_to_process(fn, out_dir, config):
    """parse csv file with one line per file. It will merge
    all files that have the same description name"""
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            cols = l.strip().split(",")
            if len(cols) < 2:
                raise ValueError("Line needs 2 values: file and name.")
            if utils.file_exists(cols[0]):
                samples[cols[1]].append(cols)
            else:
                logger.info("skipping %s, File doesn't exist." % cols[0])
    for sample, items in samples.iteritems():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{'files': _check_paired(files),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
def htseq_count(data):
    """ adapted from Simon Anders htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html
    """
    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0
    if file_exists(out_file):
        return out_file
    logger.info("Counting reads mapping to exons in %s using %s as the "
                "annotation and strandedness as %s."
                % (os.path.basename(sam_filename),
                   os.path.basename(gff_filename),
                   _get_strandedness(data["config"])))
    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}
    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()
    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute"
                             % (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'."
                             % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occurred in %s.\n"
                         % gff.get_line_number_string())
        raise
    sys.stderr.write("%d GFF lines processed.\n" % i)
    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n"
                         % feature_type)
    try:
        if bam.is_sam(sam_filename):
            read_seq = HTSeq.SAM_Reader(sam_filename)
        elif bam.is_bam(sam_filename):
            read_seq = HTSeq.BAM_Reader(sam_filename)
        first_read = iter(read_seq).next()
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occurred when reading first line of sam "
                         "file.\n")
        raise
    try:
        if pe_mode:
            read_seq_pe_file = read_seq
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv) for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq, (co.ref_iv for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or \
                   (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue
            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict" or
                      overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0 or
                                    overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                empty += 1
            if i % 100000 == 0:
                sys.stderr.write("%d sam %s processed.\n"
                                 % (i, "lines " if not pe_mode else "line pairs"))
    except:
        if not pe_mode:
            sys.stderr.write("Error occurred in %s.\n"
                             % read_seq.get_line_number_string())
        else:
            sys.stderr.write("Error occurred in %s.\n"
                             % read_seq_pe_file.get_line_number_string())
        raise
    sys.stderr.write("%d sam %s processed.\n"
                     % (i, "lines " if not pe_mode else "line pairs"))
    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))
    with file_transaction(stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)
    return out_file
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        if dd.get_umi_type(data) == "dragen":
            assert bam.is_bam(fastq1), f"umi_type: dragen needs a BAM file as input."
            data = dragen.fix_umi_dragen_bam(data, bam=fastq1)
            # fastq1 = bam.sort(fastq1, dd.get_config(data))
            # bam.index(fastq1, dd.get_config(data))
            # data["work_bam"] = fastq1
        else:
            logger.info("Aligning lane %s with %s aligner"
                        % (data["rgnames"]["lane"], aligner))
            data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2 or dd.get_umi_type(data) == "dragen":
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError("Single fastq input for UMI processing; fgbio needs paired reads: %s"
                                 % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file,
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                                           dd.get_sample_name(data)))
                out_file = os.path.join(work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = _link_bam_file(fastq1, os.path.join(dd.get_work_dir(data), "prealign",
                                                          dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn't need bam
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is it empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]