def bed_to_interval(orig_bed, bam_file):
    """Add header and format BED bait and target files for Picard if necessary.
    """
    with open(orig_bed) as in_handle:
        line = in_handle.readline()
    if line.startswith("@"):
        yield orig_bed
    else:
        with pysam.Samfile(bam_file, "rb") as bam_handle:
            header = bam_handle.text
        with tmpfile(dir=os.path.dirname(orig_bed), prefix="picardbed") as tmp_bed:
            with open(tmp_bed, "w") as out_handle:
                out_handle.write(header)
                with open(orig_bed) as in_handle:
                    for i, line in enumerate(in_handle):
                        parts = line.rstrip().split("\t")
                        if len(parts) == 4:
                            chrom, start, end, name = parts
                            strand = "+"
                        elif len(parts) >= 3:
                            chrom, start, end = parts[:3]
                            strand = "+"
                            name = "r%s" % i
                        out = [chrom, start, end, strand, name]
                        out_handle.write("\t".join(out) + "\n")
            yield tmp_bed

def _longest_frame(rec, work_dir):
    """Find the longest translatable frame using EMBOSS sixpack.
    """
    lengths = []
    with utils.tmpfile(prefix="insix", dir=work_dir) as in_file:
        with utils.tmpfile(prefix="outsix", dir=work_dir) as out_file:
            with open(in_file, "w") as out_handle:
                SeqIO.write([rec], out_handle, "fasta")
            cl = ["sixpack", "-sequence", in_file, "-outseq", out_file,
                  "-outfile", "/dev/null"]
            with open("/dev/null", "w") as out:
                subprocess.check_call(cl, stderr=out)
            with open(out_file) as in_handle:
                for rec in SeqIO.parse(in_handle, "fasta"):
                    lengths.append(len(rec.seq))
    return max(lengths) * 3

def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            resources = config_utils.get_resources("bamtools", config)
            max_mem = resources.get("memory", "2048")
            with file_transaction(out_file) as tx_out_file:
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{bamtools} sort -mem {max_mem} -out {tx_out_file}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file

def coverage(data):
    bed_file = dd.get_coverage_experimental(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = os.path.splitext(os.path.basename(in_bam))[0]
        logger.debug("doing coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        if not file_exists(parse_file):
            total_cov = cov_class(0, None, sample)
            bam_api = pysam.AlignmentFile(in_bam)
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    print >>out_handle, "#chrom\tstart\tend\tregion\treads\tstrand\tsize\tsample\tmean\tsdt\tq10\tq20\tq4\tq50"
                with tmpfile() as tx_tmp_file:
                    # tx_tmp_file = "tmpintersect"
                    for line in region_bed:
                        chrom = line.chrom
                        start = max(line.start, 0)
                        end = line.end
                        region_file = pybedtools.BedTool(str(line), from_string=True).saveas().fn
                        coords = "%s:%s-%s" % (chrom, start, end)
                        cmd = ("samtools view -b {in_bam} {coords} | "
                               "bedtools coverage -a {region_file} -b - -hist > {tx_tmp_file}")
                        _silence_run(cmd.format(**locals()))
                        total_cov = _get_exome_coverage_stats(os.path.abspath(tx_tmp_file),
                                                              sample, out_tx, total_cov)
            total_cov.write_coverage(parse_total_file)
        data['coverage'] = os.path.abspath(parse_file)
    return data

def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file

def _longest_frame(rec, work_dir):
    """Find the longest translatable frame using EMBOSS sixpack.
    """
    lengths = []
    with utils.tmpfile(prefix="insix", dir=work_dir) as in_file:
        with utils.tmpfile(prefix="outsix", dir=work_dir) as out_file:
            with open(in_file, "w") as out_handle:
                SeqIO.write([rec], out_handle, "fasta")
            cl = ["sixpack", "-sequence", in_file, "-outseq", out_file,
                  "-outfile", "/dev/null"]
            with open("/dev/null", "w") as out:
                subprocess.check_call(cl, stderr=out)
            with open(out_file) as in_handle:
                for rec in SeqIO.parse(in_handle, "fasta"):
                    lengths.append(len(rec.seq))
    return max(lengths) * 3

def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, six.string_types) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target

def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items),
                                              items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, basestring) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target

def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    config = items[0]["config"]
    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, basestring) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target

def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir() as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        tx_out_prefix = os.path.splitext(tx_out_file)[0]
                        with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                            bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                            with open(bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file

def coverage(data):
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_coverage_experimental(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    batch_size = max_command_length() / AVERAGE_REGION_STRING_LENGTH
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        if not file_exists(parse_file):
            total_cov = cov_class(0, None, sample)
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    HEADER = ["#chrom", "start", "end", "region", "reads",
                              "strand", "size", "sample", "mean", "sd", "cutoff10",
                              "cutoff20", "cutoff4", "cutoff50"]
                    out_handle.write("\t".join(HEADER) + "\n")
                with tmpfile() as tx_tmp_file:
                    lcount = 0
                    for chunk in robust_partition_all(batch_size, region_bed):
                        coord_batch = []
                        line_batch = ""
                        for line in chunk:
                            lcount += 1
                            chrom = line.chrom
                            start = max(line.start, 0)
                            end = line.end
                            coords = "%s:%s-%s" % (chrom, start, end)
                            coord_batch.append(coords)
                            line_batch += str(line)
                        if not coord_batch:
                            continue
                        region_file = pybedtools.BedTool(line_batch,
                                                         from_string=True).saveas().fn
                        coord_string = " ".join(coord_batch)
                        cmd = ("samtools view -b {in_bam} {coord_string} | "
                               "bedtools coverage -a {region_file} -b - "
                               "-hist > {tx_tmp_file}")
                        _silence_run(cmd.format(**locals()))
                        total_cov = _get_exome_coverage_stats(os.path.abspath(tx_tmp_file),
                                                              sample, out_tx, total_cov)
                        logger.debug("Processed %d regions." % lcount)
            total_cov.write_coverage(parse_total_file)
        data['coverage'] = os.path.abspath(parse_file)
    return data

def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    config = items[0]["config"]
    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, basestring) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target

def bcbb_demultiplex(input_file, barcodes, tmp_dir, config):
    ext = ".fastq"
    base_name = os.path.splitext(input_file)[0]
    metrics_file = "%s_bc.metrics" % base_name
    out_base = "%s_--b--_--r--%s" % (base_name, ext)
    if not os.path.exists(metrics_file):
        with tmpfile(dir=tmp_dir, prefix="bc") as bc_file:
            _write_bcbb_bcfile(barcodes, bc_file)
            cl = [config["program"]["barcode"], bc_file, out_base, input_file,
                  "--mismatch=%s" % (config["algorithm"]["barcode_mismatch"]),
                  "--metrics=%s" % metrics_file]
            subprocess.check_call(cl)
    return [f for f in glob.glob("%s_*_[1-9]%s" % (base_name, ext))
            if f.find("unmatched") == -1]

def assemble_clusters(in_file, wcd_out, config):
    """Provide assembled FASTA records based on wcd clustering.
    """
    rec_find = FastaNumToRec(in_file)
    with open(wcd_out) as wcd_handle:
        for line in wcd_handle:
            nums = [int(n) for n in line.rstrip()[:-1].split()]
            if len(nums) == 1:
                yield rec_find[nums[0]]
            else:
                with utils.tmpfile(prefix="incap3", dir=config["dir"]["work"]) as input_file:
                    with open(input_file, "w") as input_handle:
                        SeqIO.write((rec_find.shortname_rec(n) for n in nums),
                                    input_handle, "fasta")
                    yield cap3_assemble(input_file, config)
                    for fname in glob.glob("%s.cap.*" % input_file):
                        os.remove(fname)

def sabre_demultiplex(input_file, barcodes, tmp_dir, config):
    """Do barcode de-multiplexing using sabre.

    Sabre only appears to trim off the 5' side of the read so currently
    not supported.
    """
    raise NotImplementedError
    with tmpfile(dir=tmp_dir, prefix="sabrebc") as bc_file:
        out_files, unmatched_file = _write_sabre_bcfile(barcodes, input_file, bc_file)
        if not os.path.exists(unmatched_file) and not os.path.exists(out_files[0]):
            cl = [config["program"]["barcode"], "se",
                  "-m", str(config["algorithm"]["barcode_mismatch"]),
                  "-f", input_file, "-b", bc_file, "-u", unmatched_file]
            subprocess.check_call(cl)
    return out_files

def cap3_assemble(in_file, config):
    """Assemble a FASTA file of clustered sequences with CAP3.
    """
    with utils.tmpfile(prefix="outcap3", dir=config["dir"]["work"]) as cap3_file:
        with open(cap3_file, "w") as out_handle:
            cl = ["cap3", in_file]
            subprocess.check_call(cl, stdout=out_handle)
        seqs = []
        with open(cap3_file) as in_handle:
            for line in in_handle:
                if line.startswith("consensus"):
                    seqs.append(line.rstrip().split()[-1])
    with open(in_file) as in_handle:
        names = []
        for rec in SeqIO.parse(in_handle, "fasta"):
            names.append(rec.id)
    return _make_seqrec("-".join(names), "".join(seqs).replace("-", ""))

def bcbb_demultiplex(input_file, barcodes, tmp_dir, config):
    ext = ".fastq"
    base_name = os.path.splitext(input_file)[0]
    metrics_file = "%s_bc.metrics" % base_name
    out_base = "%s_--b--_--r--%s" % (base_name, ext)
    if not os.path.exists(metrics_file):
        with tmpfile(dir=tmp_dir, prefix="bc") as bc_file:
            _write_bcbb_bcfile(barcodes, bc_file)
            cl = [config["program"]["barcode"], bc_file, out_base, input_file,
                  "--mismatch=%s" % (config["algorithm"]["barcode_mismatch"]),
                  "--metrics=%s" % metrics_file]
            subprocess.check_call(cl)
    return [f for f in glob.glob("%s_*_[1-9]%s" % (base_name, ext))
            if f.find("unmatched") == -1]

def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            if len(bam_files) > system.open_file_limit():
                raise IOError("More files to merge (%s) than available open file descriptors (%s)\n"
                              "See documentation on tips for changing file limits:\n"
                              "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                              "parallel.html#tuning-systems-for-scale"
                              % (len(bam_files), system.open_file_limit()))
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file

def sabre_demultiplex(input_file, barcodes, tmp_dir, config):
    """Do barcode de-multiplexing using sabre.

    Sabre only appears to trim off the 5' side of the read so currently
    not supported.
    """
    raise NotImplementedError
    with tmpfile(dir=tmp_dir, prefix="sabrebc") as bc_file:
        out_files, unmatched_file = _write_sabre_bcfile(barcodes, input_file, bc_file)
        if not os.path.exists(unmatched_file) and not os.path.exists(out_files[0]):
            cl = [config["program"]["barcode"], "se",
                  "-m", str(config["algorithm"]["barcode_mismatch"]),
                  "-f", input_file, "-b", bc_file, "-u", unmatched_file]
            subprocess.check_call(cl)
    return out_files

def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir() as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        tx_out_prefix = os.path.splitext(tx_out_file)[0]
                        with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                            bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                            with open(bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file

def bed_to_interval(orig_bed, bam_file):
    """Add header and format BED bait and target files for Picard if necessary.
    """
    with open(orig_bed) as in_handle:
        line = in_handle.readline()
    if line.startswith("@"):
        yield orig_bed
    else:
        with pysam.Samfile(bam_file, "rb") as bam_handle:
            header = bam_handle.text
        with tmpfile(dir=os.path.dirname(orig_bed), prefix="picardbed") as tmp_bed:
            with open(tmp_bed, "w") as out_handle:
                out_handle.write(header)
                with open(orig_bed) as in_handle:
                    for line in in_handle:
                        parts = line.rstrip().split("\t")
                        if len(parts) == 3:
                            parts.append("+")
                            parts.append("a")
                        out_handle.write("\t".join(parts) + "\n")
            yield tmp_bed

def make_refflat(genome_dir):
    """
    makes a refflat file for use with Picard from a GTF file
    """
    gtf_file = get_transcript_gtf(genome_dir)
    base, _ = os.path.splitext(gtf_file)
    refflat_file = base + ".refFlat"
    print "Making %s into a refFlat file named %s." % (gtf_file, refflat_file)
    if file_exists(refflat_file):
        print "%s already exists, skipping." % refflat_file
        return refflat_file
    with tmpfile(dir=os.getcwd(), prefix="genepred") as tmp_file:
        cmd = "gtfToGenePred {gtf_file} {tmp_file}".format(**locals())
        subprocess.check_call(cmd, shell=True)
        with open(tmp_file) as tmp_handle, open(refflat_file, "w") as out_handle:
            for line in tmp_handle:
                l = line.split("\t")
                l = [l[0]] + l
                out_handle.write("\t".join(l) + "\n")
    return refflat_file

def bed_to_interval(orig_bed, bam_file):
    """Add header and format BED bait and target files for Picard if necessary.
    """
    with open(orig_bed) as in_handle:
        line = in_handle.readline()
    if line.startswith("@"):
        yield orig_bed
    else:
        bam_handle = pysam.Samfile(bam_file, "rb")
        with contextlib.closing(bam_handle):
            header = bam_handle.text
        with tmpfile(dir=os.getcwd(), prefix="picardbed") as tmp_bed:
            with open(tmp_bed, "w") as out_handle:
                out_handle.write(header)
                with open(orig_bed) as in_handle:
                    for line in in_handle:
                        parts = line.rstrip().split("\t")
                        if len(parts) == 3:
                            parts.append("+")
                            parts.append("a")
                        out_handle.write("\t".join(parts) + "\n")
            yield tmp_bed

def coverage(data):
    bed_file = dd.get_coverage_experimental(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = os.path.splitext(os.path.basename(in_bam))[0]
        logger.debug("doing coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        if not file_exists(parse_file):
            total_cov = cov_class(0, None, sample)
            bam_api = pysam.AlignmentFile(in_bam)
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    print >>out_handle, "#chrom\tstart\tend\tregion\treads\tstrand\tsize\tsample\tmean\tsdt\tq10\tq20\tq4\tq50"
                with tmpfile() as tx_tmp_file:
                    # tx_tmp_file = "tmpintersect"
                    for line in region_bed:
                        chrom = line.chrom
                        start = max(line.start, 0)
                        end = line.end
                        region_file = pybedtools.BedTool(str(line), from_string=True).saveas().fn
                        coords = "%s:%s-%s" % (chrom, start, end)
                        cmd = ("samtools view -b {in_bam} {coords} | "
                               "bedtools coverage -a {region_file} -b - -hist > {tx_tmp_file}")
                        _silence_run(cmd.format(**locals()))
                        total_cov = _get_exome_coverage_stats(os.path.abspath(tx_tmp_file),
                                                              sample, out_tx, total_cov)
            total_cov.write_coverage(parse_total_file)
        data['coverage'] = os.path.abspath(parse_file)
    return data

def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses samtools or bamtools for merging, both of which have some caveats.
    samtools can run into file system limits on command line length, while
    bamtools runs into open file handle issues.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with utils.curdir_tmpdir() as tmpdir:
                with utils.chdir(tmpdir):
                    if len(bam_files) < 4096:
                        merge_cl = _samtools_cat(bam_files, tmpdir)
                    else:
                        merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        tx_out_prefix = os.path.splitext(tx_out_file)[0]
                        with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                            bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                            with open(bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file

def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            if len(bam_files) > system.open_file_limit():
                raise IOError("More files to merge (%s) than available open file descriptors (%s)\n"
                              "See documentation on tips for changing file limits:\n"
                              "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                              "parallel.html#tuning-systems-for-scale"
                              % (len(bam_files), system.open_file_limit()))
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file

def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    with open(bam_file_list, "w") as out_handle:
                        for f in bam_files:
                            out_handle.write("%s\n" % f)
                    cmd = [config_utils.get_program("bamtools", config),
                           "merge", "-list", bam_file_list,
                           "-out", tx_out_file]
                    do.run(cmd, "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file

def consensus(peakfiles, consensusfile, data, pad=250):
    """call consensus peaks from a set of narrow/broad peakfiles
    we use this method:
    https://bedops.readthedocs.io/en/latest/content/usage-examples/master-list.html
    """
    if utils.file_exists(consensusfile):
        return consensusfile

    try:
        bedops = config_utils.get_program("bedops", data)
    except config_utils.CmdNotFound:
        logger.info("bedops not found, skipping consensus peak calling. do a "
                    "--tools update to install bedops.")
        return None
    try:
        sortbed = config_utils.get_program("sort-bed", data)
    except config_utils.CmdNotFound:
        logger.info("sort-bed not found, skipping consensus peak calling. do "
                    "--tools update to install sort-bed.")
        return None
    try:
        bedmap = config_utils.get_program("bedmap", data)
    except config_utils.CmdNotFound:
        logger.info("bedmap not found, skipping consensus peak calling. do a "
                    "--tools update to install bedmap.")
        return None

    logger.info(f"Calling consensus peaks on {','.join(peakfiles)}")
    logger.info(f"Removing low quality peaks from {','.join(peakfiles)}")
    filteredsummits = []
    for fn in peakfiles:
        filteredpeak = NamedTemporaryFile(suffix=".bed", delete=False).name
        df = remove_low_quality_peaks(fn, qval=0.05)
        df.to_csv(filteredpeak, index=False, header=False, sep="\t")
        filteredsummit = peakfile_to_summitfile(filteredpeak)
        filteredsummits.append(filteredsummit)
    peakfiles = filteredsummits

    with file_transaction(consensusfile) as tx_consensus_file:
        message = (f"Combining summits of {' '.join(peakfiles)} and "
                   f"expanding {pad} bases.")
        with utils.tmpfile(suffix=".bed") as tmpbed:
            slopcommand = f"{bedops} --range {pad} -u {' '.join(peakfiles)} > {tmpbed}"
            do.run(slopcommand, message)
            iteration = 0
            while os.path.getsize(tmpbed):
                iteration = iteration + 1
                iterationbed = NamedTemporaryFile(suffix=".bed", delete=False).name
                with utils.tmpfile(suffix="bed") as mergedbed, \
                     utils.tmpfile(suffix="bed") as intermediatebed, \
                     utils.tmpfile(suffix="bed") as leftoverbed, \
                     utils.tmpfile(suffix="bed") as tmpsolutionbed:
                    mergecmd = (f"{bedops} -m --range 0:-1 {tmpbed} | "
                                f"{bedops} -u --range 0:1 - > "
                                f"{mergedbed}")
                    message = f"Merging non-overlapping peaks, iteration {iteration}."
                    do.run(mergecmd, message)
                    nitems = len(open(mergedbed).readlines())
                    message = f"Considering {nitems} peaks, choosing the highest score for overlapping peaks."
                    highscorecmd = (f"{bedmap} --max-element {mergedbed} {tmpbed} |"
                                    f"{sortbed} - > "
                                    f"{iterationbed}")
                    do.run(highscorecmd, message)
                    message = f"Checking if there are peaks left to merge."
                    anyleftcmd = (f"{bedops} -n 1 {tmpbed} {iterationbed} > {intermediatebed}")
                    do.run(anyleftcmd, message)
                    shutil.move(intermediatebed, tmpbed)
                    nitems = len(open(iterationbed).readlines())
                    message = f"Adding {nitems} peaks to consensus peaks."
                    if utils.file_exists(tx_consensus_file):
                        consensuscmd = (f"{bedops} -u {tx_consensus_file} {iterationbed} > {tmpsolutionbed}")
                        do.run(consensuscmd, message)
                        shutil.move(tmpsolutionbed, tx_consensus_file)
                    else:
                        shutil.move(iterationbed, tx_consensus_file)
    return consensusfile

def blat_search(rec, db, tmp_dir):
    with utils.tmpfile(prefix="inblat", dir=tmp_dir) as in_file:
        with open(in_file, "w") as out_handle:
            SeqIO.write([rec], out_handle, "fasta")
        with utils.tmpfile(prefix="outblat", dir=tmp_dir) as blat_out:
            return _do_blat(in_file, db, blat_out)