Example #1
def bed_to_interval(orig_bed, bam_file):
    """Add header and format BED bait and target files for Picard if necessary.
    """
    with open(orig_bed) as in_handle:
        line = in_handle.readline()
    if line.startswith("@"):
        yield orig_bed
    else:
        with pysam.Samfile(bam_file, "rb") as bam_handle:
            header = bam_handle.text
        with tmpfile(dir=os.path.dirname(orig_bed),
                     prefix="picardbed") as tmp_bed:
            with open(tmp_bed, "w") as out_handle:
                out_handle.write(header)
                with open(orig_bed) as in_handle:
                    for i, line in enumerate(in_handle):
                        parts = line.rstrip().split("\t")
                        if len(parts) == 4:
                            chrom, start, end, name = parts
                            strand = "+"
                        elif len(parts) >= 3:
                            chrom, start, end = parts[:3]
                            strand = "+"
                            name = "r%s" % i
                        out = [chrom, start, end, strand, name]
                        out_handle.write("\t".join(out) + "\n")
            yield tmp_bed
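Every example on this page depends on a `tmpfile` helper (usually imported as `utils.tmpfile`) that yields a temporary file path and cleans it up when the block exits. The helper itself is not shown in any snippet; the following is only a minimal sketch of what it plausibly does, built on the standard library and matching the `prefix=`/`dir=`/`suffix=` keywords seen at the call sites.

import contextlib
import os
import tempfile

@contextlib.contextmanager
def tmpfile(*args, **kwargs):
    """Yield a temporary file path and remove it on exit (sketch only)."""
    fd, fname = tempfile.mkstemp(*args, **kwargs)
    os.close(fd)
    try:
        yield fname
    finally:
        if os.path.exists(fname):
            os.remove(fname)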
Example #2
def _longest_frame(rec, work_dir):
    """Find the longest translatable frame using EMBOSS sixpack.
    """
    lengths = []
    with utils.tmpfile(prefix="insix", dir=work_dir) as in_file:
        with utils.tmpfile(prefix="outsix", dir=work_dir) as out_file:
            with open(in_file, "w") as out_handle:
                SeqIO.write([rec], out_handle, "fasta")
            cl = ["sixpack", "-sequence", in_file, "-outseq", out_file,
                  "-outfile", "/dev/null"]
            with open("/dev/null", "w") as out:
                subprocess.check_call(cl, stderr=out)
            with open(out_file) as in_handle:
                for rec in SeqIO.parse(in_handle, "fasta"):
                    lengths.append(len(rec.seq))
    return max(lengths) * 3
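For reference, `_longest_frame` takes a Biopython `SeqRecord` and returns the length, in nucleotides, of the longest frame that EMBOSS sixpack can translate. A hedged usage sketch (the sequence is illustrative; Biopython and the `sixpack` binary are assumed to be installed):

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

rec = SeqRecord(Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"), id="demo")
longest_nt = _longest_frame(rec, "/tmp")  # protein length of the longest frame * 3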
Example #3
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            resources = config_utils.get_resources("bamtools", config)
            max_mem = resources.get("memory", "2048")
            with file_transaction(out_file) as tx_out_file:
                with utils.tmpfile(dir=work_dir,
                                   prefix="bammergelist") as bam_file_list:
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{bamtools} sort -mem {max_mem} -out {tx_out_file}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Example #4
def coverage(data):
    bed_file = dd.get_coverage_experimental(data)
    if not bed_file:
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = os.path.splitext(os.path.basename(in_bam))[0]
        logger.debug("doing coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        if not file_exists(parse_file):
            total_cov = cov_class(0, None, sample)
            bam_api = pysam.AlignmentFile(in_bam)
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    print >>out_handle, "#chrom\tstart\tend\tregion\treads\tstrand\tsize\tsample\tmean\tsdt\tq10\tq20\tq4\tq50"
                with tmpfile() as tx_tmp_file:
                    # tx_tmp_file = "tmpintersect"
                    for line in region_bed:
                        chrom = line.chrom
                        start = max(line.start, 0)
                        end = line.end
                        region_file = pybedtools.BedTool(str(line), from_string=True).saveas().fn
                        coords = "%s:%s-%s" % (chrom, start, end)
                        cmd = ("samtools view -b {in_bam} {coords} | "
                               "bedtools coverage -a {region_file} -b - -hist > {tx_tmp_file}")
                        _silence_run(cmd.format(**locals()))
                        total_cov = _get_exome_coverage_stats(os.path.abspath(tx_tmp_file), sample, out_tx, total_cov)
            total_cov.write_coverage(parse_total_file)
        data['coverage'] = os.path.abspath(parse_file)
        return data
Example #5
def bed_to_interval(orig_bed, bam_file):
    """Add header and format BED bait and target files for Picard if necessary.
    """
    with open(orig_bed) as in_handle:
        line = in_handle.readline()
    if line.startswith("@"):
        yield orig_bed
    else:
        with pysam.Samfile(bam_file, "rb") as bam_handle:
            header = bam_handle.text
        with tmpfile(dir=os.path.dirname(orig_bed), prefix="picardbed") as tmp_bed:
            with open(tmp_bed, "w") as out_handle:
                out_handle.write(header)
                with open(orig_bed) as in_handle:
                    for i, line in enumerate(in_handle):
                        parts = line.rstrip().split("\t")
                        if len(parts) == 4:
                            chrom, start, end, name = parts
                            strand = "+"
                        elif len(parts) >= 3:
                            chrom, start, end = parts[:3]
                            strand = "+"
                            name = "r%s" % i
                        out = [chrom, start, end, strand, name]
                        out_handle.write("\t".join(out) + "\n")
            yield tmp_bed
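Note that `bed_to_interval` yields exactly once from inside the `tmpfile` block, so the temporary interval file only exists while the generator is suspended. It is presumably meant to be consumed as a context manager; the wrapping below with `contextlib.contextmanager` and the downstream call are assumptions for illustration, not part of these snippets.

import contextlib

with contextlib.contextmanager(bed_to_interval)("targets.bed", "sample.bam") as ready_bed:
    # ready_bed is either the original BED (if it already has an @-header) or a
    # temporary Picard-style file that is removed when this block exits.
    run_picard_hs_metrics(ready_bed)  # hypothetical downstream call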
Example #6
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = (
                        "{bamtools} merge -list {bam_file_list} | "
                        "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                    )
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Example #7
def _longest_frame(rec, work_dir):
    """Find the longest translatable frame using EMBOSS sixpack.
    """
    lengths = []
    with utils.tmpfile(prefix="insix", dir=work_dir) as in_file:
        with utils.tmpfile(prefix="outsix", dir=work_dir) as out_file:
            with open(in_file, "w") as out_handle:
                SeqIO.write([rec], out_handle, "fasta")
            cl = [
                "sixpack", "-sequence", in_file, "-outseq", out_file,
                "-outfile", "/dev/null"
            ]
            with open("/dev/null", "w") as out:
                subprocess.check_call(cl, stderr=out)
            with open(out_file) as in_handle:
                for rec in SeqIO.parse(in_handle, "fasta"):
                    lengths.append(len(rec.seq))
    return max(lengths) * 3
Example #8
def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, six.string_types) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target
Example #9
def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, basestring) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target
Example #10
def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    variant_regions = bedutils.population_variant_regions(items, merged=True)
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file,
                                        items)
        if target:
            if isinstance(target, six.string_types) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target
Example #11
def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    config = items[0]["config"]
    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "varaint_regions")), items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, basestring) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target
Example #12
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with utils.curdir_tmpdir() as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        tx_out_prefix = os.path.splitext(tx_out_file)[0]
                        with utils.tmpfile(
                                dir=work_dir,
                                prefix="bammergelist") as bam_file_list:
                            bam_file_list = "%s.list" % os.path.splitext(
                                out_file)[0]
                            with open(bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (
                                merge_cl + " | "
                                "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                            )
                            do.run(cmd.format(**locals()), "Merge bam files",
                                   None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
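The batching here relies on `utils.partition_all` to split the BAM list into chunks no larger than the open-file limit. That helper is not shown in these examples; a minimal sketch of the behaviour it would need (mirroring `toolz.partition_all`, which is an assumption) is:

import itertools

def partition_all(n, iterable):
    """Yield successive chunks of at most n items from iterable (sketch)."""
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk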
Example #13
def coverage(data):
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_coverage_experimental(data)
    if not bed_file:
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    batch_size = max_command_length() / AVERAGE_REGION_STRING_LENGTH

    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        if not file_exists(parse_file):
            total_cov = cov_class(0, None, sample)
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    HEADER = ["#chrom", "start", "end", "region", "reads",
                              "strand", "size", "sample", "mean", "sd", "cutoff10",
                              "cutoff20", "cutoff4", "cutoff50"]
                    out_handle.write("\t".join(HEADER) + "\n")
                with tmpfile() as tx_tmp_file:
                    lcount = 0
                    for chunk in robust_partition_all(batch_size, region_bed):
                        coord_batch = []
                        line_batch = ""
                        for line in chunk:
                            lcount += 1
                            chrom = line.chrom
                            start = max(line.start, 0)
                            end = line.end
                            coords = "%s:%s-%s" % (chrom, start, end)
                            coord_batch.append(coords)
                            line_batch += str(line)
                        if not coord_batch:
                            continue
                        region_file = pybedtools.BedTool(line_batch,
                                                        from_string=True).saveas().fn
                        coord_string = " ".join(coord_batch)
                        cmd = ("samtools view -b {in_bam} {coord_string} | "
                                "bedtools coverage -a {region_file} -b - "
                                "-hist > {tx_tmp_file}")
                        _silence_run(cmd.format(**locals()))
                        total_cov = _get_exome_coverage_stats(os.path.abspath(tx_tmp_file), sample, out_tx, total_cov)
                        logger.debug("Processed %d regions." % lcount)
            total_cov.write_coverage(parse_total_file)
        data['coverage'] = os.path.abspath(parse_file)
        return data
Example #14
def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    config = items[0]["config"]
    variant_regions = bedutils.merge_overlaps(
        utils.get_in(config, ("algorithm", "varaint_regions")), items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file,
                                        items)
        if target:
            if isinstance(target, basestring) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target
Example #15
def bcbb_demultiplex(input_file, barcodes, tmp_dir, config):
    ext = ".fastq"
    base_name = os.path.splitext(input_file)[0]
    metrics_file = "%s_bc.metrics" % base_name
    out_base = "%s_--b--_--r--%s" % (base_name, ext)
    if not os.path.exists(metrics_file):
        with tmpfile(dir=tmp_dir, prefix="bc") as bc_file:
            _write_bcbb_bcfile(barcodes, bc_file)
            cl = [config["program"]["barcode"], bc_file,
                  out_base, input_file,
                  "--mismatch=%s" % (config["algorithm"]["barcode_mismatch"]),
                  "--metrics=%s" % metrics_file]
            subprocess.check_call(cl)
    return [f for f in glob.glob("%s_*_[1-9]%s" % (base_name, ext))
            if f.find("unmatched") == -1]
Example #16
def assemble_clusters(in_file, wcd_out, config):
    """Provide assembled FASTA records based on wcd clustering.
    """
    rec_find = FastaNumToRec(in_file)
    with open(wcd_out) as wcd_handle:
        for line in wcd_handle:
            nums = [int(n) for n in line.rstrip()[:-1].split()]
            if len(nums) == 1:
                yield rec_find[nums[0]]
            else:
                with utils.tmpfile(prefix="incap3", dir=config["dir"]["work"]) as input_file:
                    with open(input_file, "w") as input_handle:
                        SeqIO.write((rec_find.shortname_rec(n) for n in nums),
                                    input_handle, "fasta")
                    yield cap3_assemble(input_file, config)
                    for fname in glob.glob("%s.cap.*" % input_file):
                        os.remove(fname)
Example #17
def sabre_demultiplex(input_file, barcodes, tmp_dir, config):
    """Do barcode de-multiplexing using sabre.

    Sabre only appears to trim off the 5' side of the read so
    currently not supported.
    """
    raise NotImplementedError
    with tmpfile(dir=tmp_dir, prefix="sabrebc") as bc_file:
        out_files, unmatched_file = _write_sabre_bcfile(barcodes, input_file, bc_file)
        if not os.path.exists(unmatched_file) and not os.path.exists(out_files[0]):
            cl = [config["program"]["barcode"], "se",
                  "-m", str(config["algorithm"]["barcode_mismatch"]),
                  "-f", input_file,
                  "-b", bc_file,
                  "-u", unmatched_file]
            subprocess.check_call(cl)
    return out_files
Example #18
def cap3_assemble(in_file, config):
    """Assemble a FASTA file of clustered sequences with CAP3.
    """
    with utils.tmpfile(prefix="outcap3", dir=config["dir"]["work"]) as cap3_file:
        with open(cap3_file, "w") as out_handle:
            cl = ["cap3", in_file]
            subprocess.check_call(cl, stdout=out_handle)
        seqs = []
        with open(cap3_file) as in_handle:
            for line in in_handle:
                if line.startswith("consensus"):
                    seqs.append(line.rstrip().split()[-1])
    with open(in_file) as in_handle:
        names = []
        for rec in SeqIO.parse(in_handle, "fasta"):
            names.append(rec.id)
    return _make_seqrec("-".join(names), "".join(seqs).replace("-", ""))
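`cap3_assemble` only reads the work directory from its config argument, so a minimal call looks like the sketch below (the paths are illustrative and the `cap3` binary is assumed to be on PATH):

config = {"dir": {"work": "/tmp"}}
consensus_rec = cap3_assemble("cluster_members.fa", config)  # one merged SeqRecord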
Example #19
def assemble_clusters(in_file, wcd_out, config):
    """Provide assembled FASTA records based on wcd clustering.
    """
    rec_find = FastaNumToRec(in_file)
    with open(wcd_out) as wcd_handle:
        for line in wcd_handle:
            nums = [int(n) for n in line.rstrip()[:-1].split()]
            if len(nums) == 1:
                yield rec_find[nums[0]]
            else:
                with utils.tmpfile(prefix="incap3",
                                   dir=config["dir"]["work"]) as input_file:
                    with open(input_file, "w") as input_handle:
                        SeqIO.write((rec_find.shortname_rec(n) for n in nums),
                                    input_handle, "fasta")
                    yield cap3_assemble(input_file, config)
                    for fname in glob.glob("%s.cap.*" % input_file):
                        os.remove(fname)
Example #20
def cap3_assemble(in_file, config):
    """Assemble a FASTA file of clustered sequences with CAP3.
    """
    with utils.tmpfile(prefix="outcap3",
                       dir=config["dir"]["work"]) as cap3_file:
        with open(cap3_file, "w") as out_handle:
            cl = ["cap3", in_file]
            subprocess.check_call(cl, stdout=out_handle)
        seqs = []
        with open(cap3_file) as in_handle:
            for line in in_handle:
                if line.startswith("consensus"):
                    seqs.append(line.rstrip().split()[-1])
    with open(in_file) as in_handle:
        names = []
        for rec in SeqIO.parse(in_handle, "fasta"):
            names.append(rec.id)
    return _make_seqrec("-".join(names), "".join(seqs).replace("-", ""))
Example #21
def bcbb_demultiplex(input_file, barcodes, tmp_dir, config):
    ext = ".fastq"
    base_name = os.path.splitext(input_file)[0]
    metrics_file = "%s_bc.metrics" % base_name
    out_base = "%s_--b--_--r--%s" % (base_name, ext)
    if not os.path.exists(metrics_file):
        with tmpfile(dir=tmp_dir, prefix="bc") as bc_file:
            _write_bcbb_bcfile(barcodes, bc_file)
            cl = [
                config["program"]["barcode"], bc_file, out_base, input_file,
                "--mismatch=%s" % (config["algorithm"]["barcode_mismatch"]),
                "--metrics=%s" % metrics_file
            ]
            subprocess.check_call(cl)
    return [
        f for f in glob.glob("%s_*_[1-9]%s" % (base_name, ext))
        if f.find("unmatched") == -1
    ]
Example #22
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            if len(bam_files) > system.open_file_limit():
                raise IOError(
                    "More files to merge (%s) then available open file descriptors (%s)\n"
                    "See documentation on tips for changing file limits:\n"
                    "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                    "parallel.html#tuning-systems-for-scale" %
                    (len(bam_files), system.open_file_limit()))
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir,
                                   prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = (
                        "{bamtools} merge -list {bam_file_list} | "
                        "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                    )
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Example #23
def sabre_demultiplex(input_file, barcodes, tmp_dir, config):
    """Do barcode de-multiplexing using sabre.

    Sabre only appears to trim off the 5' side of the read so
    currently not supported.
    """
    raise NotImplementedError
    with tmpfile(dir=tmp_dir, prefix="sabrebc") as bc_file:
        out_files, unmatched_file = _write_sabre_bcfile(
            barcodes, input_file, bc_file)
        if not os.path.exists(unmatched_file) and not os.path.exists(
                out_files[0]):
            cl = [
                config["program"]["barcode"], "se", "-m",
                str(config["algorithm"]["barcode_mismatch"]), "-f", input_file,
                "-b", bc_file, "-u", unmatched_file
            ]
            subprocess.check_call(cl)
    return out_files
Example #24
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir() as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        tx_out_prefix = os.path.splitext(tx_out_file)[0]
                        with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                            bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                            with open(bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Example #25
def bed_to_interval(orig_bed, bam_file):
    """Add header and format BED bait and target files for Picard if necessary.
    """
    with open(orig_bed) as in_handle:
        line = in_handle.readline()
    if line.startswith("@"):
        yield orig_bed
    else:
        with pysam.Samfile(bam_file, "rb") as bam_handle:
            header = bam_handle.text
        with tmpfile(dir=os.path.dirname(orig_bed), prefix="picardbed") as tmp_bed:
            with open(tmp_bed, "w") as out_handle:
                out_handle.write(header)
                with open(orig_bed) as in_handle:
                    for line in in_handle:
                        parts = line.rstrip().split("\t")
                        if len(parts) == 3:
                            parts.append("+")
                            parts.append("a")
                        out_handle.write("\t".join(parts) + "\n")
            yield tmp_bed
Example #26
def make_refflat(genome_dir):
    """
    makes a refflat file for use with Picard from a GTF file
    """
    gtf_file = get_transcript_gtf(genome_dir)
    base, _ = os.path.splitext(gtf_file)
    refflat_file = base + ".refFlat"
    print "Making %s into a refFlat file named %s." % (gtf_file, refflat_file)
    if file_exists(refflat_file):
        print "%s already exists, skipping." % refflat_file
        return refflat_file

    with tmpfile(dir=os.getcwd(), prefix="genepred") as tmp_file:
        cmd = "gtfToGenePred {gtf_file} {tmp_file}".format(**locals())
        subprocess.check_call(cmd, shell=True)
        with open(tmp_file) as tmp_handle, open(refflat_file, "w") as out_handle:
            for line in tmp_handle:
                l = line.split("\t")
                l = [l[0]] + l
                out_handle.write("\t".join(l) + "\n")
    return refflat_file
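The loop above converts gtfToGenePred output to refFlat by repeating the transcript name as a leading geneName column. A small illustration with a made-up genePred line:

# Hypothetical genePred line; the transcript name is the first column.
line = "NM_000001\tchr1\t+\t1000\t9000\t1200\t8800\t3\t1000,3000,7000,\t2000,4000,9000,"
l = line.split("\t")
refflat_line = "\t".join([l[0]] + l)  # geneName column prepended for refFlat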
Example #27
def make_refflat(genome_dir):
    """
    makes a refflat file for use with Picard from a GTF file
    """
    gtf_file = get_transcript_gtf(genome_dir)
    base, _ = os.path.splitext(gtf_file)
    refflat_file = base + ".refFlat"
    print "Making %s into a refFlat file named %s." % (gtf_file, refflat_file)
    if file_exists(refflat_file):
        print "%s already exists, skipping." % refflat_file
        return refflat_file

    with tmpfile(dir=os.getcwd(), prefix="genepred") as tmp_file:
        cmd = "gtfToGenePred {gtf_file} {tmp_file}".format(**locals())
        subprocess.check_call(cmd, shell=True)
        with open(tmp_file) as tmp_handle, open(refflat_file,
                                                "w") as out_handle:
            for line in tmp_handle:
                l = line.split("\t")
                l = [l[0]] + l
                out_handle.write("\t".join(l) + "\n")
    return refflat_file
Example #28
def bed_to_interval(orig_bed, bam_file):
    """Add header and format BED bait and target files for Picard if necessary.
    """
    with open(orig_bed) as in_handle:
        line = in_handle.readline()
    if line.startswith("@"):
        yield orig_bed
    else:
        bam_handle = pysam.Samfile(bam_file, "rb")
        with contextlib.closing(bam_handle):
            header = bam_handle.text
        with tmpfile(dir=os.getcwd(), prefix="picardbed") as tmp_bed:
            with open(tmp_bed, "w") as out_handle:
                out_handle.write(header)
                with open(orig_bed) as in_handle:
                    for line in in_handle:
                        parts = line.rstrip().split("\t")
                        if len(parts) == 3:
                            parts.append("+")
                            parts.append("a")
                        out_handle.write("\t".join(parts) + "\n")
            yield tmp_bed
Example #29
def coverage(data):
    bed_file = dd.get_coverage_experimental(data)
    if not bed_file:
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = os.path.splitext(os.path.basename(in_bam))[0]
        logger.debug("doing coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        if not file_exists(parse_file):
            total_cov = cov_class(0, None, sample)
            bam_api = pysam.AlignmentFile(in_bam)
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    print >> out_handle, "#chrom\tstart\tend\tregion\treads\tstrand\tsize\tsample\tmean\tsdt\tq10\tq20\tq4\tq50"
                with tmpfile() as tx_tmp_file:
                    # tx_tmp_file = "tmpintersect"
                    for line in region_bed:
                        chrom = line.chrom
                        start = max(line.start, 0)
                        end = line.end
                        region_file = pybedtools.BedTool(
                            str(line), from_string=True).saveas().fn
                        coords = "%s:%s-%s" % (chrom, start, end)
                        cmd = (
                            "samtools view -b {in_bam} {coords} | "
                            "bedtools coverage -a {region_file} -b - -hist > {tx_tmp_file}"
                        )
                        _silence_run(cmd.format(**locals()))
                        total_cov = _get_exome_coverage_stats(
                            os.path.abspath(tx_tmp_file), sample, out_tx,
                            total_cov)
            total_cov.write_coverage(parse_total_file)
        data['coverage'] = os.path.abspath(parse_file)
        return data
Example #30
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses samtools or bamtools for merging, both of which have some caveats.
    samtools can run into file system limits on command line length, while
    bamtools runs into open file handle issues.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with utils.curdir_tmpdir() as tmpdir:
                with utils.chdir(tmpdir):
                    if len(bam_files) < 4096:
                        merge_cl = _samtools_cat(bam_files, tmpdir)
                    else:
                        merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        tx_out_prefix = os.path.splitext(tx_out_file)[0]
                        with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                            bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                            with open(bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Example #31
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            if len(bam_files) > system.open_file_limit():
                raise IOError("More files to merge (%s) then available open file descriptors (%s)\n"
                              "See documentation on tips for changing file limits:\n"
                              "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                              "parallel.html#tuning-systems-for-scale"
                              % (len(bam_files), system.open_file_limit()))
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Example #32
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    with open(bam_file_list, "w") as out_handle:
                        for f in bam_files:
                            out_handle.write("%s\n" % f)
                    cmd = [config_utils.get_program("bamtools", config),
                           "merge", "-list", bam_file_list, "-out", tx_out_file]
                    do.run(cmd, "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Example #33
def consensus(peakfiles, consensusfile, data, pad=250):
    """call consensus peaks from a set of narrow/broad peakfiles
    we use this method:
    https://bedops.readthedocs.io/en/latest/content/usage-examples/master-list.html
    """
    if utils.file_exists(consensusfile):
        return consensusfile

    try:
        bedops = config_utils.get_program("bedops", data)
    except config_utils.CmdNotFound:
        logger.info("bedops not found, skipping consensus peak calling. do a "
                    "--tools update to install bedops.")
        return None
    try:
        sortbed = config_utils.get_program("sort-bed", data)
    except config_utils.CmdNotFound:
        logger.info("sort-bed not found, skipping consensus peak calling. do "
                    "--tools update to install sort-bed.")
        return None
    try:
        bedmap = config_utils.get_program("bedmap", data)
    except config_utils.CmdNotFound:
        logger.info("bedmap not found, skipping consensus peak calling. do a "
                    "--tools update to install bedmap.")
        return None

    logger.info(f"Calling consensus peaks on {','.join(peakfiles)}")
    logger.info(f"Removing low quality peaks from {','.join(peakfiles)}")
    filteredsummits = []
    for fn in peakfiles:
        filteredpeak = NamedTemporaryFile(suffix=".bed", delete=False).name
        df = remove_low_quality_peaks(fn, qval=0.05)
        df.to_csv(filteredpeak, index=False, header=False, sep="\t")
        filteredsummit = peakfile_to_summitfile(filteredpeak)
        filteredsummits.append(filteredsummit)
    peakfiles = filteredsummits

    with file_transaction(consensusfile) as tx_consensus_file:
        message = (f"Combining summits of {' '.join(peakfiles)} and "
                   f"expanding {pad} bases.")
        with utils.tmpfile(suffix=".bed") as tmpbed:
            slopcommand = f"{bedops} --range {pad} -u {' '.join(peakfiles)} > {tmpbed}"
            do.run(slopcommand, message)
            iteration = 0
            while os.path.getsize(tmpbed):
                iteration = iteration + 1
                iterationbed = NamedTemporaryFile(suffix=".bed",
                                                  delete=False).name
                with utils.tmpfile(suffix="bed") as mergedbed, \
                     utils.tmpfile(suffix="bed") as intermediatebed, \
                     utils.tmpfile(suffix="bed") as leftoverbed, \
                     utils.tmpfile(suffix="bed") as tmpsolutionbed:
                    mergecmd = (f"{bedops} -m --range 0:-1 {tmpbed} | "
                                f"{bedops} -u --range 0:1 - > "
                                f"{mergedbed}")
                    message = f"Merging non-overlapping peaks, iteration {iteration}."
                    do.run(mergecmd, message)
                    nitems = len(open(mergedbed).readlines())
                    message = f"Considering {nitems} peaks, choosing the highest score for overlapping peaks."
                    highscorecmd = (
                        f"{bedmap} --max-element {mergedbed} {tmpbed} |"
                        f"{sortbed} - > "
                        f"{iterationbed}")
                    do.run(highscorecmd, message)
                    message = f"Checking if there are peaks left to merge."
                    anyleftcmd = (
                        f"{bedops} -n 1 {tmpbed} {iterationbed} > {intermediatebed}"
                    )
                    do.run(anyleftcmd, message)
                    shutil.move(intermediatebed, tmpbed)
                    nitems = len(open(iterationbed).readlines())
                    message = f"Adding {nitems} peaks to consensus peaks."
                    if utils.file_exists(tx_consensus_file):
                        consensuscmd = (
                            f"{bedops} -u {tx_consensus_file} {iterationbed} > {tmpsolutionbed}"
                        )
                        do.run(consensuscmd, message)
                        shutil.move(tmpsolutionbed, tx_consensus_file)
                    else:
                        shutil.move(iterationbed, tx_consensus_file)
    return consensusfile
Example #34
def blat_search(rec, db, tmp_dir):
    with utils.tmpfile(prefix="inblat", dir=tmp_dir) as in_file:
        with open(in_file, "w") as out_handle:
            SeqIO.write([rec], out_handle, "fasta")
        with utils.tmpfile(prefix="outblat", dir=tmp_dir) as blat_out:
            return _do_blat(in_file, db, blat_out)
Example #35
def blat_search(rec, db, tmp_dir):
    with utils.tmpfile(prefix="inblat", dir=tmp_dir) as in_file:
        with open(in_file, "w") as out_handle:
            SeqIO.write([rec], out_handle, "fasta")
        with utils.tmpfile(prefix="outblat", dir=tmp_dir) as blat_out:
            return _do_blat(in_file, db, blat_out)