Example #1
0
def breakpoints_by_caller(bed_files):
    """
    Collapse SV intervals into single-base breakpoints annotated by caller.

    given a list of BED files of the form
    chrom start end caller
    return a BedTool of breakpoints as each line with
    the fourth column the caller with evidence for the breakpoint
    chr1 1 10 caller1 -> chr1 1 1 caller1
    chr1 1 20 caller2    chr1 1 1 caller2
                         chr1 10 10 caller1
                         chr1 20 20 caller2

    Returns [] when there are no input intervals or no breakpoints survive.
    """
    merged = concat(bed_files)
    if not merged:
        return []
    # Group on (chrom, start, start) and (chrom, end, end) so each interval
    # edge becomes its own breakpoint record; o=["distinct"] collapses the
    # supporting callers into a comma-separated column 4.
    grouped_start = merged.groupby(
        g=[1, 2,
           2], c=4, o=["distinct"]).filter(lambda r: r.end > r.start).saveas()
    grouped_end = merged.groupby(
        g=[1, 3,
           3], c=4, o=["distinct"]).filter(lambda r: r.end > r.start).saveas()
    together = concat([grouped_start, grouped_end])
    if not together:
        # BUGFIX: previously fell through to an implicit None here; return []
        # so callers get a consistent empty value on both empty paths.
        return []
    # expand turns the comma-separated caller list into one line per caller
    final = together.expand(c=4)
    final = final.sort()
    return final
def summary(items):
    """Build a chanjo coverage database for a batch of samples.

    items is a list of per-sample data dictionaries. The coverage and
    priority BED regions from the first sample are combined into one BED
    file, a chanjo database is built from it, and each sample's work BAM
    is annotated and imported into that database. Each sample dict gains
    a "coverage" entry with the database and incomplete-regions output.
    """
    # depth threshold passed to `chanjo annotate -c`
    cutoff = DEFAULT_COVERAGE_CUTOFF
    # region configuration is taken from the first sample in the batch
    data = items[0]
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    combined_bed = bed.concat([coverage_bed, priority_bed])
    # only run the cleaner when there are intervals; otherwise pass the
    # (empty) file path straight through
    clean_bed = bedutils.clean_file(combined_bed.fn, data) if len(combined_bed) > 0 else combined_bed.fn
    bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))

    # one database per batch; skip the build if it already exists
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file) and utils.file_exists(bed_file):
        with file_transaction(data, out_file) as tx_out_file:
            # chanjo executable is expected alongside the current interpreter
            chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
            cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
            do.run(cmd.format(**locals()), "Prep chanjo database")
            # annotate and import each sample's BAM into the shared database
            for data in items:
                sample = dd.get_sample_name(data)
                bam_file = data["work_bam"]
                cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                       "{bam_file} {bed_file} | "
                       "{chanjo} --db {tx_out_file} import")
                do.run(cmd.format(**locals()), "Chanjo coverage", data)
    # NOTE(review): called even when out_file was never created above --
    # presumably incomplete_regions tolerates a missing database; confirm.
    incomplete = incomplete_regions(out_file, batch, out_dir)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file,
                                "incomplete": incomplete}
        out.append([data])
    return out
Example #3
0
def _merge_sv_calls(bed_files, out_file, data):
    """Merge structural variant BED files into one sorted, merged BedTool.

    Returns a pybedtools.BedTool over out_file, or None when no input
    BED files were supplied.
    """
    if not bed_files:
        return None
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            combined = concat(bed_files)
            combined.sort().merge().saveas(tx_out_file)
    return pybedtools.BedTool(out_file)
Example #4
0
def breakpoints_by_caller(bed_files):
    """Collapse SV intervals into per-caller breakpoint records.

    given a list of BED files of the form
    chrom start end caller
    return a BedTool of breakpoints as each line with
    the fourth column the caller with evidence for the breakpoint
    chr1 1 10 caller1 -> chr1 1 1 caller1
    chr1 1 20 caller2    chr1 1 1 caller2
                         chr1 10 10 caller1
                         chr1 20 20 caller2
    """
    merged = concat(bed_files)
    if not merged:
        return []
    # one record per interval edge, with callers collapsed into column 4
    by_start = merged.groupby(g=[1, 2, 2], c=4, o=["distinct"])
    by_end = merged.groupby(g=[1, 3, 3], c=4, o=["distinct"])
    combined = concat([by_start, by_end])
    # expand the comma-separated caller list into one line per caller
    return combined.expand(c=4).sort()
def summary(items):
    """Build a chanjo coverage database for a batch of samples.

    items is a list of per-sample data dictionaries. Coverage and priority
    BED regions from the first sample are minimized and combined, a chanjo
    database is built from the result, and each sample's work BAM is
    annotated and imported. Each sample dict gains a "coverage" entry with
    the database and per-region coverage output. Returns items unchanged
    when neither a coverage nor a priority BED is configured.
    """
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, "Did not find batch for samples: %s" % ",".join([dd.get_sample_name(x) for x in items])
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        if coverage_bed:
            mini_coverage = bed.minimize(coverage_bed).fn
        if priority_bed:
            mini_priority = bed.minimize(priority_bed).fn
        if coverage_bed and priority_bed:
            combined_bed = bed.concat([mini_coverage, mini_priority]).fn
        elif coverage_bed:
            combined_bed = mini_coverage
        elif priority_bed:
            combined_bed = mini_priority
        else:  # no coverage or priority file has been set
            return items
        # BUGFIX: combined_bed is a filename string at this point, so the
        # previous `else combined_bed.fn` fallback (guarded by the always-true
        # `len(combined_bed) > 0` on a path string) would have raised
        # AttributeError; pass the path straight through to the cleaner.
        clean_bed = bedutils.clean_file(combined_bed, data)
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)

        if bed_file and utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                # chanjo executable expected alongside the current interpreter
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
                do.run(cmd.format(**locals()), "Prep chanjo database")
                # annotate and import each sample BAM into the shared database
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = (
                        "{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                        "{bam_file} {bed_file} | "
                        "{chanjo} --db {tx_out_file} import"
                    )
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        # clean up the temporary uniquified BED file
        if bed_file:
            os.remove(bed_file)
    # NOTE(review): called even when out_file was never created above --
    # presumably regions_coverage tolerates a missing database; confirm.
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
Example #6
0
def summary(items):
    """Build a chanjo coverage database for a batch of samples.

    items is a list of per-sample data dictionaries. Coverage and priority
    BED regions from the first sample are combined into one BED file, a
    chanjo database is built from it, and each sample's work BAM is
    annotated and imported. Each sample dict gains a "coverage" entry with
    the database and per-region coverage output.
    """
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        combined_bed = bed.concat([coverage_bed, priority_bed])
        # only run the cleaner when there are intervals to clean
        clean_bed = bedutils.clean_file(
            combined_bed.fn,
            data) if len(combined_bed) > 0 else combined_bed.fn
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                # chanjo executable expected alongside the interpreter
                chanjo = os.path.join(os.path.dirname(sys.executable),
                                      "chanjo")
                cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
                do.run(cmd.format(**locals()), "Prep chanjo database")
                # annotate and import each sample BAM into the database
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = (
                        "{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                        "{bam_file} {bed_file} | "
                        "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        # BUGFIX: os.remove was unconditional, raising FileNotFoundError
        # whenever the uniquified BED file was never actually created.
        if utils.file_exists(bed_file):
            os.remove(bed_file)
    # NOTE(review): called even when out_file was never created above --
    # presumably regions_coverage tolerates a missing database; confirm.
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
Example #7
0
def get_splicejunction_file(align_dir, data):
    """Combine known and novel hisat2 splice junctions into one BED file.

    hisat2 writes a novel splicesites file next to the alignment output;
    this merges it with the known splicesites file and returns the path to
    the combined BED, or None when neither input exists.
    """
    samplename = dd.get_sample_name(data)
    # NOTE(review): the align_dir argument is immediately overwritten with
    # the work BAM's directory -- kept as-is to preserve behavior.
    align_dir = os.path.dirname(dd.get_work_bam(data))
    knownfile = get_known_splicesites_file(align_dir, data)
    novelfile = os.path.join(align_dir, "%s-novelsplicesites.bed" % samplename)
    existing = [fn for fn in (knownfile, novelfile) if file_exists(fn)]
    splicejunction = bed.concat(existing)
    if not splicejunction:
        return None
    splicejunctionfile = os.path.join(align_dir,
                                      "%s-splicejunctions.bed" % samplename)
    splicejunction.saveas(splicejunctionfile)
    return splicejunctionfile