def coverage(data):
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_coverage_experimental(data)
    if not bed_file:
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    batch_size = max_command_length() / AVERAGE_REGION_STRING_LENGTH

    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        if not file_exists(parse_file):
            total_cov = cov_class(0, None, sample)
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    HEADER = ["#chrom", "start", "end", "region", "reads",
                              "strand", "size", "sample", "mean", "sd", "cutoff10",
                              "cutoff20", "cutoff4", "cutoff50"]
                    out_handle.write("\t".join(HEADER) + "\n")
                with tmpfile() as tx_tmp_file:
                    lcount = 0
                    for chunk in robust_partition_all(batch_size, region_bed):
                        coord_batch = []
                        line_batch = ""
                        for line in chunk:
                            lcount += 1
                            chrom = line.chrom
                            start = max(line.start, 0)
                            end = line.end
                            coords = "%s:%s-%s" % (chrom, start, end)
                            coord_batch.append(coords)
                            line_batch += str(line)
                        if not coord_batch:
                            continue
                        region_file = pybedtools.BedTool(line_batch,
                                                        from_string=True).saveas().fn
                        coord_string = " ".join(coord_batch)
                        cmd = ("samtools view -b {in_bam} {coord_string} | "
                                "bedtools coverage -a {region_file} -b - "
                                "-hist > {tx_tmp_file}")
                        _silence_run(cmd.format(**locals()))
                        total_cov = _get_exome_coverage_stats(os.path.abspath(tx_tmp_file), sample, out_tx, total_cov)
                        logger.debug("Processed %d regions." % lcount)
            total_cov.write_coverage(parse_total_file)
        data['coverage'] = os.path.abspath(parse_file)
        return data
Esempio n. 2
0
def priority_coverage(data):
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    batch_size = max_command_length() / AVERAGE_REGION_STRING_LENGTH

    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        with file_transaction(out_file) as tx_out_file:
            lcount = 0
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    lcount += 1
                    chrom = line.chrom
                    start = max(line.start, 0)
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    coord_batch.append(coords)
                    line_batch += "%s\t%s\t%s\n" % (chrom, start, end)
                if not coord_batch:
                    continue
                region_file = pybedtools.BedTool(line_batch,
                                                 from_string=True).saveas().fn
                coord_string = " ".join(coord_batch)
                awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
                samtools = config_utils.get_program("samtools", data["config"])
                bedtools = config_utils.get_program("bedtools", data["config"])
                cmd = (
                    "{samtools} view -b {in_bam} {coord_string} | "
                    "{bedtools} coverage -sorted -d -a {region_file} -b - | "
                    "awk {awk_string} >> {tx_out_file}")
                _silence_run(cmd.format(**locals()))
        data['priority_coverage'] = os.path.abspath(out_file)
    return data
Esempio n. 3
0
def priority_coverage(data):
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    batch_size = max_command_length() / AVERAGE_REGION_STRING_LENGTH

    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        with file_transaction(out_file) as tx_out_file:
            lcount = 0
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    lcount += 1
                    chrom = line.chrom
                    start = max(line.start, 0)
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    coord_batch.append(coords)
                    line_batch += "%s\t%s\t%s\n" % (chrom, start, end)
                if not coord_batch:
                    continue
                region_file = pybedtools.BedTool(line_batch,
                                                from_string=True).saveas().fn
                coord_string = " ".join(coord_batch)
                awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
                samtools = config_utils.get_program("samtools", data["config"])
                bedtools = config_utils.get_program("bedtools", data["config"])
                cmd = ("{samtools} view -b {in_bam} {coord_string} | "
                        "{bedtools} coverage -sorted -d -a {region_file} -b - | "
                        "awk {awk_string} >> {tx_out_file}")
                _silence_run(cmd.format(**locals()))
        data['priority_coverage'] = os.path.abspath(out_file)
    return data