Example #1
0
def _get_target_access_files(cov_interval, data, work_dir):
    """Retrieve target and access files based on the type of data to process.

    pick targets, anti-targets and access files based on analysis type
    http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
    """
    base_regions = regions.get_sv_bed(data)
    # if we don't have a configured BED or regions to use for SV caling
    if not base_regions:
        # For genome calls, subset to regions within 10kb of genes
        if cov_interval == "genome":
            base_regions = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            if base_regions:
                base_regions = shared.remove_exclude_regions(
                    base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)

    target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
    if cov_interval == "amplicon":
        return target_bed, target_bed
    elif cov_interval == "genome":
        return target_bed, target_bed
    else:
        access_file = _create_access_file(dd.get_ref_file(data),
                                          _sv_workdir(data), data)
        return target_bed, access_file
Example #2
0
def _get_target_access_files(cov_interval, data, work_dir):
    """Retrieve target and access files based on the type of data to process.

    pick targets, anti-targets and access files based on analysis type
    http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
    """
    base_regions = regions.get_sv_bed(data)
    # if we don't have a configured BED or regions to use for SV caling
    if not base_regions:
        # For genome calls, subset to regions within 10kb of genes
        if cov_interval == "genome":
            base_regions = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            if base_regions:
                base_regions = shared.remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)

    target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
    if cov_interval == "amplicon":
        return target_bed, target_bed
    elif cov_interval == "genome":
        return target_bed, target_bed
    else:
        access_file = _create_access_file(dd.get_ref_file(data), _sv_workdir(data), data)
        return target_bed, access_file
Example #3
0
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Per-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data, prefix="cleaned-")
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    elif regions.get_sv_bed(data):
        dd.set_sv_regions(data, clean_file(regions.get_sv_bed(data), data, prefix="svregions-"))
    return data
Example #4
0
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Per-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data, prefix="cleaned-")
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    elif regions.get_sv_bed(data):
        dd.set_sv_regions(data, clean_file(regions.get_sv_bed(data), data, prefix="svregions-"))
    return data
Example #5
0
def _create_subset_file(in_file, work_dir, data):
    """Subset the VCF to a set of smaller regions, matching what was used for CNV calling.
    """
    out_file = os.path.join(work_dir, "%s-orig.bcf" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            region_bed = regions.get_sv_bed(data)
            if not region_bed:
                region_bed = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            cmd = "bcftools view -R {region_bed} -o {tx_out_file} -O b {in_file}"
            do.run(cmd.format(**locals()), "Extract SV only regions for BubbleTree")
    return out_file
Example #6
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"min": dd.get_coverage_depth_min(data)}
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                                 dd.get_sample_name(data))),
                                 "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        to_calculate = [("variant_regions", variant_regions, vr_quantize, None),
                        ("sv_regions", regions.get_sv_bed(data), None, None),
                        ("coverage", dd.get_coverage(data), None, DEPTH_THRESHOLDS)]
        depth_files = {}
        for target_name, region_bed, quantize, thresholds in to_calculate:
            if region_bed:
                cur_depth = {}
                depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize, thresholds=thresholds)
                for attr in ("dist", "regions", "thresholds"):
                    val = getattr(depth_info, attr, None)
                    if val:
                        cur_depth[attr] = val
                depth_files[target_name] = cur_depth
                if target_name == "variant_regions":
                    callable_file = depth_info.quantize
    else:
        depth_files = {}
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
Example #7
0
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            input_rec = iter(pybedtools.BedTool(in_file)).next()
            # keep everything after standard chrom/start/end, 1-based
            extra_fields = range(4, len(input_rec.fields) + 1)
            # keep the new gene annotation
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                                   (max_distance, gene_index))
                cmd = ("sort -k1,1 -k2,2n {in_file} | "
                       "bedtools closest -d -t all -a - -b {gene_file} | "
                       "{distance_filter} | cut -f 1-{max_column} | "
                       "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}")
                do.run(cmd.format(**locals()), "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
Example #8
0
def process_intervals(data):
    """Prepare intervals file"""
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
         bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r, "--infile", bed_file,
          "--fasta", ref_file,
          "--outfile", ready_file,
          "--offtarget",
          "--genome", genome,
          "--export", optimized_bed,
          "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env = "r36"),
                                                     utils.get_R_exports(env = "r36"),
                                                     " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as msg:
        logger.info("PureCN failed to prepare intervals")
    logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
Example #9
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"min": dd.get_coverage_depth_min(data)}
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                                 dd.get_sample_name(data))),
                                 "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        to_calculate = [("variant_regions", variant_regions, vr_quantize, None),
                        ("sv_regions", bedutils.clean_file(regions.get_sv_bed(data), data), None, None),
                        ("coverage", bedutils.clean_file(dd.get_coverage(data), data), None, DEPTH_THRESHOLDS)]
        depth_files = {}
        for target_name, region_bed, quantize, thresholds in to_calculate:
            if region_bed:
                cur_depth = {}
                depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize, thresholds=thresholds)
                for attr in ("dist", "regions", "thresholds"):
                    val = getattr(depth_info, attr, None)
                    if val:
                        cur_depth[attr] = val
                depth_files[target_name] = cur_depth
                if target_name == "variant_regions":
                    callable_file = depth_info.quantize
    else:
        depth_files = {}
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
Example #10
0
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                cmd = ("bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}")
                do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
                if transcript_file:
                    transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                    ann_opt = "--gene_bed %s" % transcript_file
                else:
                    ann_opt = ""
                cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file
Example #11
0
def _prep_bed(data, work_dir):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)

    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)

    return ready_file
Example #12
0
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo",
                                     "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(
        data) or "altcontigs" in dd.get_exclude_regions(data)

    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling
                                         or chromhacks.is_nonalt(r.chrom))

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                if input_regions_bed:
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(
                            input_regions,
                            nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
Example #13
0
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                if input_regions_bed:
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
Example #14
0
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            input_rec = iter(pybedtools.BedTool(in_file)).next()
            # keep everything after standard chrom/start/end, 1-based
            extra_fields = range(4, len(input_rec.fields) + 1)
            # keep the new gene annotation
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                                   (max_distance, gene_index))
                sort_cmd = bedutils.get_sort_cmd()
                cmd = ("{sort_cmd} -k1,1 -k2,2n {in_file} | "
                       "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                       "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                       "{distance_filter} | cut -f 1-{max_column} | "
                       "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}")
                do.run(cmd.format(**locals()), "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
Example #15
0
def _create_subset_file(in_file, work_dir, data):
    """Subset the VCF to a set of smaller regions, matching what was used for CNV calling.
    """
    out_file = os.path.join(
        work_dir,
        "%s-orig.bcf" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            region_bed = regions.get_sv_bed(data)
            if not region_bed:
                region_bed = regions.get_sv_bed(data, "transcripts1e4",
                                                work_dir)
            cmd = "bcftools view -R {region_bed} -o {tx_out_file} -O b {in_file}"
            do.run(cmd.format(**locals()),
                   "Extract SV only regions for BubbleTree")
    return out_file
Example #16
0
def get_base_cnv_regions(data, work_dir):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid
    long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = regions.get_sv_bed(data)
    # if we don't have a configured BED or regions to use for SV caling
    if not base_regions:
        # For genome calls, subset to regions within 10kb of genes
        if cov_interval == "genome":
            base_regions = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return base_regions
Example #17
0
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir,
                    data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir,
                            "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                resources = config_utils.get_resources("bcbio_prioritize",
                                                       data["config"])
                jvm_opts = " ".join(
                    resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]))
                cmd = (
                    "bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}"
                )
                do.run(cmd.format(**locals()),
                       "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                transcript_file = regions.get_sv_bed(data, "transcripts1000",
                                                     work_dir)
                if transcript_file:
                    transcript_file = vcfutils.bgzip_and_index(
                        transcript_file, data["config"])
                    ann_opt = "--gene_bed %s" % transcript_file
                else:
                    ann_opt = ""
                cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()),
                       "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(
            vcfutils.sort_by_ref(simple_vcf, data), data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            cmd = (
                "zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                "print CALLER,SNAME,$1,$2,I$END,"
                """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
            do.run(cmd.format(**locals()),
                   "Prioritize: convert to tab delimited")
    return out_file
Example #18
0
def subset_by_genes(in_file, data, out_dir, pad):
    """Subset BED file of regions to only those within pad of the final output.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    fai_file = ref.fasta_idx(dd.get_ref_file(data))
    if not gene_file or not utils.file_exists(in_file):
        return in_file
    else:
        out_file = os.path.join(out_dir, "%s-geneonly.bed" % utils.splitext_plus(os.path.basename(in_file))[0])
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(data, out_file) as tx_out_file:
                want_region_file = "%s-targetregions%s" % utils.splitext_plus(out_file)
                pybedtools.BedTool(gene_file).slop(g=fai_file, b=pad).merge().saveas(want_region_file)
                pybedtools.BedTool(in_file).intersect(b=want_region_file).sort().saveas(tx_out_file)
        return out_file
Example #19
0
def add_genes(in_file, data, max_distance=10000, work_dir=None):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if work_dir:
            out_file = os.path.join(work_dir, os.path.basename(out_file))
        if not utils.file_uptodate(out_file, in_file):
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                _add_genes_to_bed(in_file, gene_file, fai_file, tx_out_file, data, max_distance)
        return out_file
    else:
        return in_file
Example #20
0
def add_genes(in_file, data, max_distance=10000, work_dir=None):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if work_dir:
            out_file = os.path.join(work_dir, os.path.basename(out_file))
        if not utils.file_uptodate(out_file, in_file):
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                _add_genes_to_bed(in_file, gene_file, fai_file, tx_out_file, data, max_distance)
        return out_file
    else:
        return in_file
Example #21
0
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcript if not present.

    Respects noalt_calling by removing additional contigs to improve
    speeds.
    """
    vr_file = dd.get_variant_regions(data)
    if not vr_file:
        vr_file = regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    contigs = set([c.name for c in ref.file_contigs(dd.get_ref_file(data))])
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep")),
                            "%s-rnaseq_clean.bed" % utils.splitext_plus(os.path.basename(vr_file))[0])
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for r in pybedtools.BedTool(vr_file):
                        if r.chrom in contigs:
                            if chromhacks.is_nonalt(r.chrom):
                                out_handle.write(str(r))
    data = dd.set_variant_regions(data, out_file)
    return data
Example #22
0
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcript if not present.

    Respects noalt_calling by removing additional contigs to improve
    speeds.
    """
    vr_file = dd.get_variant_regions(data)
    if not vr_file:
        vr_file = regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    contigs = set([c.name for c in ref.file_contigs(dd.get_ref_file(data))])
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep")),
                            "%s-rnaseq_clean.bed" % utils.splitext_plus(os.path.basename(vr_file))[0])
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for r in pybedtools.BedTool(vr_file):
                        if r.chrom in contigs:
                            if chromhacks.is_nonalt(r.chrom):
                                out_handle.write(str(r))
    data = dd.set_variant_regions(data, out_file)
    return data
Example #23
0
def precall(items):
    """Perform initial pre-calling steps -- coverage calcuation by sample.

    Use sambamba to call average region coverage in regions, and convert into a correct format.
    """
    items = [utils.to_single_data(x) for x in items]
    assert len(items) == 1, "Expect one item to Seq2C coverage calculation"
    data = utils.to_single_data(items)
    assert dd.get_coverage_interval(data) != "genome", "Seq2C only for amplicon and exome sequencing"

    work_dir = _sv_workdir(data)
    bed_file = regions.get_sv_bed(data) or dd.get_variant_regions(data)
    bed_file = _prep_bed(data, bed_file, work_dir)
    bam_file = dd.get_align_bam(data)
    sample_name = dd.get_sample_name(data)

    cov_file = _calculate_coverage(data, work_dir, bed_file, bam_file, sample_name)

    if "sv" not in data:
        data["sv"] = []
    data["sv"].append({"variantcaller": "seq2c",
                       "coverage": cov_file})
    return [data]
Example #24
0
def precall(items):
    """Perform initial pre-calling steps -- coverage calcuation by sample.

    Use sambamba to call average region coverage in regions, and convert into a correct format.
    """
    items = [utils.to_single_data(x) for x in items]
    assert len(items) == 1, "Expect one item to Seq2C coverage calculation"
    data = utils.to_single_data(items)
    assert dd.get_coverage_interval(
        data) != "genome", "Seq2C only for amplicon and exome sequencing"

    work_dir = _sv_workdir(data)
    bed_file = regions.get_sv_bed(data) or dd.get_variant_regions(data)
    bed_file = _prep_bed(data, bed_file, work_dir)
    bam_file = dd.get_align_bam(data)
    sample_name = dd.get_sample_name(data)

    cov_file = _calculate_coverage(data, work_dir, bed_file, bam_file,
                                   sample_name)

    if "sv" not in data:
        data["sv"] = []
    data["sv"].append({"variantcaller": "seq2c", "coverage": cov_file})
    return [data]