Ejemplo n.º 1
0
def _combine_files(tsv_files, work_dir, data):
    """Merge per-caller prioritized TSV files into one sorted file with a header.

    The rows of every file in ``tsv_files`` are concatenated, sorted on the
    third column and then numerically on the fourth, and written underneath a
    fixed tab-separated header line. Output goes to
    ``<sample>-prioritize.tsv`` in ``work_dir``; the shell command is skipped
    when ``utils.file_exists`` already reports that file. Returns the output
    path in both cases.
    """
    column_names = ("caller", "sample", "chrom", "start", "end", "svtype", "known",
                    "lof", "annotation", "split_read_support", "paired_end_support")
    out_file = os.path.join(work_dir, "%s-prioritize.tsv" % dd.get_sample_name(data))
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        # The shell group lets one redirect capture both the header and the sorted rows
        template = "{{ echo '{header}'; cat {input_files} | {sort_cmd} -k3,3 -k4,4n; }} > {tx_out_file}"
        cmd = template.format(header="\t".join(column_names),
                              input_files=" ".join(tsv_files),
                              sort_cmd=bedutils.get_sort_cmd(),
                              tx_out_file=tx_out_file)
        do.run(cmd, "Combine prioritized from multiple callers")
    return out_file
Ejemplo n.º 2
0
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED

    in_file -- BED file to annotate; read with zcat when the name ends in .gz
    gene_file -- BED file of genes; subset to the genome via
                 bedutils.subset_to_genome before use
    fai_file -- fasta index; its first two columns are passed to
                `bedtools closest -g`
    out_file -- path the annotated BED is written to by shell redirect
    data -- sample dictionary handed to bedutils.subset_to_genome
    max_distance -- gene names whose distance column exceeds this value are
                    replaced with '.'

    Returns None. When in_file yields no records it is copied to out_file
    unchanged and no shell command is run.
    """
    try:
        # builtin next() instead of the Python 2-only iterator.next() method
        input_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    # keep everything after standard chrom/start/end, 1-based
    # list() is needed because a Python 3 range object cannot be appended to
    extra_fields = list(range(4, len(input_rec.fields) + 1))
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                       (max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd()
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    ready_gene_file = os.path.join(os.path.dirname(out_file), "%s-genomeonly.bed" %
                                   (utils.splitext_plus(os.path.basename(gene_file))[0]))
    ready_gene_file = bedutils.subset_to_genome(gene_file, ready_gene_file, data)
    # NOTE: the template below is filled from locals(), so the local variable
    # names above are part of the command and must not be renamed.
    cmd = ("{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
           "{sort_cmd} -k1,1 -k2,2n | "
            "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
            "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {ready_gene_file}) | "
            "{distance_filter} | cut -f 1-{max_column} | "
            "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
Ejemplo n.º 3
0
def _collapse_transcripts(in_file, window, data, out_dir):
    """Reduce each transcript to padded min/max spans and return a sorted, merged BED.

    Records of ``in_file`` are sorted by name and chromosome, grouped on
    (name, chrom), and every coordinate group returned by ``_group_coords``
    becomes one line spanning its minimum to maximum position, widened by
    ``window`` on both sides. The start is clamped at 0 and the end at the
    contig size from the reference. Output is written next to ``in_file``
    when ``out_dir`` is None, and only regenerated when
    ``utils.file_uptodate`` says it is stale. The result of
    ``bedutils.sort_merge`` on that file is returned.
    """
    target_dir = os.path.dirname(in_file) if out_dir is None else out_dir
    in_base = os.path.splitext(os.path.basename(in_file))[0]
    out_file = os.path.join(target_dir, "%s-transcripts_w%s.bed" % (in_base, window))
    contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
    chrom_sizes = dict((contig.name, contig.size) for contig in contigs)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            tx_base, tx_ext = os.path.splitext(tx_out_file)
            prep_file = "%s-sortprep%s" % (tx_base, tx_ext)
            sort_template = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(sort_template.format(sort_cmd=bedutils.get_sort_cmd(),
                                        in_file=in_file,
                                        prep_file=prep_file),
                   "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Iterating through a plain generator works around a segmentation
                # fault seen when groupby consumes the BedTool directly:
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                sorted_bed = pybedtools.BedTool(prep_file)

                def _records():
                    for rec in sorted_bed:
                        yield rec

                def _name_and_chrom(rec):
                    return (rec.name, rec.chrom)

                for _, grouped in itertools.groupby(_records(), _name_and_chrom):
                    members = list(grouped)
                    first = members[0]
                    for gcoords in _group_coords(members):
                        lower = max(min(gcoords) - window, 0)
                        upper = min(max(gcoords) + window, chrom_sizes[first.chrom])
                        out_handle.write("%s\t%s\t%s\t%s\n" % (first.chrom, lower, upper, first.name))
    return bedutils.sort_merge(out_file, data)
Ejemplo n.º 4
0
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    in_file -- BED file to annotate
    data -- sample dictionary used to find the exon BED and the reference
            fasta index, and passed to file_transaction
    max_distance -- only keep annotations within this distance of event

    Returns the path `<in_file base>-annotated.bed` when an exon BED is
    available and in_file passes utils.file_exists; otherwise returns in_file
    unchanged. StopIteration propagates if the BedTool yields no records.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            # builtin next() instead of the Python 2-only iterator.next() method
            input_rec = next(iter(pybedtools.BedTool(in_file)))
            # keep everything after standard chrom/start/end, 1-based
            # list() is needed because a Python 3 range object cannot be appended to
            extra_fields = list(range(4, len(input_rec.fields) + 1))
            # keep the new gene annotation
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                                   (max_distance, gene_index))
                sort_cmd = bedutils.get_sort_cmd()
                # NOTE: the template is filled from locals(), so the local
                # variable names above are part of the command.
                cmd = ("{sort_cmd} -k1,1 -k2,2n {in_file} | "
                       "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                       "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                       "{distance_filter} | cut -f 1-{max_column} | "
                       "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}")
                do.run(cmd.format(**locals()), "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
Ejemplo n.º 5
0
def add_genes_to_bed(in_file,
                     gene_file,
                     fai_file,
                     out_file,
                     max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED

    in_file -- BED file to annotate
    gene_file -- BED file of genes passed as the `-b` input of
                 `bedtools closest`
    fai_file -- fasta index; its first two columns are passed to
                `bedtools closest -g`
    out_file -- path the annotated BED is written to by shell redirect
    max_distance -- gene names whose distance column exceeds this value are
                    replaced with '.'

    Returns None. StopIteration propagates if the BedTool yields no records.
    """
    # builtin next() instead of the Python 2-only iterator.next() method
    input_rec = next(iter(pybedtools.BedTool(in_file)))
    # keep everything after standard chrom/start/end, 1-based
    # list() is needed because a Python 3 range object cannot be appended to
    extra_fields = list(range(4, len(input_rec.fields) + 1))
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (
        r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
        (max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd()
    # NOTE: the template is filled from locals(), so the local variable names
    # above are part of the command and must not be renamed.
    cmd = (
        "{sort_cmd} -k1,1 -k2,2n {in_file} | "
        "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
        "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
        "{distance_filter} | cut -f 1-{max_column} | "
        "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
Ejemplo n.º 6
0
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED

    in_file -- BED file to annotate; read with zcat when the name ends in
               .gz, and track/browser/# lines are filtered out
    gene_file -- BED file of genes passed as the `-b` input of
                 `bedtools closest`
    fai_file -- fasta index; its first two columns are passed to
                `bedtools closest -g`
    out_file -- path the annotated BED is written to by shell redirect
    max_distance -- gene names whose distance column exceeds this value are
                    replaced with '.'

    Returns None. StopIteration propagates if the BedTool yields no records.
    """
    # builtin next() instead of the Python 2-only iterator.next() method
    input_rec = next(iter(pybedtools.BedTool(in_file)))
    # keep everything after standard chrom/start/end, 1-based
    # list() is needed because a Python 3 range object cannot be appended to
    extra_fields = list(range(4, len(input_rec.fields) + 1))
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                       (max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd()
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # NOTE: the template is filled from locals(), so the local variable names
    # above are part of the command and must not be renamed.
    cmd = ("{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
           "{sort_cmd} -k1,1 -k2,2n | "
            "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
            "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
            "{distance_filter} | cut -f 1-{max_column} | "
            "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
Ejemplo n.º 7
0
def _combine_files(tsv_files, work_dir, data):
    """Concatenate prioritized TSV files from several callers into one sorted file.

    Writes ``<sample>-prioritize.tsv`` into ``work_dir``: a tab-separated
    header line followed by all rows of ``tsv_files`` sorted on column three
    and then numerically on column four. The command only runs when
    ``utils.file_exists`` does not already report the output. Returns the
    output path.
    """
    header_fields = ["caller", "sample", "chrom", "start", "end", "svtype",
                     "lof", "annotation", "split_read_support",
                     "paired_support_PE", "paired_support_PR"]
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-prioritize.tsv" % sample_name)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        substitutions = {"header": "\t".join(header_fields),
                         "input_files": " ".join(tsv_files),
                         "sort_cmd": bedutils.get_sort_cmd(),
                         "tx_out_file": tx_out_file}
        # Braces are doubled so the shell group survives str.format
        shell_template = "{{ echo '{header}'; cat {input_files} | {sort_cmd} -k3,3 -k4,4n; }} > {tx_out_file}"
        do.run(shell_template.format(**substitutions),
               "Combine prioritized from multiple callers")
    return out_file
Ejemplo n.º 8
0
def _group_by_ctype(bed_file, depth, region, region_file, out_file, data):
    """Collapse neighbouring rows with the same coverage class into intervals.

    Each row of ``bed_file`` is tagged by awk as NO_COVERAGE (column 4 is 0),
    LOW_COVERAGE (column 4 below ``depth["min"]``) or CALLABLE. The tagged
    rows are grouped with ``bedtools groupby``, intersected with
    ``region_file``, sorted by chromosome and start, and written to
    ``out_file`` inside a file transaction.

    Approach follows the bedtools discussion at:
    https://groups.google.com/d/msg/bedtools-discuss/qYDE6XF-GRA/2icQtUeOX_UJ
    https://gist.github.com/arq5x/b67196a46db5b63bee06
    """
    # awk program appending the coverage class as an extra column
    classify_stage = (r"""awk '{{if ($4 == 0) {{print $0"\tNO_COVERAGE"}} """
                      r"""else if ($4 < {min_cov}) {{print $0"\tLOW_COVERAGE"}} """
                      r"""else {{print $0"\tCALLABLE"}} }}'""")
    pipeline = " | ".join([
        "cat {bed_file}",
        classify_stage,
        "bedtools groupby -prec 21 -g 1,5 -c 1,2,3,5 -o first,first,max,first",
        "cut -f 3-6",
        "bedtools intersect -nonamecheck -a - -b {region_file}",
        "{sort_cmd} -k1,1 -k2,2n  > {tx_out_file}",
    ])
    with file_transaction(data, out_file) as tx_out_file:
        shell_cmd = pipeline.format(bed_file=bed_file,
                                    min_cov=depth["min"],
                                    region_file=region_file,
                                    sort_cmd=bedutils.get_sort_cmd(),
                                    tx_out_file=tx_out_file)
        do.run(shell_cmd, "bedtools groupby coverage: %s" % str(region), data)
Ejemplo n.º 9
0
def _group_by_ctype(bed_file, depth, region, region_file, out_file, data):
    """Merge adjacent rows sharing a callable/uncallable label into intervals.

    Rows of ``bed_file`` are labelled from their fourth column: NO_COVERAGE
    when it equals 0, LOW_COVERAGE when it is below ``depth["min"]``, and
    CALLABLE otherwise. ``bedtools groupby`` then collapses rows on
    chromosome and label, the result is intersected with ``region_file``,
    sorted, and written to ``out_file`` through a file transaction.

    Based on tips from the bedtools discussion:
    https://groups.google.com/d/msg/bedtools-discuss/qYDE6XF-GRA/2icQtUeOX_UJ
    https://gist.github.com/arq5x/b67196a46db5b63bee06
    """
    # Literal awk braces are doubled so they survive str.format
    template = (
        r"""cat {bed_file} | awk '{{if ($4 == 0) {{print $0"\tNO_COVERAGE"}} """
        r"""else if ($4 < {min_cov}) {{print $0"\tLOW_COVERAGE"}} """
        r"""else {{print $0"\tCALLABLE"}} }}' | """
        "bedtools groupby -prec 21 -g 1,5 -c 1,2,3,5 -o first,first,max,first | "
        "cut -f 3-6 | "
        "bedtools intersect -nonamecheck -a - -b {region_file} | "
        "{sort_cmd} -k1,1 -k2,2n  > {tx_out_file}")
    description = "bedtools groupby coverage: %s" % str(region)
    with file_transaction(data, out_file) as tx_out_file:
        fields = {"bed_file": bed_file,
                  "min_cov": depth["min"],
                  "region_file": region_file,
                  "sort_cmd": bedutils.get_sort_cmd(),
                  "tx_out_file": tx_out_file}
        do.run(template.format(**fields), description, data)
Ejemplo n.º 10
0
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    in_file -- BED file to annotate
    data -- sample dictionary used to find the exon BED and the reference
            fasta index, and passed to file_transaction
    max_distance -- only keep annotations within this distance of event

    Returns the path `<in_file base>-annotated.bed` when an exon BED is
    available and in_file passes utils.file_exists; otherwise returns in_file
    unchanged. StopIteration propagates if the BedTool yields no records.
    """
    gene_file = regions.get_sv_bed(data,
                                   "exons",
                                   out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            # builtin next() instead of the Python 2-only iterator.next() method
            input_rec = next(iter(pybedtools.BedTool(in_file)))
            # keep everything after standard chrom/start/end, 1-based
            # list() is needed because a Python 3 range object cannot be appended to
            extra_fields = list(range(4, len(input_rec.fields) + 1))
            # keep the new gene annotation
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (
                    r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'"""
                    % (max_distance, gene_index))
                sort_cmd = bedutils.get_sort_cmd()
                # NOTE: the template is filled from locals(), so the local
                # variable names above are part of the command.
                cmd = (
                    "{sort_cmd} -k1,1 -k2,2n {in_file} | "
                    "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                    "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                    "{distance_filter} | cut -f 1-{max_column} | "
                    "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}"
                )
                do.run(cmd.format(**locals()),
                       "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
Ejemplo n.º 11
0
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Annotate the records of one BED file with genes taken from another BED.

    The input is stripped of track/browser/# lines, passed through
    ``bedutils.remove_bad``, sorted with gsort against ``fai_file``, and
    matched to the genome-subset ``gene_file`` by ``bedtools closest``. Gene
    names whose signed distance column lies outside +/- ``max_distance`` are
    replaced with '.', and the rows are then merged into ``out_file``. When
    ``in_file`` yields no records it is copied to ``out_file`` unchanged.
    Returns None.
    """
    try:
        first_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    n_fields = len(first_rec.fields)
    # column that holds the new gene annotation
    gene_index = n_fields + 4
    # 1-based columns after chrom/start/end, followed by the gene column
    extra_fields = list(range(4, n_fields + 1)) + [gene_index]
    columns = ",".join(str(col) for col in extra_fields)
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # Blank out the gene name when it lies beyond the maximum distance. The
    # later cut drops the trailing distance column, which otherwise breaks
    # bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s || $NF < -%s) $%s = "."} {print}'""" %
                       (max_distance, max_distance, gene_index))
    out_dir = os.path.dirname(out_file)
    # Result is not used in the pipeline below; the call is retained as in the original.
    sort_cmd = bedutils.get_sort_cmd(out_dir)
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    gene_base = utils.splitext_plus(os.path.basename(gene_file))[0]
    ready_gene_file = bedutils.subset_to_genome(
        gene_file, os.path.join(out_dir, "%s-genomeonly.bed" % gene_base), data)
    exports = "export TMPDIR=%s && %s" % (out_dir, utils.local_path_export())
    bcbio_py = sys.executable
    gsort = config_utils.get_program("gsort", data)
    pipeline = " | ".join([
        "{exports}{cat_cmd} {in_file}",
        "grep -v ^track",
        "grep -v ^browser",
        "grep -v ^#",
        "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()'",
        "{gsort} - {fai_file}",
        "bedtools closest -g {fai_file} -D ref -t first -a - -b <({gsort} {ready_gene_file} {fai_file})",
        "{distance_filter}",
        "cut -f 1-{max_column}",
        "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}",
    ])
    shell_cmd = pipeline.format(exports=exports, cat_cmd=cat_cmd, in_file=in_file,
                                bcbio_py=bcbio_py, gsort=gsort, fai_file=fai_file,
                                ready_gene_file=ready_gene_file,
                                distance_filter=distance_filter,
                                max_column=max_column, columns=columns, ops=ops,
                                out_file=out_file)
    do.run(shell_cmd, "Annotate BED file with gene info")