Example #1
0
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED.

    Builds and runs a shell pipeline that pairs every interval of `in_file`
    with its closest feature(s) in `gene_file` (``bedtools closest``), blanks
    the gene name to '.' when the reported distance exceeds `max_distance`,
    and collapses the result with ``bedtools merge`` keeping the distinct
    values of each retained column. Output is written to `out_file`.

    in_file -- input BED file; read with zcat when the name ends in ".gz"
    gene_file -- BED file of genes, subset to the reference genome before use
    fai_file -- reference index; its first two columns are passed to
        ``bedtools closest -g``
    out_file -- destination path; the genome-only gene BED is written next to it
    data -- passed through to bedutils.subset_to_genome
    max_distance -- largest distance for which a gene name is kept

    Returns None. An empty `in_file` is copied to `out_file` unchanged.
    """
    try:
        # builtin next() works on Python 2.6+ and Python 3; the iterator
        # method .next() only exists on Python 2
        input_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    num_fields = len(input_rec.fields)
    # keep everything after standard chrom/start/end, 1-based
    # (materialized as a list: range() cannot be appended to on Python 3)
    extra_fields = list(range(4, num_fields + 1))
    # keep the new gene annotation: 4th column of the appended gene record
    gene_index = num_fields + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                       (max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd()
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    ready_gene_file = os.path.join(os.path.dirname(out_file), "%s-genomeonly.bed" %
                                   (utils.splitext_plus(os.path.basename(gene_file))[0]))
    ready_gene_file = bedutils.subset_to_genome(gene_file, ready_gene_file, data)
    cmd = ("{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
           "{sort_cmd} -k1,1 -k2,2n | "
           "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
           "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {ready_gene_file}) | "
           "{distance_filter} | cut -f 1-{max_column} | "
           "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    # Explicit keywords instead of **locals() so the template's inputs are
    # visible and unaffected by unrelated local variables.
    cmd = cmd.format(cat_cmd=cat_cmd, in_file=in_file, sort_cmd=sort_cmd,
                     fai_file=fai_file, ready_gene_file=ready_gene_file,
                     distance_filter=distance_filter, max_column=max_column,
                     columns=columns, ops=ops, out_file=out_file)
    do.run(cmd, "Annotate BED file with gene info")
Example #2
0
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Annotate the intervals of a BED file with gene names from another BED.

    Assembles one shell pipeline and runs it through do.run: the input is
    stripped of track/browser/comment lines, sorted with gsort against
    `fai_file`, matched to the genome-only version of `gene_file` with
    ``bedtools closest``, filtered so gene names farther than `max_distance`
    (in either direction) become '.', and finally merged with the distinct
    values of every retained column. The result is written to `out_file`.

    Returns None. An empty `in_file` is copied to `out_file` unchanged.
    """
    try:
        first_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # nothing to annotate
        utils.copy_plus(in_file, out_file)
        return

    out_dir = os.path.dirname(out_file)
    n_input_cols = len(first_rec.fields)

    # The gene name is the 4th column of the gene record that bedtools closest
    # appends after the input columns (1-based).
    gene_col = n_input_cols + 4
    # Retain every input column past chrom/start/end, plus the gene name.
    keep_cols = list(range(4, n_input_cols + 1)) + [gene_col]
    merge_columns = ",".join(map(str, keep_cols))
    merge_ops = ",".join("distinct" for _ in keep_cols)
    # gene_col is always the largest retained column; keep one column past it.
    last_cut_col = gene_col + 1

    # Replace the gene name with '.' when the trailing distance column is out
    # of range; the template checks both signs of the distance. The later cut
    # drops the distance column, which otherwise trips bedtools merge with:
    # 'ERROR: illegal character '.' found in integer conversion of string'
    awk_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s || $NF < -%s) $%s = "."} {print}'""" %
                  (max_distance, max_distance, gene_col))

    # NOTE(review): the sort command is computed but never referenced by the
    # pipeline below (gsort is used instead); call kept to preserve behavior.
    sort_cmd = bedutils.get_sort_cmd(out_dir)

    if in_file.endswith(".gz"):
        reader = "zcat"
    else:
        reader = "cat"

    # Restrict gene transcripts to contigs present in the reference genome.
    gene_base = utils.splitext_plus(os.path.basename(gene_file))[0]
    genome_gene_file = bedutils.subset_to_genome(
        gene_file, os.path.join(out_dir, "%s-genomeonly.bed" % (gene_base)), data)

    # Prefix prepended to the pipeline; local_path_export() presumably yields
    # a PATH export ending in a command separator -- confirm in utils.
    env_prefix = "export TMPDIR=%s && %s" % (out_dir, utils.local_path_export())
    python_exe = sys.executable
    gsort_exe = config_utils.get_program("gsort", data)

    template = ("{exports}{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
                "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
                "{gsort} - {fai_file} | "
                "bedtools closest -g {fai_file} "
                "-D ref -t first -a - -b <({gsort} {ready_gene_file} {fai_file}) | "
                "{distance_filter} | cut -f 1-{max_column} | "
                "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    pipeline = template.format(
        exports=env_prefix,
        cat_cmd=reader,
        in_file=in_file,
        bcbio_py=python_exe,
        gsort=gsort_exe,
        fai_file=fai_file,
        ready_gene_file=genome_gene_file,
        distance_filter=awk_filter,
        max_column=last_cut_col,
        columns=merge_columns,
        ops=merge_ops,
        out_file=out_file,
    )
    do.run(pipeline, "Annotate BED file with gene info")