Example #1
0
def map_gff_line_to_bed(gff_line, out_folder, n_bins, bed_collection, header=""):
    """Write the bed-track rectangles and row labels for one gff region.

    Two tab-delimited tables are written into ``out_folder``:

    * ``<label>_bedDiagramTemp.txt`` -- a ``[0, 0, 0, 0]`` seed row followed by
      one ``[x0, y0, x1, y1]`` rectangle per bed locus overlapping the region.
    * ``<label>_bedNameTemp.txt`` -- a ``["", 0, 0]`` seed row followed by one
      ``[name, 0, y]`` row per distinct bed locus id.

    ``<label>`` is ``header`` when given, otherwise
    ``chrom_strand_start_stop`` built from columns 0, 6, 3 and 4 of
    ``gff_line``.  X values are distances from the region's reference end
    (column 4 for a "-" region, column 3 otherwise) multiplied by
    ``n_bins / region length``.
    """
    label = header or "{}_{}_{}_{}".format(
        gff_line[0], gff_line[6], gff_line[3], gff_line[4]
    )

    region_start = int(gff_line[3])
    region_end = int(gff_line[4])
    strand = gff_line[6]
    region = utils.Locus(gff_line[0], region_start, region_end, strand, gff_line[1])

    # converts a genomic distance into plot (bin) units
    scale = n_bins / region.len()

    hits = bed_collection.get_overlap(region, sense="both")
    print(
        "IDENTIFIED {} OVERLAPPING BED LOCI FOR REGION {}".format(
            str(len(hits)), gff_line,
        )
    )

    # Beds come from several sources; each distinct locus id gets its own row,
    # rows being spaced 2 units apart in sorted id order.
    source_names = utils.uniquify([hit.id for hit in hits])
    source_names.sort()
    row_offset = {source: 2 * rank for rank, source in enumerate(source_names)}

    # distances are measured from the end of the region matching its strand
    anchor = region_end if strand == "-" else region_start

    label_rows = [["", 0, 0]]
    label_rows.extend(
        [source, 0, 0.0 - row_offset[source]] for source in source_names
    )

    box_rows = [[0, 0, 0, 0]]
    for hit in hits:
        shift = row_offset[hit.id]
        left, right = (abs(coord - anchor) * scale for coord in hit.coords())
        box_rows.append([left, -0.5 - shift, right, 0.5 - shift])

    utils.unparse_table(
        box_rows,
        os.path.join(out_folder, "{}_bedDiagramTemp.txt".format(label)),
        "\t",
    )
    utils.unparse_table(
        label_rows,
        os.path.join(out_folder, "{}_bedNameTemp.txt".format(label)),
        "\t",
    )
Example #2
0
def make_bed_collection(bed_file_list):
    """Merge the regions of several bed files into one LocusCollection.

    Every locus carries, as its ID, the name of the bed file it came from (the
    file's basename up to the first ".").  A row is kept only when it has at
    least three columns, its first column starts with "chr", and columns 1 and
    2 parse as integers; anything else is skipped silently.
    """
    collected = []
    print("MAKING BED COLLECTION FOR:")
    for bed_path in bed_file_list:
        source_name = os.path.basename(bed_path).split(".")[0]
        print(source_name)

        for row in utils.parse_table(bed_path, "\t"):
            if len(row) < 3:
                continue
            if row[0][:3] != "chr":
                continue
            try:
                first, second = int(row[1]), int(row[2])
                locus = utils.Locus(
                    row[0], min(first, second), max(first, second), ".", source_name
                )
            except ValueError:
                # non-numeric coordinates: not a region row, skip it
                continue
            collected.append(locus)

        # running total across all files read so far
        print("IDENTIFIED {} BED REGIONS".format(str(len(collected))))

    return utils.LocusCollection(collected, 50)
def assign_enhancer_rank(enhancer_to_gene_file,
                         enhancer_file1,
                         enhancer_file2,
                         name1,
                         name2,
                         rank_output=""):
    """Append the best overlapping enhancer rank from two tables to each row.

    The enhancer-to-gene table is read from ``enhancer_to_gene_file``; its
    header row gains ``<name1>_rank`` and ``<name2>_rank`` columns.  For every
    following row a locus is built from columns 1-3 (ID from column 0) and the
    lowest "rank" among the overlapping enhancers of each table is appended.
    A row with no overlap gets the size of that enhancer collection, i.e. it
    ranks dead last.

    Returns the augmented table when ``rank_output`` is empty; otherwise the
    table is written to ``rank_output`` and nothing is returned.
    """
    gene_table = utils.parse_table(enhancer_to_gene_file, "\t")

    collection1 = make_se_collection(enhancer_file1, name1, False)
    collection2 = make_se_collection(enhancer_file2, name2, False)

    rank_lookup1 = make_se_dict(enhancer_file1, name1, False)
    rank_lookup2 = make_se_dict(enhancer_file2, name2, False)

    def best_rank(region, collection, rank_lookup):
        """Lowest rank among overlapping enhancers, or dead last if none."""
        hits = collection.get_overlap(region, "both")
        if len(hits) == 0:
            return len(collection)
        return min(rank_lookup[hit.id]["rank"] for hit in hits)

    # the table is updated in place: header first, then every data row
    gene_table[0] += ["{}_rank".format(name1), "{}_rank".format(name2)]
    for row in gene_table[1:]:
        region = utils.Locus(row[1], row[2], row[3], ".", row[0])
        rank1 = best_rank(region, collection1, rank_lookup1)
        rank2 = best_rank(region, collection2, rank_lookup2)
        row.extend([rank1, rank2])

    if len(rank_output) == 0:
        return gene_table
    utils.unparse_table(gene_table, rank_output, "\t")
def make_se_collection(enhancer_file, name, super_only=True):
    """Build a LocusCollection from the rows of an enhancer (super) table.

    Rows whose first field begins with "#" or "R" are skipped (presumably
    comment lines and the column header -- confirm against the table format).
    Every other row becomes a locus from columns 1-3 with the ID
    ``<name>_<column 0>``.  With ``super_only`` set, reading stops at the first
    data row whose last column parses to 0.
    """
    loci = []
    for row in utils.parse_table(enhancer_file, "\t"):
        if row[0][0] in ("#", "R"):
            continue
        if super_only and int(row[-1]) == 0:
            # everything from here on is ignored, not just this row
            break
        locus_id = "{}_{}".format(name, row[0])
        loci.append(utils.Locus(row[1], row[2], row[3], ".", locus_id))

    return utils.LocusCollection(loci, 50)
def make_se_collection(enhancer_file, name, top=0):
    """Build a LocusCollection from the first ``top`` rows of a super table.

    Rows whose first field begins with "#" or "R" are skipped and do not count
    towards ``top``.  Every other row becomes a locus from columns 1-3 with
    the ID ``<name>_<column 0>``.  Reading stops once ``top`` loci have been
    collected; with the default ``top=0`` the stop condition is never met, so
    every data row is kept.
    """
    loci = []
    for row in utils.parse_table(enhancer_file, "\t"):
        if row[0][0] in ("#", "R"):
            continue

        locus_id = "{}_{}".format(name, row[0])
        loci.append(utils.Locus(row[1], row[2], row[3], ".", locus_id))

        if len(loci) == top:
            break

    return utils.LocusCollection(loci, 50)
Example #6
0
def map_collection(
    stitched_collection,
    reference_collection,
    bam_file_list,
    mapped_folder,
    output,
    ref_name,
):
    """Write a table of per-bam signal for every stitched locus.

    Loci on chrY are dropped; the remaining loci are listed in order of
    decreasing length.  Each row holds the locus id, chromosome, start, stop,
    the stitch count (the integer before the first "_" in the locus id, or 1
    when that prefix is not an integer) and the summed length of the
    overlapping loci in ``reference_collection``.

    For every bam in ``bam_file_list`` one column, named after the bam's
    basename, is added.  Its values come from
    ``<mapped_folder>/<ref_name>_<bam basename>_MAPPED/matrix.txt``: the first
    row of that file is skipped, column 1 of each remaining row is parsed as
    ``chrom(...):start-end`` and column 2 as a density that is multiplied by
    ``abs(end - start)``.  A locus's value is the sum over all mapped regions
    overlapping it.

    The finished table is written tab-delimited to ``output``.
    """
    print("FORMATTING TABLE")

    # Strip out chrY by filtering. Removing items from the list while looping
    # over it skips the element after each removal, which left chrY loci
    # behind whenever two of them were adjacent.
    loci = [
        locus for locus in stitched_collection.get_loci() if locus.chr != "chrY"
    ]

    locus_table = [[
        "REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"
    ]]

    # indices into `loci`, longest locus first
    len_order = utils.order([locus.len() for locus in loci], decreasing=True)
    for ticker, i in enumerate(len_order, start=1):
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # size of the enriched regions within the stitched locus
        ref_enrich_size = sum(
            ref_locus.len()
            for ref_locus in reference_collection.get_overlap(locus, "both")
        )

        try:
            stitch_count = int(locus.id.split("_")[0])
        except ValueError:
            stitch_count = 1
        coords = [int(x) for x in locus.coords()]

        locus_table.append([
            locus.id,
            locus.chr,
            min(coords),
            max(coords),
            stitch_count,
            ref_enrich_size,
        ])

    print("GETTING MAPPED DATA")
    print("USING A bam_file LIST:")
    print(bam_file_list)
    for bam_file in bam_file_list:

        bam_file_name = os.path.basename(bam_file)

        print("GETTING MAPPING DATA FOR  {}".format(bam_file))
        # assumes standard convention for naming enriched region gffs
        mapped_gff_file = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(ref_name, bam_file_name),
            "matrix.txt")
        print("OPENING {}".format(mapped_gff_file))

        mapped_gff = utils.parse_table(mapped_gff_file, "\t")

        # regions without a parsable density fall back to 0.0
        signal_dict = defaultdict(float)
        print("MAKING SIGNAL DICT FOR {}".format(bam_file))
        mapped_loci = []
        for line in mapped_gff[1:]:
            chrom = line[1].split("(")[0]
            span = line[1].split(":")[-1].split("-")
            start = int(span[0])
            end = int(span[1])
            mapped_loci.append(utils.Locus(chrom, start, end, ".", line[0]))
            try:
                # density * region width = total signal in the region
                signal_dict[line[0]] = float(line[2]) * abs(end - start)
            except ValueError:
                print("WARNING NO SIGNAL FOR LINE:")
                print(line)

        mapped_collection = utils.LocusCollection(mapped_loci, 500)
        locus_table[0].append(bam_file_name)

        for row in locus_table[1:]:
            row_locus = utils.Locus(row[1], row[2], row[3], ".")
            overlapping_regions = mapped_collection.get_overlap(row_locus,
                                                                sense="both")
            row.append(
                sum((signal_dict[region.id] for region in overlapping_regions),
                    0.0))

    utils.unparse_table(locus_table, output, "\t")
Example #7
0
def map_gff_line_to_annot(
    gff_line, out_folder, n_bins, gene_dict, tx_collection, sense="both", header=""
):
    """Write the gene-annotation rectangles and labels for one gff region.

    Two tab-delimited tables are written into ``out_folder``:

    * ``<label>_diagramTemp.txt`` -- a ``[0, 0, 0, 0]`` seed row, then for
      every overlapping transcript a thin line across the whole transcript,
      a box per transcript exon and a taller box per coding exon, each as
      ``[x0, y0, x1, y1]``.
    * ``<label>_nameTemp.txt`` -- a ``["", 0, 0]`` seed row, then one
      ``[name, x, y]`` row per transcript.

    ``<label>`` is ``header`` when given, otherwise
    ``chrom_strand_start_stop`` built from columns 0, 6, 3 and 4 of
    ``gff_line``.  X values are distances from the region's reference end
    (column 4 for a "-" region, column 3 otherwise) multiplied by
    ``n_bins / region length``.

    ``sense`` is accepted but not used: the transcript lookup always queries
    both strands.
    """
    label = header or "{}_{}_{}_{}".format(
        gff_line[0], gff_line[6], gff_line[3], gff_line[4]
    )

    region = utils.Locus(
        gff_line[0], int(gff_line[3]), int(gff_line[4]), gff_line[6], gff_line[1],
    )
    region_len = region.len()
    scale = n_bins / region_len
    # padding added around each transcript when testing for vertical stacking
    plot_buffer = int(region_len / n_bins * 20)

    gene_ids = [hit.id for hit in tx_collection.get_overlap(region, sense="both")]

    anchor = int(gff_line[4]) if gff_line[6] == "-" else int(gff_line[3])

    def to_plot_x(coord):
        """Scaled distance of a genomic coordinate from the reference end."""
        return abs(coord - anchor) * scale

    box_rows = [[0, 0, 0, 0]]
    label_rows = [["", 0, 0]]

    # padded transcripts drawn so far; overlaps decide how far down to stack
    placed = utils.LocusCollection([], 500)
    for gene_id in gene_ids:
        gene = gene_dict[gene_id]

        common_name = gene.common_name()
        print(common_name)
        display_name = common_name if len(common_name) > 1 else gene_id

        tx_locus = gene.tx_locus()
        shift = 4 * len(placed.get_overlap(tx_locus))
        placed.append(utils.make_search_locus(tx_locus, plot_buffer, plot_buffer))

        # the label sits at the transcript end matching the gene's strand
        label_coord = tx_locus.start if gene.sense() == "+" else tx_locus.end
        label_rows.append([display_name, to_plot_x(label_coord), -2 - shift])

        # a line across the entire transcript
        left, right = (to_plot_x(coord) for coord in tx_locus.coords())
        box_rows.append([left, -0.01 - shift, right, 0.01 - shift])

        # thin boxes for transcript exons, then fat boxes for coding exons
        for exons, half_height in ((gene.tx_exons(), 0.5), (gene.cd_exons(), 1)):
            if not exons:
                continue
            for exon in exons:
                left, right = (to_plot_x(coord) for coord in exon.coords())
                box_rows.append(
                    [left, -half_height - shift, right, half_height - shift]
                )

    utils.unparse_table(
        box_rows,
        os.path.join(out_folder, "{}_diagramTemp.txt".format(label)),
        "\t",
    )
    utils.unparse_table(
        label_rows,
        os.path.join(out_folder, "{}_nameTemp.txt".format(label)),
        "\t",
    )
Example #8
0
def map_bam_to_gff_line(
    bam_file, mmr, name, gff_line, color, n_bins, sense="both", extension=200
):
    """Map reads from a bam onto one gff region, binned into ``n_bins`` values.

    Args:
        bam_file: path of the bam; handed to ``bamliquidator`` literally.
        mmr: scaling denominator applied to every bin.
        name: label stored in the third field of the returned line.
        gff_line: gff row; columns 0, 1, 3, 4 and 6 are read.
        color: list appended after the four leading fields.
        n_bins: number of bins requested from ``bamliquidator``.
        sense: "+" maps reads on the region's own strand, "-" on the opposite
            strand, anything else on both strands.
        extension: read extension passed to ``bamliquidator``.

    Returns:
        ``[bam basename, locus id, name, str(locus)] + color`` followed by the
        per-bin values ``round(raw / bin_size / mmr, 4)``.  When the region is
        shorter than ``n_bins`` the bins are ``"NA"`` instead.  For a "-"
        region the bins are reversed.

    Raises:
        FileNotFoundError: if the ``bamliquidator`` executable is not on PATH.
    """
    print("using a MMR/scaling denominator value of {}".format(mmr))

    line = gff_line[0:9]
    gff_locus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])

    # setting up the output clusterline
    bam_name = os.path.basename(bam_file)
    cluster_line = [bam_name, gff_locus.id, name, str(gff_locus)] + color

    bin_size = gff_locus.len() // n_bins
    # some regions will be too short to get info on
    # we just kick these back and abandon them
    if not bin_size:
        cluster_line += ["NA"] * int(n_bins)
        return cluster_line

    # flip the strand when antisense reads are requested ("." counts as "+")
    if sense == "-":
        bam_sense = gff_locus.sense.translate(str.maketrans("-+.", "+-+"))
    elif sense == "+":
        bam_sense = gff_locus.sense
    else:
        bam_sense = "."

    # Run bamliquidator with an argument list and no shell, so a bam path
    # containing spaces or shell metacharacters is passed through verbatim
    # instead of being split or interpreted.
    bam_command = [
        str(arg)
        for arg in (
            "bamliquidator",
            bam_file,
            gff_locus.chr,
            gff_locus.start,
            gff_locus.end,
            bam_sense,
            n_bins,
            extension,
        )
    ]
    completed = subprocess.run(
        bam_command,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # one value per output line; the piece after the final newline is dropped
    den_list = completed.stdout.decode("utf-8").split("\n")[:-1]

    # flip the density list if the actual gff region is -
    if gff_locus.sense == "-":
        den_list = den_list[::-1]

    # converting from units of total bp of read sequence per bin to rpm/bp
    cluster_line += [round(float(x) / bin_size / mmr, 4) for x in den_list]

    return cluster_line
Example #9
0
def split_regions(input_gff, tss_collection, mask_file=None):
    """Split gff regions at the boundaries of the tss loci they overlap.

    Args:
        input_gff: iterable of gff rows; columns 0, 1, 3 and 4 are read.
        tss_collection: locus collection queried for overlapping tss loci.
        mask_file: optional ``.bed`` or ``.gff`` path (extension matched
            case-insensitively); regions overlapping any mask locus are
            dropped.

    Returns:
        A list of gff rows.  A region with no tss overlap is returned as the
        original row object with column 7 set to 0 (the row is modified in
        place).  A region with tss overlap is cut at every tss boundary;
        pieces spanning fewer than 50 are discarded, and each kept piece is a
        new row with id ``<region id>_<n>`` and column 7 set to 1 when the
        piece overlaps a tss locus, else 0.

    Raises:
        ValueError: if ``mask_file`` has an extension other than bed or gff.
    """
    mask_collection = None
    if mask_file:
        print("USING MASK FILE {}".format(mask_file))
        mask_ext = mask_file.split(".")[-1].upper()
        if mask_ext == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_ext == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            # fail here instead of on an unbound mask_gff a few lines below
            raise ValueError("MASK MUST BE A .gff or .bed FILE")

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(len(mask_collection)))

    split_gff = []
    for line in input_gff:
        chrom = line[0]
        region_id = line[1]
        line_locus = utils.Locus(chrom, line[3], line[4], ".")

        # mask regions
        if mask_collection is not None and mask_collection.get_overlap(
            line_locus, "both"
        ):
            continue

        overlapping_loci = tss_collection.get_overlap(line_locus)
        if not overlapping_loci:
            # no tss overlap: keep the region whole, flagged as non-tss
            line[7] = 0
            split_gff.append(line)
            continue

        local_tss_collection = utils.LocusCollection(overlapping_loci, 50)

        # Every region/tss boundary is a potential cut point.  Work on a copy
        # so the list handed back by coords() is never extended in place.
        breakpoints = list(line_locus.coords())
        for tss_locus in overlapping_loci:
            breakpoints += tss_locus.coords()
        breakpoints = utils.uniquify(breakpoints)
        breakpoints.sort()

        # each piece stops one short of the next breakpoint, so bump the last
        # breakpoint by 1 to keep the final piece ending on the true end
        breakpoints[-1] += 1

        region_ticker = 1
        for left, right in zip(breakpoints, breakpoints[1:]):
            start = int(left)
            stop = int(right) - 1
            if (stop - start) < 50:  # this eliminates really tiny regions
                continue
            split_locus = utils.Locus(chrom, start + 1, stop, ".")
            if not line_locus.overlaps(split_locus):
                continue

            new_id = "{}_{}".format(region_id, region_ticker)
            tss_status = 1 if local_tss_collection.get_overlap(split_locus) else 0
            split_gff.append([
                chrom,
                new_id,
                new_id,
                start,
                stop,
                "",
                ".",
                tss_status,
                new_id,
            ])
            region_ticker += 1

    return split_gff
Example #10
0
def make_peak_table(
    param_dict,
    split_gff_path,
    average_table_path,
    start_dict,
    gene_list,
    genome_directory,
    tss_window,
    distal_window,
    tads_path="",
):
    """Make the final peak table with CpG, sequence, signal, ebox and gene info.

    Args:
        param_dict: must provide ``"cpg_path"``, a tab-delimited table whose
            columns 0-2 give chromosome/start/stop and whose last column is
            used as the locus id.
        split_gff_path: gff of peak regions; columns 0, 1, 3, 4 and 7 are read.
        average_table_path: signal table with one leading row that is skipped;
            row ``i + 1`` must belong to gff row ``i`` and columns 2 onward
            are read as floats.
        start_dict: keyed by gene id; each entry provides ``"name"``.
        gene_list: gene ids to consider; when empty, every key of
            ``start_dict`` is used.
        genome_directory: passed to ``utils.fetch_seq``.
        tss_window: flank passed to ``utils.make_tss_locus`` for the
            overlapping-gene call.
        distal_window: flank passed to ``utils.make_tss_locus`` for the
            proximal-gene call.
        tads_path: optional path loaded with ``utils.import_bound_region``.
            When a peak's TADs contain genes, its gene assignments are limited
            to those genes.

    Returns:
        The table as a list of rows, header first.

    Exits the process via ``sys.exit()`` when no sequence can be fetched for a
    region.
    """
    peak_table = [[
        "REGION_ID",
        "CHROM",
        "START",
        "STOP",
        "LENGTH",
        "TSS",
        "CPG",
        "CPG_FRACTION",
        "GC_FREQ",
        "SIGNAL",
        "CANON_EBOX_COUNT",
        "NON_CANON_EBOX_COUNT",
        "TOTAL_EBOX_COUNT",
        "OVERLAPPING_GENES",
        "PROXIMAL_GENES",
    ]]

    print("LOADING PEAK REGIONS")
    peak_gff = utils.parse_table(split_gff_path, "\t")

    print("LOADING BINDING DATA")
    signal_table = utils.parse_table(average_table_path, "\t")

    print("LOADING CPGS ISLANDS")
    cpg_bed = utils.parse_table(param_dict["cpg_path"], "\t")
    cpg_loci = [
        utils.Locus(line[0], line[1], line[2], ".", line[-1])
        for line in cpg_bed
    ]
    cpg_collection = utils.LocusCollection(cpg_loci, 50)

    print("MAKING TSS COLLECTIONS")
    if not gene_list:
        gene_list = list(start_dict)

    tss_prox_loci = [
        utils.make_tss_locus(ref_id, start_dict, tss_window, tss_window)
        for ref_id in gene_list
    ]
    tss_distal_loci = [
        utils.make_tss_locus(ref_id, start_dict, distal_window, distal_window)
        for ref_id in gene_list
    ]

    # one collection per flank size: tss_window and distal_window
    tss_prox_collection = utils.LocusCollection(tss_prox_loci, 50)
    tss_distal_collection = utils.LocusCollection(tss_distal_loci, 50)

    use_tads = bool(tads_path)
    # tad id -> ids of the genes whose tss-proximal locus overlaps that tad
    tad_dict = defaultdict(list)
    if use_tads:
        print("LOADING TADS FROM {}".format(tads_path))
        tad_collection = utils.import_bound_region(tads_path, "tad")
        for tss_locus in tss_prox_loci:
            for tad_locus in tad_collection.get_overlap(tss_locus, "both"):
                tad_dict[tad_locus.id].append(tss_locus.id)

    print("CLASSIFYING PEAKS")
    ebox_pattern = re.compile(r"CA..TG")

    # regions for which TADs are in use but yielded no genes
    no_tad_count = 0
    for i, gff_line in enumerate(peak_gff):
        if not i % 1000:
            print(i)

        # getting the particulars of the region
        peak_id = gff_line[1]
        chrom = gff_line[0]
        start = int(gff_line[3])
        stop = int(gff_line[4])
        line_locus = utils.Locus(chrom, start, stop, ".", peak_id)

        # getting the mapped signal (the signal table has one extra leading row)
        signal_line = signal_table[i + 1]
        signal_vector = [float(x) for x in signal_line[2:]]

        # setting up the new line
        new_line = [peak_id, chrom, start, stop, line_locus.len()]

        # the tss status is carried in column 7 of the gff itself
        new_line.append(gff_line[7])

        # cpg status and fractional cpg overlap, from a single overlap query
        overlapping_cpg_loci = cpg_collection.get_overlap(line_locus, "both")
        new_line.append(1 if overlapping_cpg_loci else 0)

        overlapping_bases = sum(
            min(locus.end, line_locus.end) - max(locus.start, line_locus.start)
            for locus in overlapping_cpg_loci
        )
        overlap_fraction = float(overlapping_bases) / line_locus.len()
        new_line.append(round(overlap_fraction, 2))

        # now get the seq
        line_seq = utils.fetch_seq(genome_directory, chrom, start, stop,
                                   True).upper()
        if not line_seq:
            print("UH OH")
            print(line_seq)
            print(gff_line)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            sys.exit()

        # NOTE: this counts "GC" and "CG" dinucleotide occurrences per base of
        # sequence, not the fraction of G/C bases.
        gc_freq = float(line_seq.count("GC") +
                        line_seq.count("CG")) / len(line_seq)
        new_line.append(gc_freq)

        # this is where we add the ChIP-seq signal
        new_line += signal_vector

        # non-overlapping CA..TG matches; CACGTG is the canonical ebox
        ebox_match_list = ebox_pattern.findall(line_seq)
        total_count = len(ebox_match_list)
        canon_count = ebox_match_list.count("CACGTG")
        new_line += [canon_count, total_count - canon_count, total_count]

        # genes sharing a tad with this peak (empty when tads are not in use)
        tad_genes = []
        if use_tads:
            for tad_locus in tad_collection.get_overlap(line_locus, "both"):
                tad_genes += tad_dict[tad_locus.id]
            if not tad_genes:
                no_tad_count += 1

        # overlapping genes: tss-proximal locus overlaps the peak
        # proximal genes: tss-distal locus overlaps the peak
        prox_hits = tss_prox_collection.get_overlap(line_locus, "both")
        distal_hits = tss_distal_collection.get_overlap(line_locus, "both")
        if tad_genes:
            # restrict both calls to genes inside the peak's tads
            tad_gene_ids = set(tad_genes)
            prox_hits = [
                locus for locus in prox_hits if locus.id in tad_gene_ids
            ]
            distal_hits = [
                locus for locus in distal_hits if locus.id in tad_gene_ids
            ]

        overlapping_genes = utils.uniquify(
            [start_dict[locus.id]["name"] for locus in prox_hits])

        # overlap takes priority over proximal
        overlapping_names = set(overlapping_genes)
        proximal_genes = utils.uniquify([
            gene for gene in (
                start_dict[locus.id]["name"] for locus in distal_hits)
            if gene not in overlapping_names
        ])

        new_line += [",".join(overlapping_genes), ",".join(proximal_genes)]

        peak_table.append(new_line)

    # Report regions that DID get tad genes; exclude the header row from the
    # total.  Without tads nothing can be assigned.
    region_count = len(peak_table) - 1
    assigned_count = region_count - no_tad_count if use_tads else 0
    print("Out of {} regions, {} were assigned to at least 1 tad".format(
        str(region_count),
        str(assigned_count),
    ))
    return peak_table