Example #1
def collapse_region_map(region_map_file, name="", control_bams=False):
    """Take a region_map file and collapse the signal columns into a single column.

    Also fixes any start/stop ordering issues. Takes into account whether or not
    control bams were used (paired background subtraction).

    """
    region_map = utils.parse_table(region_map_file, "\t")

    for n, line in enumerate(region_map):
        if n == 0:
            # new header
            if len(name) == 0:
                name = "MERGED_SIGNAL"
            region_map[n] = line[0:6] + [name]

        else:
            new_line = list(line[0:6])
            if control_bams:
                signal_line = [float(x) for x in line[6:]]
                rankby_indexes = range(len(signal_line) // 2)
                control_indexes = range(len(signal_line) // 2, len(signal_line))
                meta_vector = []
                for i, j in zip(rankby_indexes, control_indexes):
                    # min signal is 0
                    meta_vector.append(max(0, signal_line[i] - signal_line[j]))
                meta_signal = numpy.mean(meta_vector)
            else:
                meta_signal = numpy.mean([float(x) for x in line[6:]])
            region_map[n] = new_line + [meta_signal]

    output_file = region_map_file.replace("REGION", "META")
    utils.unparse_table(region_map, output_file, "\t")
    return output_file
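
A minimal standalone sketch of the collapse arithmetic used above when controls are present; the column layout and values here are hypothetical, and only numpy is required:

import numpy

# hypothetical signal row: three rankby columns followed by three control columns
signal_line = [4.0, 2.5, 3.0, 1.0, 3.5, 0.5]
half = len(signal_line) // 2
# pair each rankby column with its control column and floor at zero,
# mirroring max(0, rankby - control) in collapse_region_map
meta_vector = [max(0.0, signal_line[i] - signal_line[i + half]) for i in range(half)]
print(numpy.mean(meta_vector))  # mean of [3.0, 0.0, 2.5] -> 1.8333...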
Example #2
def map_gff_line_to_bed(gff_line, out_folder, n_bins, bed_collection, header=""):
    """For every line produces a file with all of the rectangles to draw."""
    if not header:
        gff_string = "{}_{}_{}_{}".format(
            gff_line[0], gff_line[6], gff_line[3], gff_line[4]
        )
    else:
        gff_string = header
    diagram_table = [[0, 0, 0, 0]]
    name_table = [["", 0, 0]]
    gff_locus = utils.Locus(
        gff_line[0], int(gff_line[3]), int(gff_line[4]), gff_line[6], gff_line[1],
    )

    scale_factor = n_bins / gff_locus.len()

    overlap_loci = bed_collection.get_overlap(gff_locus, sense="both")
    print(
        "IDENTIFIED {} OVERLAPPING BED LOCI FOR REGION {}".format(
            str(len(overlap_loci)), gff_line,
        )
    )

    # since beds come from multiple sources, we want to figure out how to offset them
    offset_dict = {}  # this will store each ID name
    bed_names_list = utils.uniquify([locus.id for locus in overlap_loci])
    bed_names_list.sort()
    for i, bed_name in enumerate(bed_names_list):
        # offsets different categories of bed regions
        offset_dict[bed_name] = 2 * i

    if gff_line[6] == "-":
        ref_point = int(gff_line[4])
    else:
        ref_point = int(gff_line[3])

    # fill out the name table
    for name in bed_names_list:
        offset = offset_dict[name]
        name_table.append([name, 0, 0.0 - offset])

    for bed_locus in overlap_loci:

        offset = offset_dict[bed_locus.id]

        [start, stop] = [abs(x - ref_point) * scale_factor for x in bed_locus.coords()]

        diagram_table.append([start, -0.5 - offset, stop, 0.5 - offset])

    utils.unparse_table(
        diagram_table,
        os.path.join(out_folder, "{}_bedDiagramTemp.txt".format(gff_string)),
        "\t",
    )
    utils.unparse_table(
        name_table,
        os.path.join(out_folder, "{}_bedNameTemp.txt".format(gff_string)),
        "\t",
    )
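
A quick sketch of the coordinate scaling above, with a hypothetical region and bed feature; a bin index is bins-per-base-pair times the distance from the reference point:

# hypothetical region chr1:1,000-11,000 drawn across 100 bins
n_bins = 100
region_start, region_end = 1000, 11000
scale_factor = n_bins / (region_end - region_start)  # bins per base pair

ref_point = region_start  # for "-" strand regions the reference is region_end
bed_start, bed_stop = 3000, 4000  # a hypothetical overlapping bed feature
start, stop = [abs(x - ref_point) * scale_factor for x in (bed_start, bed_stop)]
print(start, stop)  # 20.0 30.0 -> a rectangle spanning bins 20 to 30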
Example #3
def make_enhancer_signal_table(name_dict, merged_region_map, median_dict,
                               analysis_name, genome, output_folder):
    """Makes a signal table.

    Each row is an enhancer and each column is the log2 background corrected signal vs. median.

    """
    # load in the region map
    region_map = utils.parse_table(merged_region_map, "\t")
    names_list = list(name_dict.keys())
    names_list.sort()
    signal_table = [[
        "REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"
    ] + names_list]

    print("len of {} for names_list".format(len(names_list)))
    print(names_list)

    for line in region_map[1:]:
        new_line = line[0:6]
        # a little tricky here to add datasets sequentially
        i = 6  # start w/ the first column w/ data
        for name in names_list:
            if name_dict[name]["background"] is True:
                enhancer_index = int(i)
                i += 1
                control_index = int(i)
                i += 1
                try:
                    enhancer_signal = float(line[enhancer_index]) - float(
                        line[control_index])
                except IndexError:
                    print(line)
                    print(len(line))
                    print(enhancer_index)
                    print(control_index)
                    sys.exit()

            else:
                enhancer_index = int(i)
                i += 1
                enhancer_signal = float(line[enhancer_index])

            if enhancer_signal < 0:
                enhancer_signal = 0
            enhancer_signal = enhancer_signal / median_dict[name]
            new_line.append(enhancer_signal)

        signal_table.append(new_line)

    output_file = os.path.join(
        output_folder, "{}_{}_signal_table.txt".format(genome, analysis_name))
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO {}".format(output_file))
    utils.unparse_table(signal_table, output_file, "\t")

    return output_file
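
A standalone sketch of the sequential column walk above, using a hypothetical two-dataset layout where only the first dataset has a paired control column:

# SAMPLE_A has a background column, SAMPLE_B does not
name_dict = {"SAMPLE_A": {"background": True}, "SAMPLE_B": {"background": False}}
median_dict = {"SAMPLE_A": 2.0, "SAMPLE_B": 4.0}
line = ["region_1", "chr1", "100", "600", "3", "450", "8.0", "1.5", "6.0"]

i = 6  # first data column
signals = []
for name in sorted(name_dict):
    if name_dict[name]["background"]:
        signal = float(line[i]) - float(line[i + 1])  # rankby minus control
        i += 2
    else:
        signal = float(line[i])
        i += 1
    signals.append(max(signal, 0.0) / median_dict[name])
print(signals)  # [3.25, 1.5]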
Example #4
def make_signal_table(
    names_list, gff_file, mapped_folder, median_norm=False, output=""
):
    """For each sample, make a dictionary keyed by locus ID."""
    signal_dict = {}
    for name in names_list:
        signal_dict[name] = defaultdict(float)

    # now start filling in the signal dict
    gff_name = os.path.basename(gff_file).split(".")[0]
    print(gff_name)
    for name in names_list:
        print("MAKING SIGNAL DICT FOR %s" % (name))

        # open the batch mapping output
        mapped_file = os.path.join(
            mapped_folder, gff_name, "{}_{}.txt".format(gff_name, name)
        )
        if utils.check_output(mapped_file, 0.02, 0.02):
            print("FOUND MAPPED FILE FOR {} AT {}".format(name, mapped_file))
        else:
            print("ERROR NO MAPPED FILE FOUND FOR {}".format(name))
            sys.exit()

        mapped_table = utils.parse_table(mapped_file, "\t")
        if median_norm:
            median_signal = numpy.median([float(line[2]) for line in mapped_table[1:]])
        else:
            median_signal = 1

        for line in mapped_table[1:]:
            signal_dict[name][line[1]] = float(line[2]) / median_signal

    # now make the signal table
    signal_table = []
    header = ["GENE_ID", "locusLine"] + names_list
    signal_table.append(header)

    # the last parsed mapped_table provides the locus lines; this assumes every
    # sample was mapped against the same gff
    for line in mapped_table[1:]:
        locus_id = line[1]
        sig_line = line[0:2] + [signal_dict[name][locus_id] for name in names_list]
        signal_table.append(sig_line)

    if not output:
        return signal_table
    else:
        utils.unparse_table(signal_table, output, "\t")
        return signal_table
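
The median normalization step in isolation, on a hypothetical two-row mapped table:

from collections import defaultdict

import numpy

# hypothetical mapped table rows: [GENE_ID, locus_id, signal]
mapped_table = [["GENE_ID", "locusLine", "signal"],
                ["NM_001", "chr1(+):100-200", "4.0"],
                ["NM_002", "chr1(+):300-400", "8.0"]]
median_signal = numpy.median([float(line[2]) for line in mapped_table[1:]])
signal_dict = defaultdict(float)
for line in mapped_table[1:]:
    signal_dict[line[1]] = float(line[2]) / median_signal
print(dict(signal_dict))  # 4.0 and 8.0 divided by the median of 6.0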
Example #5
def merge_collections(name_dict, analysis_name, output="", super_only=True):
    """Merge the enhancer collections from each dataset."""
    all_loci = []
    names_list = list(name_dict.keys())
    for name in names_list:
        se_collection = make_se_collection(name_dict[name]["enhancer_file"],
                                           name, super_only)
        if super_only:
            print("DATASET: {} HAS {} SUPERENHANCERS".format(
                name, str(len(se_collection))))
        else:
            print("DATASET: {} HAS {} ENHANCERS".format(
                name, str(len(se_collection))))
        all_loci += se_collection.get_loci()

    print(str(len(all_loci)))

    merged_collection = utils.LocusCollection(all_loci, 50)

    # stitch the collection together
    stitched_collection = merged_collection.stitch_collection()
    stitched_loci = stitched_collection.get_loci()
    print("IDENTIFIED {} CONSENSUS ENHANCER REGIONS".format(
        str(len(stitched_loci))))

    # sort by size and provide a unique ID
    size_list = [locus.len() for locus in stitched_loci]
    size_order = utils.order(size_list, decreasing=True)
    ordered_loci = [stitched_loci[i] for i in size_order]
    for i, locus in enumerate(ordered_loci):
        locus.id = "merged_{}_{}".format(analysis_name, str(i + 1))

    merged_gff = []
    for locus in ordered_loci:
        new_line = [
            locus.chr,
            locus.id,
            "",
            locus.start,
            locus.end,
            "",
            locus.sense,
            "",
            locus.id,
        ]
        merged_gff.append(new_line)

    if len(output) == 0:
        return merged_gff
    else:
        print("writing merged gff to {}".format(output))
        utils.unparse_table(merged_gff, output, "\t")
        return output
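
A sketch of the size ordering and ID assignment above, assuming utils.order returns indexes the way a decreasing argsort would:

sizes = [1200, 5400, 300]  # hypothetical stitched locus lengths
size_order = sorted(range(len(sizes)), key=lambda i: sizes[i], reverse=True)
ids = ["merged_demo_{}".format(rank + 1) for rank in range(len(size_order))]
print(size_order, ids)  # [1, 0, 2] ['merged_demo_1', 'merged_demo_2', 'merged_demo_3']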
Example #6
def format_data_table(data_file):
    """Formats the data_file and rewrite.

    First 3 columns are required for every line. If they aren't there the line is deleted.

    """
    print("reformatting data table")

    data_table = utils.parse_table(data_file, "\t")

    new_data_table = [
        [
            "FILE_PATH",
            "UNIQUE_ID",
            "GENOME",
            "NAME",
            "BACKGROUND",
            "ENRICHED_REGION",
            "ENRICHED_MACS",
            "COLOR",
            "FASTQ_FILE",
        ]
    ]
    # first check to make sure the table is formatted correctly
    for line in data_table[1:]:
        if len(line) < 3:
            continue
        # this spots header lines that may be out of place
        if line[0] == "FILE_PATH":
            continue
        # check if it at least has the first 3 columns filled in
        if len(line[0]) == 0 or len(line[1]) == 0 or len(line[2]) == 0:
            print("ERROR required fields missing in line")
            print(line)
        # if the first three are filled in, check to make sure there are 8 columns
        else:
            if 3 < len(line) < 9:
                new_line = line + (8 - len(line)) * [""] + ["NA"]
                new_data_table.append(new_line)
            elif len(line) >= 9:
                new_line = line[0:9]
                new_data_table.append(new_line)

    # lower case all of the genomes and
    # set the color to 0,0,0 for lines with a blank color field
    for i in range(1, len(new_data_table)):
        new_data_table[i][2] = new_data_table[i][2].lower()
        color = new_data_table[i][7]
        if len(color) == 0:
            new_data_table[i][7] = "0,0,0"
    utils.unparse_table(new_data_table, data_file, "\t")

    return new_data_table
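
The padding rule in isolation: a hypothetical 5-column line is brought up to the 9-column schema above with blank fields plus a trailing "NA":

line = ["/data/a.bam", "A1", "HG19", "SAMPLE_A", "NONE"]  # hypothetical row
if 3 < len(line) < 9:
    line = line + (8 - len(line)) * [""] + ["NA"]
print(line, len(line))  # 9 columns total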
Example #7
def assign_enhancer_rank(enhancer_to_gene_file,
                         enhancer_file1,
                         enhancer_file2,
                         name1,
                         name2,
                         rank_output=""):
    """Assign enhancer rank to genes.

    For all genes in the enhancer_to_gene table, assign the highest overlapping ranked enhancer
    in the other tables.

    """
    enhancer_to_gene = utils.parse_table(enhancer_to_gene_file, "\t")

    enhancer_collection1 = make_se_collection(enhancer_file1, name1, False)
    enhancer_collection2 = make_se_collection(enhancer_file2, name2, False)

    enhancer_dict1 = make_se_dict(enhancer_file1, name1, False)
    enhancer_dict2 = make_se_dict(enhancer_file2, name2, False)

    # we're going to update the enhancer_to_gene_table
    enhancer_to_gene[0] += ["{}_rank".format(name1), "{}_rank".format(name2)]
    for i in range(1, len(enhancer_to_gene)):
        line = enhancer_to_gene[i]
        locus_line = utils.Locus(line[1], line[2], line[3], ".", line[0])

        # if the enhancer doesn't exist, its ranking is dead last on the enhancer list
        enhancer1_overlap = enhancer_collection1.get_overlap(
            locus_line, "both")
        if len(enhancer1_overlap) == 0:
            enhancer1_rank = len(enhancer_collection1)
        else:
            rank_list1 = [
                enhancer_dict1[x.id]["rank"] for x in enhancer1_overlap
            ]
            enhancer1_rank = min(rank_list1)

        enhancer2_overlap = enhancer_collection2.get_overlap(
            locus_line, "both")
        if len(enhancer2_overlap) == 0:
            enhancer2_rank = len(enhancer_collection2)
        else:
            rank_list2 = [
                enhancer_dict2[x.id]["rank"] for x in enhancer2_overlap
            ]
            enhancer2_rank = min(rank_list2)
        enhancer_to_gene[i] += [enhancer1_rank, enhancer2_rank]

    if len(rank_output) == 0:
        return enhancer_to_gene
    else:
        utils.unparse_table(enhancer_to_gene, rank_output, "\t")
        return rank_output
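
The ranking rule in isolation: a gene takes the best (minimum) rank among overlapping enhancers and falls to dead last with no overlap; the numbers here are hypothetical:

overlap_ranks = [47, 12]  # ranks of enhancers overlapping a gene locus
total_enhancers = 5000
rank = min(overlap_ranks) if overlap_ranks else total_enhancers
print(rank)  # 12; with no overlap the gene would be ranked 5000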
Example #8
def merge_collections(super_file1, super_file2, name1, name2, output=""):
    """Merge two super-enhancer collections."""
    con_super_collection = make_se_collection(super_file1, name1)
    tnf_super_collection = make_se_collection(super_file2, name2)

    # now merge them
    merged_loci = con_super_collection.get_loci(
    ) + tnf_super_collection.get_loci()
    merged_collection = utils.LocusCollection(merged_loci, 50)

    # stitch the collection together
    stitched_collection = merged_collection.stitch_collection()
    stitched_loci = stitched_collection.get_loci()

    # loci that are in both get renamed with a new unique identifier
    renamed_loci = []
    ticker = 1
    for locus in stitched_loci:
        if len(con_super_collection.get_overlap(locus)) > 0 and len(
                tnf_super_collection.get_overlap(locus)) > 0:
            new_id = "CONSERVED_{}".format(str(ticker))
            ticker += 1
            locus.id = new_id
        else:
            locus.id = locus.id[2:]
        renamed_loci.append(locus)

    # now we turn this into a gff and write it out
    gff = utils.locus_collection_to_gff(utils.LocusCollection(
        renamed_loci, 50))

    if len(output) == 0:
        return gff
    else:
        print("writing merged gff to {}".format(output))
        utils.unparse_table(gff, output, "\t")
        return output
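
A sketch of the renaming pass above, assuming each stitched ID carries a two-character prefix that gets stripped from non-conserved loci:

stitched = [("1_locus10", True), ("2_locus11", False)]  # (id, found in both?)
ticker = 1
renamed = []
for locus_id, in_both in stitched:
    if in_both:
        renamed.append("CONSERVED_{}".format(ticker))
        ticker += 1
    else:
        renamed.append(locus_id[2:])
print(renamed)  # ['CONSERVED_1', 'locus11']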
Example #9
def finish_rank_output(
    data_file,
    rank_output,
    genome,
    merge_folder,
    merge_name,
    name1,
    name2,
    cut_off=1.5,
    window=100000,
    super_only=True,
    plot_bam=True,
):
    """Finish rank output.

    Clean up the rank output table. Make a gff of all of the gained/lost supers beyond a certain
    cut_off w/ a window. Make a list of gained genes and lost genes. Make a bed of gained loss.

    """
    data_dict = pipeline_utils.load_data_table(data_file)
    # making sure window and cut_off are int/float
    cut_off = float(cut_off)
    window = int(window)
    genome = genome.upper()

    # make the output folder
    output_folder = utils.format_folder(os.path.join(merge_folder, "output"),
                                        True)

    # bring in the old rank table
    rank_enhancer_table = utils.parse_table(rank_output, "\t")

    # make a new formatted table
    header = rank_enhancer_table[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formatted_rank_table = [header]

    # the gffs
    gained_gff = []
    lost_gff = []

    gained_window_gff = []
    lost_window_gff = []

    if super_only:
        enhancer_type = "SUPERS"
    else:
        enhancer_type = "ENHANCERS"

    # the beds
    if super_only:
        gained_track_header = (
            'track name="{} {} only SEs" description="{} super enhancers that are found only in '
            '{} vs {}" itemRGB=On color=255,0,0'.format(
                genome, name2, genome, name2, name1))
        gained_bed = [[gained_track_header]]
        conserved_track_header = (
            'track name="{} {} and {} SEs" description="{} super enhancers that are found in both'
            ' {} vs {}" itemRGB=On color=0,0,0'.format(genome, name1, name2,
                                                       genome, name1, name2))
        conserved_bed = [[conserved_track_header]]

        lost_track_header = (
            'track name="{} {} only SEs" description="{} super enhancers that are found only in '
            '{} vs {}" itemRGB=On color=0,255,0'.format(
                genome, name1, genome, name1, name2))
        lost_bed = [[lost_track_header]]
    else:
        gained_track_header = (
            'track name="{} {} only enhancers" description="{} enhancers that are found only in '
            '{} vs {}" itemRGB=On color=255,0,0'.format(
                genome, name2, genome, name2, name1))
        gained_bed = [[gained_track_header]]
        conserved_track_header = (
            'track name="{} {} and {} enhancers" description="{} enhancers that are found in both'
            ' {} vs {}" itemRGB=On color=0,0,0'.format(genome, name1, name2,
                                                       genome, name1, name2))
        conserved_bed = [[conserved_track_header]]

        lost_track_header = (
            'track name="{} {} only enhancers" description="{} enhancers that are found only in '
            '{} vs {}" itemRGB=On color=0,255,0'.format(
                genome, name1, genome, name1, name2))
        lost_bed = [[lost_track_header]]

    # the genes
    gene_table = [[
        "GENE",
        "ENHANCER_ID",
        "ENHANCER_CHROM",
        "ENHANCER_START",
        "ENHANCER_STOP",
        header[6],
        header[7],
        header[8],
        "STATUS",
    ]]

    for line in rank_enhancer_table[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formatted_rank_table.append(line)

        # getting the genes
        gene_list = []
        gene_list += line[9].split(",")
        gene_list += line[10].split(",")
        gene_list += line[11].split(",")
        gene_list = [x for x in gene_list if len(x) > 0]
        gene_list = utils.uniquify(gene_list)
        gene_string = ",".join(gene_list)

        bed_line = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cut_off:
            gff_line = [
                line[1],
                line[0],
                "",
                line[2],
                line[3],
                "",
                ".",
                "",
                gene_string,
            ]
            gff_window_line = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                gene_string,
            ]
            gained_gff.append(gff_line)
            gained_window_gff.append(gff_window_line)
            gene_status = name2
            gained_bed.append(bed_line)
        # for lost
        elif float(line[6]) < (-1 * cut_off):
            gff_line = [
                line[1],
                line[0],
                "",
                line[2],
                line[3],
                "",
                ".",
                "",
                gene_string,
            ]
            gff_window_line = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                gene_string,
            ]
            lost_gff.append(gff_line)
            lost_window_gff.append(gff_window_line)
            gene_status = name1
            lost_bed.append(bed_line)
        # for conserved
        else:
            gene_status = "CONSERVED"
            conserved_bed.append(bed_line)

        # now fill in the gene Table
        for gene in gene_list:
            gene_table_line = [
                gene,
                line[0],
                line[1],
                line[2],
                line[3],
                line[6],
                line[7],
                line[8],
                gene_status,
            ]
            gene_table.append(gene_table_line)

    # concat the bed
    full_bed = gained_bed + conserved_bed + lost_bed

    # start writing the output
    # there's the two gffs, the bed,the formatted table, the gene table

    # formatted table
    formatted_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_RANK_TABLE.txt".format(genome, merge_name,
                                                enhancer_type),
    )
    utils.unparse_table(formatted_rank_table, formatted_filename, "\t")

    # gffs
    gff_folder = utils.format_folder(os.path.join(output_folder, "gff"), True)
    gff_filename_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name2.upper(),
                                            enhancer_type),
    )
    gff_filename_window_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome,
            merge_name,
            name2.upper(),
            enhancer_type,
            str(window // 1000),
            str(window // 1000),
        ),
    )

    gff_filename_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name1.upper(),
                                            enhancer_type),
    )
    gff_filename_window_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome,
            merge_name,
            name1.upper(),
            enhancer_type,
            str(window // 1000),
            str(window // 1000),
        ),
    )

    utils.unparse_table(gained_gff, gff_filename_gained, "\t")
    utils.unparse_table(gained_window_gff, gff_filename_window_gained, "\t")

    utils.unparse_table(lost_gff, gff_filename_lost, "\t")
    utils.unparse_table(lost_window_gff, gff_filename_window_lost, "\t")

    # bed
    bed_filename = os.path.join(
        output_folder, "{}_{}_MERGED_{}.bed".format(genome, merge_name,
                                                    enhancer_type))
    utils.unparse_table(full_bed, bed_filename, "\t")

    # gene_table
    gene_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_GENE_TABLE.txt".format(genome, merge_name,
                                                enhancer_type),
    )
    utils.unparse_table(gene_table, gene_filename, "\t")

    # finally, move all of the plots to the output folder
    copyfile(
        glob.glob(os.path.join(merge_folder, "{}_ROSE".format(name1),
                               "*.pdf"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_DELTA.pdf".format(genome, merge_name,
                                               enhancer_type),
        ),
    )

    copyfile(
        glob.glob(
            os.path.join(merge_folder, "{}_ROSE".format(name1),
                         "*RANK_PLOT.png"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_RANK_PLOT.png".format(genome, merge_name,
                                                   enhancer_type),
        ),
    )

    # now execute the bamPlot_turbo commands
    if plot_bam:
        bam1 = data_dict[name1]["bam"]
        bam2 = data_dict[name2]["bam"]
        bam_string = "{} {}".format(bam1, bam2)
        name_string = "{} {}".format(name1, name2)
        color_string = "0,0,0:100,100,100"

        if len(gained_gff) > 0:
            # gained command
            plot_title = "{}_ONLY_SE".format(name2)
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_gained,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

            # gained window command
            plot_title = "{}_ONLY_SE_{}KB_WINDOW".format(
                name2, str(window // 1000))
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_window_gained,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

        if len(lost_gff) > 0:
            # lost command
            plot_title = "{}_ONLY_SE".format(name1)
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_lost,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

            # lost command
            plot_title = "{}_ONLY_SE_{}KB_WINDOW".format(
                name1, str(window // 1000))
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_filename_window_lost,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

    return
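
The three-way gained/lost/conserved classification above, in isolation, with hypothetical delta values and the default cut_off of 1.5:

cut_off = 1.5
for delta in (2.3, -0.4, -1.9):
    if delta > cut_off:
        status = "GAINED"
    elif delta < -cut_off:
        status = "LOST"
    else:
        status = "CONSERVED"
    print(delta, status)  # 2.3 GAINED / -0.4 CONSERVED / -1.9 LOST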
Example #10
def map_gff_line_to_annot(
    gff_line, out_folder, n_bins, gene_dict, tx_collection, sense="both", header=""
):
    """For every line produces a file with all of the rectangles to draw."""
    if not header:
        gff_string = "{}_{}_{}_{}".format(
            gff_line[0], gff_line[6], gff_line[3], gff_line[4]
        )
    else:
        gff_string = header
    diagram_table = [[0, 0, 0, 0]]
    name_table = [["", 0, 0]]
    gff_locus = utils.Locus(
        gff_line[0], int(gff_line[3]), int(gff_line[4]), gff_line[6], gff_line[1],
    )
    scale_factor = n_bins / gff_locus.len()
    # plotting buffer for diagrams
    plot_buffer = int(gff_locus.len() / n_bins * 20)

    overlap_loci = tx_collection.get_overlap(gff_locus, sense=sense)
    gene_list = [locus.id for locus in overlap_loci]

    if gff_line[6] == "-":
        ref_point = int(gff_line[4])
    else:
        ref_point = int(gff_line[3])
    offset_collection = utils.LocusCollection([], 500)
    for gene_id in gene_list:

        gene = gene_dict[gene_id]

        print(gene.common_name())
        if len(gene.common_name()) > 1:
            name = gene.common_name()
        else:
            name = gene_id
        offset = 4 * len(offset_collection.get_overlap(gene.tx_locus()))
        offset_collection.append(
            utils.make_search_locus(gene.tx_locus(), plot_buffer, plot_buffer,)
        )
        # write the name of the gene down
        if gene.sense() == "+":
            gene_start = gene.tx_locus().start
        else:
            gene_start = gene.tx_locus().end
        gene_start = abs(gene_start - ref_point) * scale_factor
        name_table.append([name, gene_start, -2 - offset])

        # draw a line across the entire txLocus
        [start, stop] = [
            abs(x - ref_point) * scale_factor for x in gene.tx_locus().coords()
        ]
        diagram_table.append([start, -0.01 - offset, stop, 0.01 - offset])

        # now draw thin boxes for all tx_exons
        if gene.tx_exons():
            for tx_exon in gene.tx_exons():

                [start, stop] = [
                    abs(x - ref_point) * scale_factor for x in tx_exon.coords()
                ]

                diagram_table.append([start, -0.5 - offset, stop, 0.5 - offset])

        # now draw fatty boxes for the coding exons if any
        if gene.cd_exons():
            for cd_exon in gene.cd_exons():

                [start, stop] = [
                    abs(x - ref_point) * scale_factor for x in cd_exon.coords()
                ]

                diagram_table.append([start, -1 - offset, stop, 1 - offset])

    utils.unparse_table(
        diagram_table,
        os.path.join(out_folder, "{}_diagramTemp.txt".format(gff_string)),
        "\t",
    )
    utils.unparse_table(
        name_table,
        os.path.join(out_folder, "{}_nameTemp.txt".format(gff_string)),
        "\t",
    )
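
A simplified sketch of the offset stacking above: each gene drops 4 units for every previously placed gene it overlaps, so overlapping transcripts stack instead of drawing over each other (the real code tracks this with a LocusCollection and a plot buffer):

placed = []  # (start, stop) intervals already drawn
for start, stop in [(0, 50), (40, 90), (100, 140)]:
    overlaps = sum(1 for s, e in placed if s < stop and start < e)
    offset = 4 * overlaps
    placed.append((start, stop))
    print((start, stop), "offset", offset)  # offsets 0, 4, 0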
Example #11
def map_collection(
    stitched_collection,
    reference_collection,
    bam_file_list,
    mapped_folder,
    output,
    ref_name,
):
    """Makes a table of factor density in a stitched locus.

    Rank table by number of loci stitched together.

    """
    print("FORMATTING TABLE")
    loci = list(stitched_collection.get_loci())

    locus_table = [[
        "REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"
    ]]

    # strip out any that are in chrY; filter into a new list rather than
    # removing elements while iterating, which would skip loci
    loci = [locus for locus in loci if locus.chr != "chrY"]

    loci_len_list = [locus.len() for locus in loci]
    len_order = utils.order(loci_len_list, decreasing=True)
    ticker = 0
    for i in len_order:
        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        ref_enrich_size = 0
        ref_overlapping_loci = reference_collection.get_overlap(locus, "both")
        for ref_locus in ref_overlapping_loci:
            ref_enrich_size += ref_locus.len()

        try:
            stitch_count = int(locus.id.split("_")[0])
        except ValueError:
            stitch_count = 1
        coords = [int(x) for x in locus.coords()]

        locus_table.append([
            locus.id,
            locus.chr,
            min(coords),
            max(coords),
            stitch_count,
            ref_enrich_size,
        ])

    print("GETTING MAPPED DATA")
    print("USING A bam_file LIST:")
    print(bam_file_list)
    for bam_file in bam_file_list:

        bam_file_name = os.path.basename(bam_file)

        print("GETTING MAPPING DATA FOR  {}".format(bam_file))
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        mapped_gff_file = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(ref_name, bam_file_name),
            "matrix.txt")
        print("OPENING {}".format(mapped_gff_file))

        mapped_gff = utils.parse_table(mapped_gff_file, "\t")

        signal_dict = defaultdict(float)
        print("MAKING SIGNAL DICT FOR {}".format(bam_file))
        mapped_loci = []
        for line in mapped_gff[1:]:

            chrom = line[1].split("(")[0]
            start = int(line[1].split(":")[-1].split("-")[0])
            end = int(line[1].split(":")[-1].split("-")[1])
            mapped_loci.append(utils.Locus(chrom, start, end, ".", line[0]))
            try:
                signal_dict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print("WARNING NO SIGNAL FOR LINE:")
                print(line)
                continue

        mapped_collection = utils.LocusCollection(mapped_loci, 500)
        locus_table[0].append(bam_file_name)

        for i in range(1, len(locus_table)):
            signal = 0.0
            line = locus_table[i]
            line_locus = utils.Locus(line[1], line[2], line[3], ".")
            overlapping_regions = mapped_collection.get_overlap(line_locus,
                                                                sense="both")
            for region in overlapping_regions:
                signal += signal_dict[region.id]
            locus_table[i].append(signal)

    utils.unparse_table(locus_table, output, "\t")
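
The locus-string parsing and signal weighting above, on a single hypothetical matrix row:

line = ["region_1", "chr1(+):100-200", "2.5"]  # hypothetical matrix row
chrom = line[1].split("(")[0]
start = int(line[1].split(":")[-1].split("-")[0])
end = int(line[1].split(":")[-1].split("-")[1])
signal = float(line[2]) * abs(end - start)  # density times width -> total signal
print(chrom, start, end, signal)  # chr1 100 200 250.0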
Example #12
def optimize_stitching(locus_collection, name, out_folder, step_size=500):
    """
    takes a locus collection and starts writing out stitching stats at step sized intervals
    """
    max_stitch = 15000  # set a hard wired match stitching parameter

    stitch_table = [[
        "STEP",
        "NUM_REGIONS",
        "TOTAL_CONSTIT",
        "TOTAL_REGION",
        "MEAN_CONSTIT",
        "MEDIAN_CONSTIT",
        "MEAN_REGION",
        "MEDIAN_REGION",
        "MEAN_STITCH_FRACTION",
        "MEDIAN_STITCH_FRACTION",
    ]]
    # first consolidate the collection
    locus_collection = locus_collection.stitch_collection(stitch_window=0)
    total_constit = sum([locus.len() for locus in locus_collection.get_loci()])
    step = 0
    while step <= max_stitch:

        print("Getting stitch stats for {} (bp)".format(step))
        stitch_collection = locus_collection.stitch_collection(
            stitch_window=step)
        num_regions = len(stitch_collection)
        stitch_loci = stitch_collection.get_loci()
        region_lengths = [locus.len() for locus in stitch_loci]
        total_region = sum(region_lengths)
        constit_lengths = []
        for locus in stitch_loci:
            constit_loci = locus_collection.get_overlap(locus)
            constit_lengths.append(
                sum([constit_locus.len() for constit_locus in constit_loci]))

        mean_constit = round(numpy.mean(constit_lengths), 2)
        median_constit = round(numpy.median(constit_lengths), 2)

        mean_region = round(numpy.mean(region_lengths), 2)
        median_region = round(numpy.median(region_lengths), 2)

        stitch_fractions = [
            float(constit_lengths[i]) / float(region_lengths[i])
            for i in range(len(region_lengths))
        ]
        mean_stitch_fraction = round(numpy.mean(stitch_fractions), 2)
        median_stitch_fraction = round(numpy.median(stitch_fractions), 2)

        new_line = [
            step,
            num_regions,
            total_constit,
            total_region,
            mean_constit,
            median_constit,
            mean_region,
            median_region,
            mean_stitch_fraction,
            median_stitch_fraction,
        ]

        stitch_table.append(new_line)

        step += step_size

    # write the stitch table to disk
    stitch_param_file = os.path.join(out_folder,
                                     "{}_stitch_params.tmp".format(name))
    utils.unparse_table(stitch_table, stitch_param_file, "\t")
    # call the rscript
    r_cmd = "Rscript {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_stitchOpt.R"),
        stitch_param_file,
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        name,
    )
    print(r_cmd)
    # get back the stitch parameter
    r_output = subprocess.Popen(r_cmd, stdout=subprocess.PIPE, shell=True)
    r_output_test = r_output.communicate()

    print(r_output_test)

    stitch_param = r_output_test[0].decode("utf-8").split("\n")[2]
    try:
        stitch_param = int(stitch_param)
    except ValueError:
        print("INVALID STITCHING PARAMETER. STITCHING OPTIMIZATION FAILED")
        sys.exit()

    # optionally delete the temporary table
    # os.system('rm -f %s' % (stitch_param_file))
    return stitch_param
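
The stitch fraction statistics in isolation, for three hypothetical stitched regions and their summed constituent lengths:

import numpy

region_lengths = [2000, 5000, 1000]
constit_lengths = [1500, 2000, 1000]
stitch_fractions = [c / r for c, r in zip(constit_lengths, region_lengths)]
print(round(numpy.mean(stitch_fractions), 2))    # 0.72
print(round(numpy.median(stitch_fractions), 2))  # 0.75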
Example #13
def main():
    """Main run method for enhancer promoter contribution tool."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a space separated list of .bam files for the main factor",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or .bed file of regions to analyze",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=(
            "specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently "
            "supported"),
        required=True,
    )
    parser.add_argument(
        "-p",
        "--chrom-path",
        dest="chrom_path",
        type=str,
        help=("Provide path to a folder with a seperate fasta file for each "
              "chromosome"),
        required=True,
    )
    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=("specify a table where first column represents a list of active "
              "refseq genes"),
        required=False,
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs="*",
        help=("Enter a space separated list of .bam files for background. If "
              "flagged, will perform background subtraction"),
        required=False,
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        type=int,
        help="Define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000,
    )
    parser.add_argument(
        "-d",
        "--distal",
        dest="distal",
        type=int,
        help="Enter a window to assign distal enhancer signal. Default is 50kb",
        required=False,
        default=50000,
    )
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs="*",
        help="enter a space separated list of other bams to map to",
        required=False,
    )
    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=
        ("enter a root name for the analysis, otherwise will try to find the "
         "name from the input file"),
        required=False,
    )
    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=
        ("Run the analysis on the top N genes by total signal. Default is 5000"
         ),
        required=False,
        default=5000,
    )
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=
        ("Include a .bed of tad regions to restrict enhancer/gene association"
         ),
        required=False,
        default=None,
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        default=None,
        help=(
            "Mask a set of regions from analysis.  Provide a .bed or .gff of "
            "masking regions"),
    )

    args = parser.parse_args()

    print(args)

    # =====================================================================================
    # ===============================I. PARSING ARGUMENTS==================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===========I. DATA SUMMARY============\n#="
        "=====================================\n")

    # top analysis subset
    top = args.top

    # input genome
    genome = args.genome.upper()
    print("PERFORMING ANALYSIS ON {} GENOME BUILD".format(genome))

    # set of bams
    bam_file_list = args.bam

    # bring in the input path
    input_path = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysis_name = args.name
    else:
        analysis_name = os.path.basename(input_path).split(".")[0]

    print("USING {} AS ANALYSIS NAME".format(analysis_name))
    # setting up the output folder
    parent_folder = utils.format_folder(args.output, True)
    output_folder = utils.format_folder(
        os.path.join(parent_folder, analysis_name), True)

    print("WRITING OUTPUT TO {}".format(output_folder))

    if input_path.split(".")[-1] == "bed":
        # type is bed
        print("input in bed format, converting to gff")
        input_gff = utils.bed_to_gff(input_path)
    else:
        input_gff = utils.parse_table(input_path, "\t")

    # the tss window for proximal signal assignment
    tss_window = int(args.tss)

    # the distal window for assigning nearby enhancer signal
    distal_window = int(args.distal)

    # activity path
    if args.activity:
        activity_path = args.activity
        activity_table = utils.parse_table(activity_path, "\t")
        ref_col = 0
        # try to find the column with the refseq id; use an internal row in
        # case the first row is a header
        for i in range(len(activity_table[1])):
            if str(activity_table[1][i]).count("NM_") or str(
                    activity_table[1][i]).count("NR_"):
                ref_col = i

        # now check for a header row by testing the refseq column
        if not str(activity_table[0][ref_col]).count("NM_") and not str(
                activity_table[0][ref_col]).count("NR_"):
            print("REMOVING HEADER FROM GENE TABLE:")
            print(activity_table[0])
            activity_table.pop(0)

        # this needs to be the REFSEQ NM ID
        gene_list = [line[ref_col] for line in activity_table]
        print("IDENTIFIED {} ACTIVE GENES".format(len(gene_list)))

    else:
        gene_list = []

    # check if tads are being invoked
    if args.tads:
        print("LOADING TAD LOCATIONS FROM {}".format(args.tads))
        tads_path = args.tads
    else:
        tads_path = ""

    print("LOADING ANNOTATION DATA FOR GENOME {}".format(genome))

    genome_dir = args.chrom_path

    # making a chrom_dict that is a list of all chroms with sequence
    chrom_list = utils.uniquify(
        [name.split(".")[0] for name in os.listdir(genome_dir) if name])

    # important here to define the window
    start_dict, tss_collection, mouse_convert_dict = load_annot_file(
        genome,
        tss_window,
        gene_list,
    )

    print("FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES")

    print(chrom_list)
    filtered_gff = [line for line in input_gff if chrom_list.count(line[0])]

    print("{} of INITIAL {} REGIONS ARE IN GOOD CHROMOSOMES".format(
        str(len(filtered_gff)),
        str(len(input_gff)),
    ))

    # =====================================================================================
    # ================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
    # =====================================================================================

    print(
        "\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#="
        "=====================================\n")

    # now we need to split the input region
    print("SPLITTING THE INPUT GFF USING A WINDOW OF {}".format(tss_window))
    split_gff = split_regions(filtered_gff,
                              tss_collection,
                              mask_file=args.mask)
    print(len(filtered_gff))
    print(len(split_gff))

    split_gff_path = os.path.join(output_folder,
                                  "{}_SPLIT.gff".format(analysis_name))
    utils.unparse_table(split_gff, split_gff_path, "\t")
    print("WRITING TSS SPLIT GFF OUT TO {}".format(split_gff_path))

    # now you have to map the bams to the gff
    print("MAPPING TO THE SPLIT GFF")
    mapped_folder = utils.format_folder(
        os.path.join(output_folder, "bam_mapping"), True)

    signal_table = map_bams(bam_file_list, split_gff_path, analysis_name,
                            mapped_folder)
    signal_table_path = os.path.join(
        output_folder, "{}_signal_table.txt".format(analysis_name))
    utils.unparse_table(signal_table, signal_table_path, "\t")

    if args.control:
        control_bam_file_list = args.control
        control_signal_table = map_bams(
            control_bam_file_list,
            split_gff_path,
            analysis_name,
            mapped_folder,
        )
        control_signal_table_path = os.path.join(
            output_folder,
            "{}_control_signal_table.txt".format(analysis_name),
        )
        utils.unparse_table(control_signal_table, control_signal_table_path,
                            "\t")

    # now create the background subtracted summarized average table
    print("CREATING AN AVERAGE SIGNAL TABLE")
    average_table = make_average_table(
        output_folder,
        analysis_name,
        use_background=bool(args.control),
    )
    average_table_path = os.path.join(
        output_folder, "{}_average_table.txt".format(analysis_name))
    utils.unparse_table(average_table, average_table_path, "\t")

    # now load up all of the cpg and other parameters to make the actual peak table

    # first check if this has already been done
    peak_table_path = os.path.join(output_folder,
                                   "{}_PEAK_TABLE.txt".format(analysis_name))
    if utils.check_output(peak_table_path, 0.1, 0.1):
        print("PEAK TABLE OUTPUT ALREADY EXISTS")
        peak_table = utils.parse_table(peak_table_path, "\t")
    else:
        peak_table = make_peak_table(
            param_dict,  # genome-specific parameter dict, assumed defined at module scope
            split_gff_path,
            average_table_path,
            start_dict,
            gene_list,
            genome_dir,
            tss_window,
            distal_window,
            tads_path,
        )
        utils.unparse_table(peak_table, peak_table_path, "\t")

    gene_table = make_gene_table(peak_table, analysis_name)

    gene_table_path = os.path.join(output_folder,
                                   "{}_GENE_TABLE.txt".format(analysis_name))
    utils.unparse_table(gene_table, gene_table_path, "\t")

    # if mouse, need to convert genes over
    if genome.count("MM") == 1:
        print("CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA")
        converted_gene_table_path = os.path.join(
            output_folder,
            "{}_GENE_TABLE_CONVERTED.txt".format(analysis_name),
        )

        converted_gene_table = [gene_table[0]]
        for line in gene_table[1:]:
            converted_name = mouse_convert_dict[line[0]]
            if converted_name:
                converted_gene_table.append([converted_name] + line[1:])

        # write once, after the full table has been built
        utils.unparse_table(converted_gene_table,
                            converted_gene_table_path, "\t")

        gene_table_path = converted_gene_table_path
        gene_table = converted_gene_table

    # =====================================================================================
    # ===================================III. PLOTTING ====================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#=="
        "====================================\n")

    # if there are fewer genes in the gene table than the top genes, only run on all
    if len(gene_table) < int(top):
        print(
            "WARNING: ONLY {} GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO "
            "RUN ANALYSIS ON TOP {}".format(str(len(gene_table) - 1),
                                            str(top)))
        top = 0

    # now call the R code
    print("CALLING R PLOTTING SCRIPTS")
    call_r_waterfall(gene_table_path, output_folder, analysis_name, top)
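
The refseq-column detection from the activity-table parsing above, run on a hypothetical table:

activity_table = [["GENE", "REFSEQ"],
                  ["MYC", "NM_002467"],
                  ["CCND1", "NM_053056"]]
ref_col = 0
for i in range(len(activity_table[1])):
    if str(activity_table[1][i]).count("NM_") or str(activity_table[1][i]).count("NR_"):
        ref_col = i
print(ref_col)  # 1 -> the second column holds the NM_/NR_ ids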
Example #14
def tf_edge_delta_out(
    crc_folder,
    bam_list,
    analysis_name,
    edge_table_path_1,
    edge_table_path_2,
    group1_list,
    group2_list,
    output="",
):
    """Calculates changes in group out degree at each predicted motif occurrence (by subpeaks)."""
    crc_folder = utils.format_folder(crc_folder, True)
    edge_path = merge_edge_tables(
        edge_table_path_1,
        edge_table_path_2,
        os.path.join(crc_folder, "{}_EDGE_TABLE.txt".format(analysis_name)),
    )

    # make a gff of the edge table
    edge_table = utils.parse_table(edge_path, "\t")
    edge_gff = []
    for line in edge_table[1:]:
        gff_line = [
            line[2],
            "{}_{}".format(line[0], line[1]),
            "",
            line[3],
            line[4],
            "",
            ".",
            "",
            "{}_{}".format(line[0], line[1]),
        ]
        edge_gff.append(gff_line)

    edge_gff_path = os.path.join(crc_folder,
                                 "{}_EDGE_TABLE.gff".format(analysis_name))
    utils.unparse_table(edge_gff, edge_gff_path, "\t")

    # direct the output to the crc folder
    signal_path = os.path.join(
        crc_folder, "{}_EDGE_TABLE_signal.txt".format(analysis_name))

    all_group_list = group1_list + group2_list
    if not utils.check_output(signal_path, 0, 0):
        signal_table_list = pipeline_utils.map_regions(
            bam_list,
            [edge_gff_path],
            crc_folder,
            crc_folder,
            all_group_list,
            True,
            signal_path,
            extend_reads_to=100,
        )
        print(signal_table_list)
    else:
        print("Found previous signal table at {}".format(signal_path))

    # now bring in the signal table as a dictionary using the locus line as the id
    print("making log2 group1 vs group2 signal table at edges")
    signal_table = utils.parse_table(signal_path, "\t")

    # figure out columns for group1 and group2
    group1_columns = [signal_table[0].index(name) for name in group1_list]
    group2_columns = [signal_table[0].index(name) for name in group2_list]
    group1_signal_vector = []
    group2_signal_vector = []
    for line in signal_table[1:]:
        group1_signal = numpy.mean(
            [float(line[col]) for col in group1_columns])
        group2_signal = numpy.mean(
            [float(line[col]) for col in group2_columns])

        group1_signal_vector.append(group1_signal)
        group2_signal_vector.append(group2_signal)

    group1_median = numpy.median(group1_signal_vector)
    group2_median = numpy.median(group2_signal_vector)

    print("group1 median signal")
    print(group1_median)
    print("group2 median signal")
    print(group2_median)

    # now that we have the median, we can take edges where at least 1 edge is above the median
    # and both are above zero and generate a new table w/ the fold change
    signal_filtered_path = signal_path.replace(".txt", "_filtered.txt")
    if utils.check_output(signal_filtered_path, 0, 0):
        print("Found filtered signal table for edges at {}".format(
            signal_filtered_path))
        signal_table_filtered = utils.parse_table(signal_filtered_path, "\t")
    else:
        signal_table_filtered = [
            signal_table[0] +
            ["GROUP1_MEAN", "GROUP2_MEAN", "GROUP1_vs_GROUP2_LOG2"]
        ]
        for line in signal_table[1:]:
            group1_signal = numpy.mean(
                [float(line[col]) for col in group1_columns])
            group2_signal = numpy.mean(
                [float(line[col]) for col in group2_columns])

            if (group1_signal > group1_median or group2_signal > group2_median
                ) and min(group1_signal, group2_signal) > 0:
                delta = numpy.log2(group1_signal / group2_signal)
                new_line = line + [group1_signal, group2_signal, delta]
                signal_table_filtered.append(new_line)

        utils.unparse_table(signal_table_filtered, signal_filtered_path, "\t")

    # now get a list of all TFs in the system
    tf_list = utils.uniquify(
        [line[0].split("_")[0] for line in signal_table_filtered[1:]])
    tf_list.sort()
    print(tf_list)

    out_degree_table = [[
        "TF_NAME",
        "EDGE_COUNT",
        "DELTA_MEAN",
        "DELTA_MEDIAN",
        "DELTA_STD",
        "DELTA_SEM",
    ]]

    for tf_name in tf_list:
        print(tf_name)
        edge_vector = [
            float(line[-1]) for line in signal_table_filtered[1:]
            if line[0].split("_")[0] == tf_name
        ]

        edge_count = len(edge_vector)
        delta_mean = round(numpy.mean(edge_vector), 4)
        delta_median = round(numpy.median(edge_vector), 4)
        delta_std = round(numpy.std(edge_vector), 4)
        delta_sem = round(stats.sem(edge_vector), 4)
        tf_out_line = [
            tf_name,
            edge_count,
            delta_mean,
            delta_median,
            delta_std,
            delta_sem,
        ]
        out_degree_table.append(tf_out_line)

    # set final output
    if not output:
        output_path = os.path.join(
            crc_folder, "{}_EDGE_DELTA_OUT.txt".format(analysis_name))
    else:
        output_path = output

    utils.unparse_table(out_degree_table, output_path, "\t")
    print(output_path)
    return output_path
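
The edge filter and log2 delta above, in isolation, with hypothetical signals and medians:

import numpy

group1_median, group2_median = 3.0, 2.0
for g1, g2 in [(4.0, 1.0), (0.5, 0.2), (2.0, 5.0)]:
    # keep edges where at least one group beats its median and both are positive
    if (g1 > group1_median or g2 > group2_median) and min(g1, g2) > 0:
        print(g1, g2, round(numpy.log2(g1 / g2), 3))
# prints 2.0 for (4.0, 1.0) and -1.322 for (2.0, 5.0); (0.5, 0.2) is filtered out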
Example #15
def main():
    """Main run call."""
    debug = False

    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help=
        ("Enter a comma separated list of .gff or .bed file of binding sites used to make "
         "enhancers"),
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="Enter a comma separated list of bams to rank by",
    )
    parser.add_argument("-o",
                        "--out",
                        dest="out",
                        required=True,
                        help="Enter an output folder")
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Provide a name for the analysis otherwise ROSE will guess",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help=
        ("Enter a comma separated list of control bams. Can either provide a single control "
         "bam for all rankby bams, or provide a control bam for each individual bam"
         ),
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=
        ("Enter a max linking distance for stitching. Default will determine optimal stitching"
         " parameter"),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mappedGFF"),
                                        True)

    # GETTING INPUT FILE(s)
    input_list = [
        input_file for input_file in args.input.split(",")
        if len(input_file) > 1
    ]

    # converting all input files into GFFs and moving into the GFF folder
    input_gf_list = []
    for input_file in input_list:
        # GETTING INPUT FILE; each file in the list is handled individually
        if input_file.split(".")[-1] == "bed":
            # CONVERTING A BED TO GFF
            input_gff_name = os.path.basename(input_file)[0:-4]
            input_gff_file = os.path.join(gff_folder,
                                          "{}.gff".format(input_gff_name))
            utils.bed_to_gff(input_file, input_gff_file)
        elif input_file.split(".")[-1] == "gff":
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )
        else:
            print(
                "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
            )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )

        input_gf_list.append(input_gff_file)

    # GETTING THE LIST OF bam_fileS TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control #or none!
    # bamlist should be all rankby bams followed by control bams

    bam_file_list = []
    if args.control:
        control_bam_list = [
            bam for bam in args.control.split(",") if len(bam) > 0
        ]
        rankby_bam_list = [
            bam for bam in args.rankby.split(",") if len(bam) > 0
        ]

        if len(control_bam_list) == len(rankby_bam_list):
            # case where an equal number of backgrounds are given
            bam_file_list = rankby_bam_list + control_bam_list
        elif len(control_bam_list) == 1:
            # case where a universal background is applied
            bam_file_list = rankby_bam_list + control_bam_list * len(
                rankby_bam_list)
        else:
            print(
                "ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM"
                " FOR EACH SAMPLE")
            sys.exit()
    else:
        bam_file_list = [bam for bam in args.rankby.split(",") if len(bam) > 0]
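    # A worked illustration of the pairing above (hypothetical file names):
    # rankby "a.bam,b.bam" with control "ctl.bam" yields
    # bam_file_list = ["a.bam", "b.bam", "ctl.bam", "ctl.bam"], i.e. all
    # rankby bams first, then one matched control per rankby bam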

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE GENOME
    genome = args.genome.upper()
    print("USING {} AS THE GENOME".format(genome))

    # GETTING THE CORRECT ANNOT FILE
    try:
        annot_file = rose2_utils.genome_dict[genome]
    except KeyError:
        print("ERROR: UNSUPPORTED GENOMES TYPE {}".format(genome))
        sys.exit()
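    # rose2_utils.genome_dict maps a genome build to its annotation table,
    # e.g. (hypothetical paths) {"HG19": "annotation/hg19_refseq.ucsc",
    # "MM9": "annotation/mm9_refseq.ucsc"}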

    # FINDING THE ANALYSIS NAME
    if args.name:
        input_name = args.name
    else:
        input_name = os.path.basename(input_gf_list[0]).split(".")[0]
    print("USING {} AS THE ANALYSIS NAME".format(input_name))

    print("FORMATTING INPUT REGIONS")
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(input_gf_list) == 1:
        input_gff = utils.parse_table(input_gf_list[0], "\t")
    else:
        input_loci = []
        for gff_file in input_gf_list:
            print("\tprocessing {}".format(gff_file))
            gff = utils.parse_table(gff_file, "\t")
            gff_collection = utils.gff_to_locus_collection(gff, 50)
            input_loci += gff_collection.get_loci()

        input_collection = utils.LocusCollection(input_loci, 50)
        # stitch to produce unique regions
        input_collection = input_collection.stitch_collection()

        input_gff = utils.locus_collection_to_gff(input_collection)

    formatted_gff = []
    # now number things appropriately
    for i, line in enumerate(input_gff):

        # use the coordinates to make a new id input_name_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        line_id = "{}_{}".format(input_name, str(i + 1))  # 1 indexing

        new_line = [
            chrom,
            line_id,
            line_id,
            min(coords),
            max(coords),
            "",
            sense,
            "",
            line_id,
        ]
        formatted_gff.append(new_line)

    # name of the master input gff file
    master_gff_file = os.path.join(
        gff_folder, "{}_{}_ALL_-0_+0.gff".format(genome, input_name))
    utils.unparse_table(formatted_gff, master_gff_file, "\t")
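    # For illustration, with input_name "MY_ANALYSIS" the first formatted row
    # would look like (hypothetical coordinates):
    # ["chr1", "MY_ANALYSIS_1", "MY_ANALYSIS_1", 10000, 15500, "", "+", "",
    #  "MY_ANALYSIS_1"]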

    print("USING {} AS THE INPUT GFF".format(master_gff_file))

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(master_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        mask_collection = utils.gff_to_locus_collection(mask_gff)

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print("FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
            len(reference_loci) - len(filtered_loci), mask_file))
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )
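    # NOTE: region_stitching returns the stitch window it actually used, so
    # when args.stitch was "" (auto mode) stitch_window now holds the
    # optimized integer value used in the file names below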

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    print("USING A STITCH WINDOW OF {} BP".format(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name,
                                          str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name,
                                            str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(
                input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(
                input_name, str(stitch_window // 1000)),
        )

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name))
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # prevent redundant mapping
    bam_file_list_unique = utils.uniquify(bam_file_list)
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name,
                                                 bam_file_name))
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
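        # utils.check_output presumably polls for the file on disk; the two
        # numeric arguments look like wait and timeout values (an assumption,
        # they are not documented here). The short call below is a cheap
        # existence check; the longer timeout after mapping gives
        # bamliquidator_batch time to finish writing matrix.txt.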
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                stitched_gff_file, mapped_out1_file))
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file,
                mapped_out1_folder,
                bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("FINDING AVERAGE SIGNAL AMONGST BAMS")
    meta_output_file = collapse_region_map(output_file1,
                                           input_name + "_MERGED_SIGNAL",
                                           control_bams=args.control)

    # now try the merging

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        meta_output_file,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print("CALLING GENE MAPPING")

    super_table_file = "{}_SuperEnhancers.table.txt".format(input_name)

    # for now don't use ranking bam to call top genes
    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, super_table_file))
    print(cmd)
    os.system(cmd)

    stretch_table_file = "{}_StretchEnhancers.table.txt".format(input_name)

    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, stretch_table_file))
    print(cmd)
    os.system(cmd)

    superstretch_table_file = "{}_SuperStretchEnhancers.table.txt".format(
        input_name)

    cmd = "ROSE2_geneMapper.py -g {} -i {} -f".format(genome, out_folder,
                                                      superstretch_table_file)
    os.system(cmd)
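
# A hypothetical invocation of this meta entry point, assuming it is
# installed as ROSE2_META (all names and paths below are placeholders):
#
#   ROSE2_META -g HG19 -i peaks_a.gff,peaks_b.gff \
#       -r sample1.bam,sample2.bam -c control.bam -o rose2_meta_out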
Example #16
0
def main():
    """Main run call."""
    debug = False
    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help="Enter a .gff or .bed file of binding sites used to make enhancers",
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-o", "--out", dest="out", required=True, help="Enter an output folder"
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-b",
        "--bams",
        dest="bams",
        required=False,
        help="Enter a comma separated list of additional bam files to map to",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=(
            "Enter a max linking distance for stitching. Default will determine optimal stitching"
            " parameter"
        ),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mapped_gff"), True)

    # GETTING INPUT FILE
    if args.input.split(".")[-1] == "bed":
        # CONVERTING A BED TO GFF
        input_gff_name = args.input.split("/")[-1][0:-4]
        input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
        utils.bed_to_gff(args.input, input_gff_file)
    elif args.input.split(".")[-1] == "gff":
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    else:
        print(
            "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
        )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    # GETTING THE LIST OF BAM FILES TO PROCESS
    if args.control:
        bam_file_list = [args.rankby, args.control]

    else:
        bam_file_list = [args.rankby]

    if args.bams:
        bam_file_list += args.bams.split(",")
        # NOTE: do not uniquify here; the same control bam may legitimately
        # appear several times when one control is shared across samples

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print("USING {} AS THE INPUT GFF".format(input_gff_file))
    input_name = os.path.basename(input_gff_file).split(".")[0]

    # GETTING THE GENOME
    genome = args.genome
    print("USING {} AS THE GENOME".format(genome))

    annot_file = rose2_utils.genome_dict[genome.upper()]

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)
    print("STARTING WITH {} INPUT REGIONS".format(len(reference_collection)))
    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        print("USING MASK FILE {}".format(mask_file))
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(str(len(mask_collection))))
        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus
            for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print(
            "FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
                str(len(reference_loci) - len(filtered_loci)), mask_file
            )
        )
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )
    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)
    # making sure start/stop ordering are correct
    for i in range(len(stitched_gff)):

        line = stitched_gff[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print("USING A STITCH WINDOW OF {} BP".format(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name, str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(
                input_name, str(stitch_window // 1000)
            ),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(
                input_name, str(stitch_window // 1000)
            ),
        )

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name)
    )
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # prevent redundant mapping
    bam_file_list_unique = utils.uniquify(bam_file_list)
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name)
        )
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print(
                "FOUND {} MAPPING DATA FOR BAM: {}".format(
                    stitched_gff_file, mapped_out1_file
                )
            )
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file, mapped_out1_folder, bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print(
                    "SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
            else:
                print(
                    "ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    if args.control:
        control_name = os.path.basename(args.control)
    else:
        control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        output_file1,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
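    # NOTE: the pause above presumably lets ROSE2_callSuper.R's output tables
    # settle on disk before the gene mapper reads them (an assumption;
    # os.system itself already blocks until Rscript exits)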
    super_table_file = "{}_SuperEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, super_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, super_table_file)
        )
    os.system(cmd)

    stretch_table_file = "{}_StretchEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, stretch_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, stretch_table_file)
        )
    os.system(cmd)

    superstretch_table_file = "{}_SuperStretchEnhancers.table.txt".format(input_name)
    if args.control:
        cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
            genome,
            args.rankby,
            args.control,
            os.path.join(out_folder, superstretch_table_file),
        )
    else:
        cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
            genome, args.rankby, os.path.join(out_folder, superstretch_table_file)
        )
    os.system(cmd)
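
# A hypothetical invocation of this single-sample entry point, assuming it
# is installed as ROSE2 (all names and paths below are placeholders):
#
#   ROSE2 -g HG19 -i peaks.bed -r sample.bam -c control.bam \
#       -o rose2_out -s 12500 -t 2500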
Example #17
0
def make_bam_plot_tables(
    gff,
    genome,
    bam_file_list,
    color_list,
    n_bins,
    sense,
    extension,
    rpm,
    out_folder,
    names,
    title,
    bed_collection,
    scale=None,
):
    """Makes a plot table for each line of the gff mapped against all the bams in the bamList."""
    # load in the gff
    if isinstance(gff, str):
        gff = utils.parse_table(gff, "\t")

    # load in the annotation
    print("loading in annotation for {}".format(genome))
    gene_dict, tx_collection = load_annot_file(genome)

    # make an MMR dict so MMRs are only computed once
    print("Getting information about read depth in bams")
    mmr_dict = {}

    if scale:
        print("Applying scaling factors")
        scale_list = [float(x) for x in scale]
    else:
        scale_list = [1] * len(bam_file_list)

    # now iterate through the bam files
    for i, bam_file in enumerate(bam_file_list):
        # millionMappedReads
        idx_cmd = "samtools idxstats {}".format(bam_file)

        idx_pipe = subprocess.Popen(
            idx_cmd,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            shell=True,
        )  # TODO: this does not produce an error if samtools is not installed
        idx_stats = idx_pipe.communicate()
        idx_stats = idx_stats[0].decode("utf-8").split("\n")
        idx_stats = [line.split("\t") for line in idx_stats]
        raw_count = sum([int(line[2]) for line in idx_stats[:-1]])
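        # samtools idxstats prints one "ref<TAB>length<TAB>mapped<TAB>unmapped"
        # row per reference plus a final "*" row for unplaced reads; splitting
        # on "\n" leaves a trailing empty string, which [:-1] drops, so
        # raw_count sums the mapped-read column over every reference (the "*"
        # row contributes 0 mapped reads)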

        # implement scaling
        read_scale_factor = scale_list[i]

        if rpm:
            mmr = round(raw_count / 1000000 / read_scale_factor, 4)
        else:
            mmr = round(1 / read_scale_factor, 4)
        mmr_dict[bam_file] = mmr
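        # Worked example: 20,000,000 mapped reads with a scale factor of 1.0
        # and rpm=True gives mmr = 20.0; bin counts are presumably divided by
        # mmr in map_bam_to_gff_line, turning raw reads into reads-per-million.
        # With rpm=False only the user-supplied scale factor is applied.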

    ticker = 1
    # go line by line in the gff
    summary_table = [
        [
            "DIAGRAM_TABLE",
            "NAME_TABLE",
            "BED_DIAGRAM_TABLE",
            "BED_NAME_TABLE",
            "PLOT_TABLE",
            "CHROM",
            "ID",
            "SENSE",
            "START",
            "END",
        ]
    ]
    for gff_line in gff:
        gff_string = "line_{}_{}_{}_{}_{}_{}".format(
            ticker, gff_line[0], gff_line[1], gff_line[6], gff_line[3], gff_line[4],
        )
        ticker += 1
        print("writing the gene diagram table for region {}".format(gff_line[1]))
        map_gff_line_to_annot(
            gff_line,
            out_folder,
            n_bins,
            gene_dict,
            tx_collection,
            sense="both",
            header=gff_string,
        )
        map_gff_line_to_bed(
            gff_line, out_folder, n_bins, bed_collection, header=gff_string,
        )
        out_table = []

        out_table.append(
            ["BAM", "GENE_ID", "NAME", "LOCUSLINE", "COLOR1", "COLOR2", "COLOR3"]
            + ["bin_" + str(n) for n in range(1, int(n_bins) + 1, 1)]
        )

        for i, bam_file in enumerate(bam_file_list):
            name = names[i]
            color = color_list[i]
            print(
                "getting data for location {} in dataset {}".format(
                    gff_line[1], bam_file
                )
            )
            mmr = mmr_dict[bam_file]
            new_line = map_bam_to_gff_line(
                bam_file, mmr, name, gff_line, color, n_bins, sense, extension,
            )
            out_table.append(new_line)

        # get the gene name
        if gff_line[1] in gene_dict:
            gene_name = gene_dict[gff_line[1]].common_name()
        else:
            gene_name = gff_line[1]
        utils.unparse_table(
            out_table,
            os.path.join(out_folder, "{}_plotTemp.txt".format(gff_string)),
            "\t",
        )
        diagram_table = os.path.join(
            out_folder, "{}_diagramTemp.txt".format(gff_string)
        )
        plot_table = os.path.join(out_folder, "{}_plotTemp.txt".format(gff_string))
        name_table = os.path.join(out_folder, "{}_nameTemp.txt".format(gff_string))
        bed_name_table = os.path.join(
            out_folder, "{}_bedNameTemp.txt".format(gff_string)
        )
        bed_diagram_table = os.path.join(
            out_folder, "{}_bedDiagramTemp.txt".format(gff_string)
        )
        summary_table.append(
            [
                diagram_table,
                name_table,
                bed_diagram_table,
                bed_name_table,
                plot_table,
                gff_line[0],
                gene_name,
                gff_line[6],
                gff_line[3],
                gff_line[4],
            ]
        )
    summary_table_file_name = os.path.join(out_folder, "{}_summary.txt".format(title))
    utils.unparse_table(summary_table, summary_table_file_name, "\t")
    return summary_table_file_name
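
# A hypothetical call, assuming bed_collection is a utils.LocusCollection of
# bed regions built elsewhere (all file names and values are placeholders):
#
#   summary_file = make_bam_plot_tables(
#       "regions.gff", "HG19", ["sample.bam"], ["0,0,0"], 200, "both", 200,
#       rpm=True, out_folder="plots/", names=["SAMPLE"], title="TEST",
#       bed_collection=bed_collection,
#   )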