Code example #1
def make_bed_collection(bed_file_list):
    """Takes in a list of bed files and makes a single huge collection.

    Each locus has as its ID the name of the bed file.

    """
    bed_loci = []
    print("MAKING BED COLLECTION FOR:")
    for bed_file in bed_file_list:

        bed_name = os.path.basename(bed_file).split(".")[0]
        print(bed_name)
        bed = utils.parse_table(bed_file, "\t")
        for line in bed:
            if len(line) >= 3:
                # check that line[0] is a chromosome name
                if line[0][0:3] == "chr":
                    try:
                        coords = [int(line[1]), int(line[2])]
                        bed_locus = utils.Locus(
                            line[0], min(coords), max(coords), ".", bed_name
                        )
                        bed_loci.append(bed_locus)
                    except ValueError:
                        pass

        print("IDENTIFIED {} BED REGIONS".format(str(len(bed_loci))))

    return utils.LocusCollection(bed_loci, 50)
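
The guard pattern above is worth noting: rows are kept only if they have at least three fields, a chrom with a "chr" prefix, and integer coordinates, so malformed or header rows are skipped instead of crashing the run. A minimal self-contained sketch of the same idea, using plain tuples in place of utils.Locus (the sample rows are hypothetical):

def parse_bed_rows(rows, region_id):
    """Keep only well-formed rows: chr-prefixed chrom plus integer coords."""
    loci = []
    for row in rows:
        if len(row) >= 3 and row[0].startswith("chr"):
            try:
                coords = [int(row[1]), int(row[2])]
                loci.append((row[0], min(coords), max(coords), region_id))
            except ValueError:  # non-numeric coordinates, e.g. a track line
                pass
    return loci

rows = [["track", "name=peaks"],     # header-like row, dropped
        ["chr1", "100", "250"],      # kept
        ["chr2", "start", "stop"]]   # non-numeric coords, dropped
print(parse_bed_rows(rows, "peaks"))  # [('chr1', 100, 250, 'peaks')]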
Code example #2
def merge_collections(name_dict, analysis_name, output="", super_only=True):
    """Merge them collections."""
    all_loci = []
    names_list = list(name_dict.keys())
    for name in names_list:
        se_collection = make_se_collection(name_dict[name]["enhancer_file"],
                                           name, super_only)
        if super_only:
            print("DATASET: {} HAS {} SUPERENHANCERS".format(
                name, str(len(se_collection))))
        else:
            print("DATASET: {} HAS {} ENHANCERS".format(
                name, str(len(se_collection))))
        all_loci += se_collection.get_loci()

    print("TOTAL LOCI: {}".format(len(all_loci)))

    merged_collection = utils.LocusCollection(all_loci, 50)

    # stitch the collection together
    stitched_collection = merged_collection.stitch_collection()
    stitched_loci = stitched_collection.get_loci()
    print("IDENTIFIED {} CONSENSUS ENHANCER REGIONS".format(
        str(len(stitched_loci))))

    # sort by size and provide a unique ID
    size_list = [locus.len() for locus in stitched_loci]
    size_order = utils.order(size_list, decreasing=True)
    ordered_loci = [stitched_loci[i] for i in size_order]
    for i, locus in enumerate(ordered_loci):
        locus.id = "merged_{}_{}".format(analysis_name, str(i + 1))

    merged_gff = []
    for locus in ordered_loci:
        new_line = [
            locus.chr,
            locus.id,
            "",
            locus.start,
            locus.end,
            "",
            locus.sense,
            "",
            locus.id,
        ]
        merged_gff.append(new_line)

    if len(output) == 0:
        return merged_gff
    else:
        print("writing merged gff to {}".format(output))
        utils.unparse_table(merged_gff, output, "\t")
        return output
Code example #3
def merge_collections(super_file1, super_file2, name1, name2, output=""):
    """Merge them collections."""
    con_super_collection = make_se_collection(super_file1, name1)
    tnf_super_collection = make_se_collection(super_file2, name2)

    # now merge them
    merged_loci = con_super_collection.get_loci(
    ) + tnf_super_collection.get_loci()
    merged_collection = utils.LocusCollection(merged_loci, 50)

    # stitch the collection together
    stitched_collection = merged_collection.stitch_collection()
    stitched_loci = stitched_collection.get_loci()

    # loci that are in both get renamed with a new unique identifier
    renamed_loci = []
    ticker = 1
    for locus in stitched_loci:
        if (len(con_super_collection.get_overlap(locus)) > 0
                and len(tnf_super_collection.get_overlap(locus)) > 0):
            new_id = "CONSERVED_{}".format(str(ticker))
            ticker += 1
            locus.id = new_id
        else:
            # strip the stitch-count prefix (e.g. '2_') that stitching prepends to IDs
            locus.id = locus.id[2:]
        renamed_loci.append(locus)

    # now we turn this into a gff and write it out
    gff = utils.locus_collection_to_gff(utils.LocusCollection(
        renamed_loci, 50))

    if len(output) == 0:
        return gff
    else:
        print("writing merged gff to {}".format(output))
        utils.unparse_table(gff, output, "\t")
        return output
Code example #4
def make_se_collection(enhancer_file, name, super_only=True):
    """Return a locus collection from a super table."""
    enhancer_table = utils.parse_table(enhancer_file, "\t")
    enhancer_loci = []
    for line in enhancer_table:
        if line[0][0] == "#" or line[0][0] == "R":
            continue
        else:
            if super_only and int(line[-1]) == 0:
                # the table ranks super-enhancers first, so stop at the first non-super row
                break
            enhancer_loci.append(
                utils.Locus(line[1], line[2], line[3], ".",
                            "{}_{}".format(name, line[0])))

    return utils.LocusCollection(enhancer_loci, 50)
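
The early break above only works because ROSE-style enhancer tables are ranked with super-enhancers (flag 1 in the last column) ahead of regular enhancers (flag 0). A toy illustration of that assumption; the column layout is inferred from the indexing in make_se_collection, and the rows are made up:

table = [
    ["#header comment"],
    ["REGION_ID", "CHROM", "START", "STOP", "IS_SUPER"],
    ["enh_1", "chr1", "100", "900", "1"],
    ["enh_2", "chr2", "400", "700", "1"],
    ["enh_3", "chr3", "150", "300", "0"],  # first non-super row: loop stops here
]

super_ids = []
for line in table:
    if line[0][0] == "#" or line[0][0] == "R":  # skip comment and header rows
        continue
    if int(line[-1]) == 0:  # valid only because supers are sorted to the top
        break
    super_ids.append(line[0])
print(super_ids)  # ['enh_1', 'enh_2']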
Code example #5
def load_annot_file(genome, tss_window, gene_list=None):
    """Load in the annotation.

    Create a start_dict and tss collection for a set of refseq IDs for a given genome.

    """
    annotation_folder = os.path.join(ROOT_DIR, "annotation")
    genome_dict = {
        "HG18": os.path.join(annotation_folder, "hg18_refseq.ucsc"),
        "MM9": os.path.join(annotation_folder, "mm9_refseq.ucsc"),
        "MM10": os.path.join(annotation_folder, "mm10_refseq.ucsc"),
        "HG19": os.path.join(annotation_folder, "hg19_refseq.ucsc"),
        "HG19_RIBO": os.path.join(annotation_folder, "hg19_refseq.ucsc"),
        "RN4": os.path.join(annotation_folder, "rn4_refseq.ucsc"),
        "RN6": os.path.join(annotation_folder, "rn6_refseq.ucsc"),
        "HG38": os.path.join(annotation_folder, "hg38_refseq.ucsc"),
    }

    mouse_convert_file = os.path.join(annotation_folder,
                                      "HMD_HumanPhenotype.rpt")

    # making a dictionary for mouse to human conversion
    mouse_convert_dict = defaultdict(str)

    mouse_convert_table = utils.parse_table(mouse_convert_file, "\t")
    for line in mouse_convert_table:
        mouse_convert_dict[line[4]] = line[0]

    annot_file = genome_dict[genome.upper()]

    # avoid a mutable default argument: an empty list means "use all genes"
    if gene_list is None:
        gene_list = []
    start_dict = utils.make_start_dict(annot_file, gene_list)
    tss_loci = []
    if not gene_list:
        gene_list = [*start_dict]
    for gene in gene_list:
        tss_loci.append(
            utils.make_tss_locus(gene, start_dict, tss_window, tss_window))

    tss_collection = utils.LocusCollection(tss_loci, 50)

    return start_dict, tss_collection, mouse_convert_dict
Code example #6
def make_se_collection(enhancer_file, name, top=0):
    """Return a locus collection from a super table.

    The top argument caps the number of rows used; 0 keeps all rows.

    """
    enhancer_table = utils.parse_table(enhancer_file, "\t")
    super_loci = []

    ticker = 0
    for line in enhancer_table:
        if line[0][0] == "#" or line[0][0] == "R":
            continue
        else:
            ticker += 1
            super_loci.append(
                utils.Locus(line[1], line[2], line[3], ".",
                            "{}_{}".format(name, line[0])))

            if ticker == top:
                break

    return utils.LocusCollection(super_loci, 50)
Code example #7
def map_collection(
    stitched_collection,
    reference_collection,
    bam_file_list,
    mapped_folder,
    output,
    ref_name,
):
    """Makes a table of factor density in a stitched locus.

    Rank table by number of loci stitched together.

    """
    print("FORMATTING TABLE")
    loci = list(stitched_collection.get_loci())

    locus_table = [[
        "REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"
    ]]

    # strip out any loci on chrY
    # (use a comprehension; calling remove() while iterating the same list skips elements)
    loci = [locus for locus in loci if locus.chr != "chrY"]

    # rank loci by length, largest first
    loci_len_list = [locus.len() for locus in loci]
    len_order = utils.order(loci_len_list, decreasing=True)
    ticker = 0
    for i in len_order:
        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        ref_enrich_size = 0
        ref_overlapping_loci = reference_collection.get_overlap(locus, "both")
        for ref_locus in ref_overlapping_loci:
            ref_enrich_size += ref_locus.len()

        try:
            stitch_count = int(locus.id.split("_")[0])
        except ValueError:
            stitch_count = 1
        coords = [int(x) for x in locus.coords()]

        locus_table.append([
            locus.id,
            locus.chr,
            min(coords),
            max(coords),
            stitch_count,
            ref_enrich_size,
        ])

    print("GETTING MAPPED DATA")
    print("USING A bam_file LIST:")
    print(bam_file_list)
    for bam_file in bam_file_list:

        bam_file_name = os.path.basename(bam_file)

        print("GETTING MAPPING DATA FOR  {}".format(bam_file))
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        mapped_gff_file = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(ref_name, bam_file_name),
            "matrix.txt")
        print("OPENING {}".format(mapped_gff_file))

        mapped_gff = utils.parse_table(mapped_gff_file, "\t")

        signal_dict = defaultdict(float)
        print("MAKING SIGNAL DICT FOR {}".format(bam_file))
        mapped_loci = []
        for line in mapped_gff[1:]:

            chrom = line[1].split("(")[0]
            start = int(line[1].split(":")[-1].split("-")[0])
            end = int(line[1].split(":")[-1].split("-")[1])
            mapped_loci.append(utils.Locus(chrom, start, end, ".", line[0]))
            try:
                # convert average density to a total by multiplying by region width
                signal_dict[line[0]] = float(line[2]) * abs(end - start)
            except ValueError:
                print("WARNING NO SIGNAL FOR LINE:")
                print(line)
                continue

        mapped_collection = utils.LocusCollection(mapped_loci, 500)
        locus_table[0].append(bam_file_name)

        for i in range(1, len(locus_table)):
            signal = 0.0
            line = locus_table[i]
            line_locus = utils.Locus(line[1], line[2], line[3], ".")
            overlapping_regions = mapped_collection.get_overlap(line_locus,
                                                                sense="both")
            for region in overlapping_regions:
                signal += signal_dict[region.id]
            locus_table[i].append(signal)

    utils.unparse_table(locus_table, output, "\t")
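
The signal bookkeeping above converts each mapped region's average read density (line[2]) into a total by multiplying by the region width, then sums those totals over every mapped region that overlaps a stitched locus. A self-contained sketch of the arithmetic on plain intervals (the densities are hypothetical):

# (start, end, average_density) triples for regions mapped by bamliquidator
mapped = [(100, 200, 0.5), (150, 400, 1.2), (900, 1000, 2.0)]

def region_signal(locus_start, locus_end, mapped_regions):
    """Sum density * width over every mapped region overlapping the locus."""
    signal = 0.0
    for start, end, density in mapped_regions:
        if start < locus_end and end > locus_start:  # simple overlap test
            signal += density * abs(end - start)
    return signal

# a 0-500 locus overlaps the first two regions: 0.5*100 + 1.2*250 = 350.0
print(region_signal(0, 500, mapped))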
Code example #8
def region_stitching(
    reference_collection,
    name,
    out_folder,
    stitch_window,
    tss_window,
    annot_file,
    remove_tss=True,
):
    """Preform region stitching."""
    print("PERFORMING REGION STITCHING")
    # first have to turn bound region file into a locus collection

    # make sure the naming is correct: each region should have a unique name
    # reference_collection

    debug_output = []
    # filter out all bound regions that overlap the TSS of an ACTIVE GENE
    if remove_tss:

        print("REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF {}BP".
              format(str(tss_window)))
        # first make a locus collection of TSS

        start_dict = utils.make_start_dict(annot_file)

        # now makeTSS loci for active genes
        remove_ticker = 0
        # this loop makes a locus centered around +/- tss_window of transcribed genes
        # then adds it to the list tss_loci
        tss_loci = []
        for gene_id in list(start_dict.keys()):
            tss_loci.append(
                utils.make_tss_locus(gene_id, start_dict, tss_window,
                                     tss_window))

        # this turns the tss_loci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tss_collection = utils.LocusCollection(tss_loci, 50)

        # gives all the loci in reference_collection
        bound_loci = list(reference_collection.get_loci())

        # this loop will check if each bound region is contained by the TSS exclusion zone
        # this will drop out a lot of the promoter only regions that are tiny
        # typical exclusion window is around 2kb
        for locus in bound_loci:
            if len(tss_collection.get_containers(locus, "both")) > 0:

                # if true, the bound locus overlaps an active gene
                reference_collection.remove(locus)
                debug_output.append([str(locus), locus.id, "CONTAINED"])
                remove_ticker += 1
        print("REMOVED {} LOCI BECAUSE THEY WERE CONTAINED BY A TSS".format(
            str(remove_ticker)))

    # reference_collection is now all enriched region loci that don't overlap an active TSS

    if stitch_window == "":
        print("DETERMINING OPTIMUM STITCHING PARAMTER")
        opt_collection = copy.deepcopy(reference_collection)
        stitch_window = optimize_stitching(opt_collection,
                                           name,
                                           out_folder,
                                           step_size=500)
    print("USING A STITCHING PARAMETER OF {}".format(stitch_window))
    stitched_collection = reference_collection.stitch_collection(
        stitch_window, "both")

    if remove_tss:
        # now replace any stitched region that overlaps more than two distinct
        # genes with the original loci that were there
        fixed_loci = []
        tss_loci = []
        for gene_id in list(start_dict.keys()):
            tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 50, 50))

        # this turns the tss_loci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tss_collection = utils.LocusCollection(tss_loci, 50)
        remove_ticker = 0
        original_ticker = 0
        for stitched_locus in stitched_collection.get_loci():
            overlapping_tss_loci = tss_collection.get_overlap(
                stitched_locus, "both")
            tss_names = [
                start_dict[tss_locus.id]["name"]
                for tss_locus in overlapping_tss_loci
            ]
            tss_names = utils.uniquify(tss_names)
            if len(tss_names) > 2:

                # stitched_collection.remove(stitched_locus)
                original_loci = reference_collection.get_overlap(
                    stitched_locus, "both")
                original_ticker += len(original_loci)
                fixed_loci += original_loci
                debug_output.append(
                    [str(stitched_locus), stitched_locus.id, "MULTIPLE_TSS"])
                remove_ticker += 1
            else:
                fixed_loci.append(stitched_locus)

        print("REMOVED {} STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs".
              format(str(remove_ticker)))
        print("ADDED BACK {} ORIGINAL LOCI".format(str(original_ticker)))
        fixed_collection = utils.LocusCollection(fixed_loci, 50)
        return fixed_collection, debug_output, stitch_window
    else:
        return stitched_collection, debug_output, stitch_window
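
stitch_collection is used here as a black box; from the way stitch_window is applied, it merges loci that lie within stitch_window bp of one another. A toy equivalent on plain (start, end) tuples, for intuition only (this is not the library's actual implementation):

def stitch(intervals, stitch_window):
    """Merge intervals whose gap is at most stitch_window bp."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start - merged[-1][1] <= stitch_window:
            merged[-1][1] = max(merged[-1][1], end)  # extend the last interval
        else:
            merged.append([start, end])
    return [tuple(iv) for iv in merged]

peaks = [(100, 200), (250, 300), (5000, 5200)]
print(stitch(peaks, 500))  # [(100, 300), (5000, 5200)]
print(stitch(peaks, 0))    # [(100, 200), (250, 300), (5000, 5200)]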
Code example #9
def main():
    """Main run call."""
    debug = False

    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help=
        ("Enter a comma separated list of .gff or .bed file of binding sites used to make "
         "enhancers"),
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="Enter a comma separated list of bams to rank by",
    )
    parser.add_argument("-o",
                        "--out",
                        dest="out",
                        required=True,
                        help="Enter an output folder")
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Provide a name for the analysis otherwise ROSE will guess",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help=
        ("Enter a comma separated list of control bams. Can either provide a single control "
         "bam for all rankby bams, or provide a control bam for each individual bam"
         ),
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=
        ("Enter a max linking distance for stitching. Default will determine optimal stitching"
         " parameter"),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mappedGFF"),
                                        True)

    # GETTING INPUT FILE(s)
    input_list = [
        input_file for input_file in args.input.split(",")
        if len(input_file) > 1
    ]

    # converting all input files into GFFs and moving into the GFF folder
    input_gf_list = []
    for input_file in input_list:
        # GETTING INPUT FILE (use the loop variable, not args.input,
        # so every file in the comma-separated list is processed)
        if input_file.split(".")[-1] == "bed":
            # CONVERTING A BED TO GFF
            input_gff_name = os.path.basename(input_file)[0:-4]
            input_gff_file = os.path.join(gff_folder,
                                          "{}.gff".format(input_gff_name))
            utils.bed_to_gff(input_file, input_gff_file)
        elif input_file.split(".")[-1] == "gff":
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )
        else:
            print(
                "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
            )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )

        input_gf_list.append(input_gff_file)

    # GETTING THE LIST OF bam_fileS TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control #or none!
    # bamlist should be all rankby bams followed by control bams

    bam_file_list = []
    if args.control:
        control_bam_list = [
            bam for bam in args.control.split(",") if len(bam) > 0
        ]
        rankby_bam_list = [
            bam for bam in args.rankby.split(",") if len(bam) > 0
        ]

        if len(control_bam_list) == len(rankby_bam_list):
            # case where an equal number of backgrounds are given
            bam_file_list = rankby_bam_list + control_bam_list
        elif len(control_bam_list) == 1:
            # case where a universal background is applied
            bam_file_list = rankby_bam_list + control_bam_list * len(
                rankby_bam_list)
        else:
            print(
                "ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM"
                " FOR EACH SAMPLE")
            sys.exit()
    else:
        bam_file_list = [bam for bam in args.rankby.split(",") if len(bam) > 0]

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE GENOME
    genome = args.genome.upper()
    print("USING {} AS THE GENOME".format(genome))

    # GETTING THE CORRECT ANNOT FILE
    try:
        annot_file = rose2_utils.genome_dict[genome]
    except KeyError:
        print("ERROR: UNSUPPORTED GENOMES TYPE {}".format(genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if args.name:
        input_name = args.name
    else:
        input_name = os.path.basename(input_gf_list[0]).split(".")[0]
    print("USING {} AS THE ANALYSIS NAME".format(input_name))

    print("FORMATTING INPUT REGIONS")
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(input_gf_list) == 1:
        input_gff = utils.parse_table(input_gf_list[0], "\t")
    else:
        input_loci = []
        for gff_file in input_gf_list:
            print("\tprocessing {}".format(gff_file))
            gff = utils.parse_table(gff_file, "\t")
            gff_collection = utils.gff_to_locus_collection(gff, 50)
            input_loci += gff_collection.get_loci()

        input_collection = utils.LocusCollection(input_loci, 50)
        input_collection = (input_collection.stitch_collection()
                            )  # stitches to produce unique regions

        input_gff = utils.locus_collection_to_gff(input_collection)

    formatted_gff = []
    # now number things appropriately
    for i, line in enumerate(input_gff):

        # use the coordinates to make a new id input_name_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        line_id = "{}_{}".format(input_name, str(i + 1))  # 1 indexing

        new_line = [
            chrom,
            line_id,
            line_id,
            min(coords),
            max(coords),
            "",
            sense,
            "",
            line_id,
        ]
        formatted_gff.append(new_line)

    # name of the master input gff file
    master_gff_file = os.path.join(
        gff_folder, "{}_{}_ALL_-0_+0.gff".format(genome, input_name))
    utils.unparse_table(formatted_gff, master_gff_file, "\t")

    print("USING {} AS THE INPUT GFF".format(master_gff_file))

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(master_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        mask_collection = utils.gff_to_locus_collection(mask_gff)

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print("FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
            len(reference_loci) - len(filtered_loci), mask_file))
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name,
                                          str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name,
                                            str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(
                input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000))
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(
                input_name, str(stitch_window // 1000)),
        )

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name))
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    bam_file_list_unique = list(bam_file_list)
    bam_file_list_unique = utils.uniquify(bam_file_list_unique)
    # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name,
                                                 bam_file_name))
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                stitched_gff_file, mapped_out1_file))
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file,
                mapped_out1_folder,
                bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("FINDING AVERAGE SIGNAL AMONGST BAMS")
    meta_output_file = collapse_region_map(output_file1,
                                           input_name + "_MERGED_SIGNAL",
                                           control_bams=args.control)

    # now try the merging

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        meta_output_file,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print("CALLING GENE MAPPING")

    super_table_file = "{}_SuperEnhancers.table.txt".format(input_name)

    # for now don't use ranking bam to call top genes
    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, super_table_file))
    print(cmd)
    os.system(cmd)

    stretch_table_file = "{}_StretchEnhancers.table.txt".format(input_name)

    cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
        genome, os.path.join(out_folder, stretch_table_file))
    print(cmd)
    os.system(cmd)

    superstretch_table_file = "{}_SuperStretchEnhancers.table.txt".format(
        input_name)

    cmd = "ROSE2_geneMapper.py -g {} -i {} -f".format(genome, out_folder,
                                                      superstretch_table_file)
    os.system(cmd)
Code example #10
def map_gff_line_to_annot(
    gff_line, out_folder, n_bins, gene_dict, tx_collection, sense="both", header=""
):
    """For every line produces a file with all of the rectangles to draw."""
    if not header:
        gff_string = "{}_{}_{}_{}".format(
            gff_line[0], gff_line[6], gff_line[3], gff_line[4]
        )
    else:
        gff_string = header
    diagram_table = [[0, 0, 0, 0]]
    name_table = [["", 0, 0]]
    gff_locus = utils.Locus(
        gff_line[0], int(gff_line[3]), int(gff_line[4]), gff_line[6], gff_line[1],
    )
    scale_factor = n_bins / gff_locus.len()
    # plotting buffer for diagrams
    plot_buffer = int(gff_locus.len() / n_bins * 20)

    overlap_loci = tx_collection.get_overlap(gff_locus, sense="both")
    gene_list = [locus.id for locus in overlap_loci]

    if gff_line[6] == "-":
        ref_point = int(gff_line[4])
    else:
        ref_point = int(gff_line[3])
    offset_collection = utils.LocusCollection([], 500)
    for gene_id in gene_list:

        gene = gene_dict[gene_id]

        print(gene.common_name())
        if len(gene.common_name()) > 1:
            name = gene.common_name()
        else:
            name = gene_id
        offset = 4 * len(offset_collection.get_overlap(gene.tx_locus()))
        offset_collection.append(
            utils.make_search_locus(gene.tx_locus(), plot_buffer, plot_buffer,)
        )
        # write the name of the gene down
        if gene.sense() == "+":
            gene_start = gene.tx_locus().start
        else:
            gene_start = gene.tx_locus().end
        gene_start = abs(gene_start - ref_point) * scale_factor
        name_table.append([name, gene_start, -2 - offset])

        # draw a line across the entire txLocus
        [start, stop] = [
            abs(x - ref_point) * scale_factor for x in gene.tx_locus().coords()
        ]
        diagram_table.append([start, -0.01 - offset, stop, 0.01 - offset])

        # now draw thin boxes for all tx_exons
        if gene.tx_exons():
            for tx_exon in gene.tx_exons():

                [start, stop] = [
                    abs(x - ref_point) * scale_factor for x in tx_exon.coords()
                ]

                diagram_table.append([start, -0.5 - offset, stop, 0.5 - offset])

        # now draw fatty boxes for the coding exons if any
        if gene.cd_exons():
            for cd_exon in gene.cd_exons():

                [start, stop] = [
                    abs(x - ref_point) * scale_factor for x in cd_exon.coords()
                ]

                diagram_table.append([start, -1 - offset, stop, 1 - offset])

    utils.unparse_table(
        diagram_table,
        os.path.join(out_folder, "{}_diagramTemp.txt".format(gff_string)),
        "\t",
    )
    utils.unparse_table(
        name_table,
        os.path.join(out_folder, "{}_nameTemp.txt".format(gff_string)),
        "\t",
    )
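
All drawing coordinates above come from one transform: the distance from the reference point (the TSS-side end of the gff region) scaled by n_bins / region_length. A minimal sketch of that mapping with hypothetical numbers:

n_bins = 200
region_start, region_end = 1000000, 1040000  # a 40 kb plotting window
scale_factor = n_bins / (region_end - region_start)  # bins per bp

ref_point = region_start  # '+' strand regions anchor at the start, as above

def to_plot_x(genomic_x):
    """Map a genomic coordinate into bin units within the plotting window."""
    return abs(genomic_x - ref_point) * scale_factor

# an exon spanning 1,010,000-1,012,000 lands at bins 50.0-60.0
print(to_plot_x(1010000), to_plot_x(1012000))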
Code example #11
def main():
    """Main run function."""
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a comma/space separated list of .bam files to be processed.",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or genomic region e.g. chr1:+:1-1000.",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported",
        required=True,
    )

    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options
    parser.add_argument(
        "--stretch-input",
        dest="stretch_input",
        default=None,
        type=int,
        help=(
            "Stretch the input regions to a minimum length in bp, e.g. 10000 (for"
            " 10kb)"
        ),
    )
    parser.add_argument(
        "-c",
        "--color",
        dest="color",
        default=None,
        nargs="*",
        help=(
            "Enter a colon or space separated list of colors e.g. "
            "255,0,0:255,125,0, default samples the rainbow"
        ),
    )
    parser.add_argument(
        "-s",
        "--sense",
        dest="sense",
        default="both",
        help="Map to '+','-' or 'both' strands. Default maps to both.",
    )
    parser.add_argument(
        "-e",
        "--extension",
        dest="extension",
        default=200,
        help="Extends reads by n bp. Default value is 200bp",
    )
    parser.add_argument(
        "-r",
        "--rpm",
        dest="rpm",
        action="store_true",
        default=False,
        help="Normalizes density to reads per million (rpm) Default is False",
    )
    parser.add_argument(
        "-y",
        "--yScale",
        dest="y_scale",
        default="relative",
        help=(
            "Choose either relative or uniform y axis scaling. options = "
            "'relative,uniform' Default is relative scaling"
        ),
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        default=None,
        nargs="*",
        help="Enter a comma or space separated list of names for your bams",
    )
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        default="MULTIPLE",
        help=(
            "Choose either all lines on a single plot or multiple plots. options "
            "= 'SINGLE,MULTIPLE,MERGE'"
        ),
    )
    parser.add_argument(
        "-t",
        "--title",
        dest="title",
        default="",
        help=(
            "Specify a title for the output plot(s), default will be the "
            "coordinate region"
        ),
    )
    parser.add_argument(
        "-q",
        "--skip-cache",
        dest="skip_cache",
        action="store_true",
        default=False,
        help="Toggles option to skip loading annotation cache file",
    )

    parser.add_argument(
        "--scale",
        dest="scale",
        default=None,
        nargs="*",
        help=(
            "Enter a comma or space separated list of scaling factors for your "
            "bams. Default is none"
        ),
    )
    parser.add_argument(
        "--bed",
        dest="bed",
        nargs="*",
        help="Add a comma-delimited or space-delimited list of bed files to plot",
    )
    parser.add_argument(
        "--multi-page",
        dest="multi",
        action="store_true",
        default=False,
        help="If flagged will create a new pdf for each region",
    )

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument(
        "--save-temp",
        dest="save",
        action="store_true",
        default=False,
        help="If flagged will save temporary files made by bamPlot",
    )

    args = parser.parse_args()

    print(args)

    if args.bam and args.input and args.genome and args.output:

        # Support a legacy mode where a single ','-delimited string lists multiple files
        bam_file_list = args.bam
        if len(bam_file_list) == 1:
            bam_file_list = bam_file_list[0].split(",")

        # Make sure these are actually files & readable (!)
        for filename in bam_file_list:
            assert os.access(filename, os.R_OK)

        # bringing in any beds
        if args.bed:
            bed_file_list = args.bed
            if len(bed_file_list) == 1:
                bed_file_list = bed_file_list[0].split(",")
            print(bed_file_list)
            bed_collection = make_bed_collection(bed_file_list)
        else:
            bed_collection = utils.LocusCollection([], 50)

        # Load the input for graphing. One of:
        # - A .gff
        # - A .bed
        # - a specific input region (e.g. chr10:.:93150000-93180000)

        valid_sense_options = {"+", "-", "."}
        if os.access(args.input, os.R_OK):
            if args.input.endswith(".bed"):
                # Uniquely graph every input of this bed
                parsed_input_bed = utils.parse_table(args.input, "\t")
                gff_name = os.path.basename(args.input)  # Graph title
                gff = None
                try:
                    if parsed_input_bed[0][5] in valid_sense_options:
                        # This .bed might have a sense parameter
                        gff = [
                            [e[0], "", args.input, e[1], e[2], "", e[5], "", ""]
                            for e in parsed_input_bed
                        ]
                except IndexError:
                    pass

                if gff is None:
                    print(
                        "Your bed doesn't have a valid sense parameter. Defaulting to both "
                        "strands, '.'"
                    )
                    # We only take chr/start/stop and ignore everything else.
                    gff = [
                        [e[0], "", args.input, e[1], e[2], "", ".", "", ""]
                        for e in parsed_input_bed
                    ]
            else:
                # Default to .gff, since that's the original behavior
                gff = utils.parse_table(args.input, "\t")
                gff_name = os.path.basename(args.input).split(".")[0]
        else:
            # means a coordinate line has been given e.g. chr1:+:1-100
            chrom_line = args.input.split(":")
            try:
                chrom = chrom_line[0]
                sense = chrom_line[1]
            except IndexError:
                print("Invalid input line or inaccessible file. Try: chr1:.:1-5000")
                exit()
            assert sense in valid_sense_options
            [start, end] = chrom_line[2].split("-")
            if chrom[0:3] != "chr":
                print("ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT")
                exit()
            gff_line = [chrom, "", args.input, start, end, "", sense, "", ""]
            gff_name = "{}_{}_{}_{}".format(chrom, sense, start, end)
            gff = [gff_line]

        # Consider stretching the regions to a fixed minimum size
        if args.stretch_input:
            print(
                "Stretching inputs to a minimum of: {} bp".format(
                    str(args.stretch_input)
                )
            )
            min_length = args.stretch_input
            stretch_gff = []
            for e in gff:
                difference = int(e[4]) - int(e[3])
                if difference < min_length:
                    pad = int((min_length - difference) / 2)
                    stretch_gff.append(
                        [
                            e[0],
                            e[1],
                            e[2],
                            int(e[3]) - pad,
                            int(e[4]) + pad,
                            e[5],
                            e[6],
                            e[7],
                            e[8],
                        ]
                    )
                else:
                    stretch_gff.append(e)

            gff = stretch_gff

        # Sanity test the gff object
        assert all([e[6] in valid_sense_options for e in gff])  # All strands are sane

        # bring in the genome
        genome = args.genome.upper()
        if not ["HG18", "HG19", "HG19_RIBO", "HG38", "MM9", "MM10", "RN4", "RN6"].count(
            genome
        ):
            print(
                "ERROR: UNSUPPORTED GENOME TYPE {}. USE HG19,HG18, RN4, MM9, or MM10".format(
                    genome,
                )
            )
            parser.print_help()
            exit()

        # bring in the rest of the options

        # output
        root_folder = args.output
        try:
            os.listdir(root_folder)
        except OSError:
            print("ERROR: UNABLE TO FIND OUTPUT DIRECTORY {}".format(root_folder))
            exit()

        # Get analysis title
        if not args.title:
            title = gff_name
        else:
            title = args.title

        # make a temp folder
        temp_folder = os.path.join(root_folder, title)
        print("CREATING TEMP FOLDER {}".format(temp_folder))
        utils.format_folder(temp_folder, create=True)

        # colors
        if args.color:
            color_list = args.color
            if len(color_list) == 1:
                color_list = color_list[0].split(":")
            color_list = [x.split(",") for x in color_list]
            if len(color_list) < len(bam_file_list):
                print(
                    "WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED"
                )
                # recycling the color list
                color_list += color_list * (len(bam_file_list) // len(color_list))
                color_list = color_list[: len(bam_file_list)]

        else:
            # cycles through the colors of the rainbow
            color_list = taste_the_rainbow(len(bam_file_list))

        # sense
        sense = args.sense

        extension = int(args.extension)

        rpm = args.rpm

        scale = args.scale
        if scale:
            if len(scale) == 1:
                scale = scale[0].split(",")

        y_scale = args.y_scale.upper()

        # names
        if args.names:
            names = args.names
            if len(names) == 1:
                names = names[0].split(",")

            if len(names) != len(bam_file_list):
                print("ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND")
                parser.print_help()
                exit()
        else:
            names = [os.path.basename(x) for x in bam_file_list]

        # plot style
        plot_style = args.plot.upper()
        if not ["SINGLE", "MULTIPLE", "MERGE"].count(plot_style):
            print("ERROR: PLOT STYLE {} NOT AN OPTION".format(plot_style))
            parser.print_help()
            exit()

        # now run!
        summary_table_file_name = make_bam_plot_tables(
            gff,
            genome,
            bam_file_list,
            color_list,
            n_bins,
            sense,
            extension,
            rpm,
            temp_folder,
            names,
            title,
            bed_collection,
            scale,
        )
        print("{} is the summary table".format(summary_table_file_name))

        # running the R command to plot
        multi = args.multi
        out_file = os.path.join(root_folder, "{}_plots.pdf".format(title))
        r_cmd = call_r_plot(
            summary_table_file_name, out_file, y_scale, plot_style, multi
        )

        # open a bash file
        bash_file_name = os.path.join(temp_folder, "{}_Rcmd.sh".format(title))
        with open(bash_file_name, "w") as bash_file:
            bash_file.write("#!/usr/bin/bash\n")
            bash_file.write(r_cmd)
        print("Wrote R command to {}".format(bash_file_name))
        os.system("bash {}".format(bash_file_name))

        # delete temp files
        if not args.save:
            if utils.check_output(out_file, 1, 10):
                # This is super dangerous (!). Add some sanity checks.
                assert " " not in temp_folder
                assert temp_folder != "/"
                print("Removing temp folder: {}".format(temp_folder))
                shutil.rmtree(temp_folder)
            else:
                print("ERROR: NO OUTPUT FILE {} DETECTED".format(out_file))

    else:
        parser.print_help()
        sys.exit()
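
When the -i argument is not a readable file, the code above falls back to parsing it as a chrom:sense:start-end string. A standalone sketch of that parsing with the same validation steps (the "chr" prefix check and the sense whitelist):

def parse_region(region):
    """Parse 'chr1:+:1-5000' into (chrom, sense, start, end); None if invalid."""
    valid_sense_options = {"+", "-", "."}
    parts = region.split(":")
    if len(parts) != 3:
        return None
    chrom, sense, span = parts
    if not chrom.startswith("chr") or sense not in valid_sense_options:
        return None
    try:
        start, end = (int(x) for x in span.split("-"))
    except ValueError:
        return None
    return chrom, sense, start, end

print(parse_region("chr10:.:93150000-93180000"))  # ('chr10', '.', 93150000, 93180000)
print(parse_region("10:+:1-100"))                 # None: missing 'chr' prefix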
Code example #12
def main():
    """Main run call."""
    debug = False
    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help="Enter a .gff or .bed file of binding sites used to make enhancers",
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-o", "--out", dest="out", required=True, help="Enter an output folder"
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-b",
        "--bams",
        dest="bams",
        required=False,
        help="Enter a comma separated list of additional bam files to map to",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=(
            "Enter a max linking distance for stitching. Default will determine optimal stitching"
            " parameter"
        ),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mapped_gff"), True)

    # GETTING INPUT FILE
    if args.input.split(".")[-1] == "bed":
        # CONVERTING A BED TO GFF
        input_gff_name = args.input.split("/")[-1][0:-4]
        input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
        utils.bed_to_gff(args.input, input_gff_file)
    elif args.input.split(".")[-1] == "gff":
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    else:
        print(
            "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
        )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    # GETTING THE LIST OF bam_fileS TO PROCESS
    if args.control:
        bam_file_list = [args.rankby, args.control]

    else:
        bam_file_list = [args.rankby]

    if args.bams:
        bam_file_list += args.bams.split(",")
        # do not uniquify here: the same control bam may legitimately appear
        # multiple times in the list
    # optional args

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    if tss_window != 0:
        remove_tss = True
    else:
        remove_tss = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print("USING {} AS THE INPUT GFF".format(input_gff_file))
    input_name = os.path.basename(input_gff_file).split(".")[0]

    # GETTING THE GENOME
    genome = args.genome
    print("USING {} AS THE GENOME".format(genome))

    annot_file = rose2_utils.genome_dict[genome.upper()]

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)
    print("STARTING WITH {} INPUT REGIONS".format(len(reference_collection)))
    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        print("USING MASK FILE {}".format(mask_file))
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(str(len(mask_collection))))
        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus
            for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print(
            "FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
                str(len(reference_loci) - len(filtered_loci)), mask_file
            )
        )
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )
    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)
    # making sure start/stop ordering are correct
    for line in stitched_gff:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.gff".format(input_name, str(stitch_window // 1000)),
        )
        stitched_gff_name = "{}_{}KB_STITCHED".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED.debug".format(input_name, str(stitch_window // 1000)),
        )
    else:
        stitched_gff_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.gff".format(
                input_name, str(stitch_window // 1000)
            ),
        )
        stitched_gff_name = "{}_{}KB_STITCHED_TSS_DISTAL".format(
            input_name, str(stitch_window // 1000)
        )
        debug_out_file = os.path.join(
            gff_folder,
            "{}_{}KB_STITCHED_TSS_DISTAL.debug".format(
                input_name, str(stitch_window // 1000)
            ),
        )

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name)
    )
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    bam_file_list_unique = list(bam_file_list)
    bam_file_list_unique = utils.uniquify(bam_file_list_unique)
    # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name)
        )
        mapped_out1_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name),
            "matrix.txt",
        )
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print(
                "FOUND {} MAPPING DATA FOR BAM: {}".format(
                    stitched_gff_file, mapped_out1_file
                )
            )
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file, mapped_out1_folder, bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print(
                    "SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
            else:
                print(
                    "ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                        stitched_gff_file, bam_file_name
                    )
                )
                sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    if args.control:
        control_name = os.path.basename(args.control)
    else:
        control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        output_file1,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # run the gene mapper on each enhancer table produced by the R script
    # (brief pause to make sure the R script output has been flushed to disk)
    time.sleep(20)
    for table_suffix in [
        "SuperEnhancers",
        "StretchEnhancers",
        "SuperStretchEnhancers",
    ]:
        table_file = "{}_{}.table.txt".format(input_name, table_suffix)
        if args.control:
            cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
                genome,
                args.rankby,
                args.control,
                os.path.join(out_folder, table_file),
            )
        else:
            cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
                genome, args.rankby, os.path.join(out_folder, table_file)
            )
        os.system(cmd)
Code example #13
def split_regions(input_gff, tss_collection, mask_file=None):
    """Split regions if even a single coordinate is shared with the +/-1kb."""
    # create mask regions collection
    if mask_file:
        print("USING MASK FILE {}".format(mask_file))
        # if it's a bed file
        if mask_file.split(".")[-1].upper() == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split(".")[-1].upper() == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff OR .bed FILE")
            sys.exit()

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(len(mask_collection)))

    split_gff = []
    for line in input_gff:
        chrom = line[0]
        region_id = line[1]
        line_locus = utils.Locus(line[0], line[3], line[4], ".")

        # mask regions
        if mask_file:
            if mask_collection.get_overlap(line_locus, "both"):
                continue

        overlapping_loci = tss_collection.get_overlap(line_locus)
        if overlapping_loci:  # case where a tss overlap
            # identify the parts of the line locus that are contained
            local_tss_collection = utils.LocusCollection(overlapping_loci, 50)
            overlapping_coords = line_locus.coords()
            for tss_locus in overlapping_loci:
                overlapping_coords += tss_locus.coords()

            overlapping_coords = utils.uniquify(overlapping_coords)
            overlapping_coords.sort()

            # add 1 to the last coordinate so the loop below emits the final sub-region
            overlapping_coords[-1] += 1

            i = 0
            region_ticker = 1
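            # walk consecutive coordinate pairs, emitting one sub-region per interval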
            while i < (len(overlapping_coords) - 1):
                start = int(overlapping_coords[i])
                stop = int(overlapping_coords[(i + 1)]) - 1
                if (stop - start) < 50:  # this eliminates really tiny regions
                    i += 1
                    continue
                split_locus = utils.Locus(chrom, start + 1, stop, ".")

                if line_locus.overlaps(split_locus):
                    new_id = "{}_{}".format(region_id, region_ticker)
                    tss_status = 0
                    if local_tss_collection.get_overlap(split_locus):
                        tss_status = 1
                    split_gff_line = [
                        chrom,
                        new_id,
                        new_id,
                        start,
                        stop,
                        "",
                        ".",
                        tss_status,
                        new_id,
                    ]

                    split_gff.append(split_gff_line)
                    region_ticker += 1
                i += 1
        else:
            line[7] = 0
            split_gff.append(line)

    return split_gff
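
A minimal driver sketch for split_regions follows; it is an illustration, not part of the original example. The file paths, the annotation table, and the 1 kb TSS window are assumptions, and utils refers to the same helper module used throughout these examples.

# hypothetical usage of split_regions; all paths below are placeholders
input_gff = utils.parse_table("peaks.gff", "\t")
start_dict = utils.make_start_dict("annotation/hg19_refseq.ucsc")  # assumed helper/annotation
tss_loci = [
    utils.make_tss_locus(ref_id, start_dict, 1000, 1000) for ref_id in start_dict
]
tss_collection = utils.LocusCollection(tss_loci, 50)

split_gff = split_regions(input_gff, tss_collection, mask_file="blacklist.bed")
utils.unparse_table(split_gff, "peaks_split.gff", "\t")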
Code example #14
def make_peak_table(
    param_dict,
    split_gff_path,
    average_table_path,
    start_dict,
    gene_list,
    genome_directory,
    tss_window,
    distal_window,
    tads_path="",
):
    """Makes the final peak table with ebox info."""
    peak_table = [[
        "REGION_ID",
        "CHROM",
        "START",
        "STOP",
        "LENGTH",
        "TSS",
        "CPG",
        "CPG_FRACTION",
        "GC_FREQ",
        "SIGNAL",
        "CANON_EBOX_COUNT",
        "NON_CANON_EBOX_COUNT",
        "TOTAL_EBOX_COUNT",
        "OVERLAPPING_GENES",
        "PROXIMAL_GENES",
    ]]

    print("LOADING PEAK REGIONS")
    peak_gff = utils.parse_table(split_gff_path, "\t")

    print("LOADING BINDING DATA")
    signal_table = utils.parse_table(average_table_path, "\t")

    print("LOADING CPGS ISLANDS")
    cpg_bed = utils.parse_table(param_dict["cpg_path"], "\t")
    cpg_loci = []
    for line in cpg_bed:
        cpg_loci.append(utils.Locus(line[0], line[1], line[2], ".", line[-1]))
    cpg_collection = utils.LocusCollection(cpg_loci, 50)

    print("MAKING TSS COLLECTIONS")
    if not gene_list:
        gene_list = [*start_dict]

    tss_prox_loci = []
    tss_distal_loci = []
    for ref_id in gene_list:
        tss_prox_loci.append(
            utils.make_tss_locus(ref_id, start_dict, tss_window, tss_window))
        tss_distal_loci.append(
            utils.make_tss_locus(
                ref_id,
                start_dict,
                distal_window,
                distal_window,
            ))

    # make proximal (+/- tss_window) and distal (+/- distal_window) flanking collections
    tss_prox_collection = utils.LocusCollection(tss_prox_loci, 50)
    tss_distal_collection = utils.LocusCollection(tss_distal_loci, 50)

    if tads_path:
        print("LOADING TADS FROM {}".format(tads_path))
        tad_collection = utils.import_bound_region(tads_path, "tad")
        use_tads = True

        # build a tad dict keyed by TAD ID, listing genes whose TSS falls in that TAD
        tad_dict = defaultdict(list)
        for tss_locus in tss_prox_loci:
            overlapping_tads = tad_collection.get_overlap(tss_locus, "both")
            for tad_locus in overlapping_tads:
                tad_dict[tad_locus.id].append(tss_locus.id)
    else:
        use_tads = False

    print("CLASSIFYING PEAKS")
    ticker = 0

    no_tad_count = 0
    for i in range(len(peak_gff)):
        if not ticker % 1000:
            print(ticker)
        ticker += 1

        # getting the particulars of the region
        gff_line = peak_gff[i]
        peak_id = gff_line[1]
        chrom = gff_line[0]
        start = int(gff_line[3])
        stop = int(gff_line[4])
        line_locus = utils.Locus(chrom, start, stop, ".", peak_id)

        # getting the mapped signal
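        # signal_table row i + 1 lines up with peak_gff[i] (row 0 is the header)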
        signal_line = signal_table[(i + 1)]
        signal_vector = [float(x) for x in signal_line[2:]]

        # setting up the new line
        new_line = [peak_id, chrom, start, stop, line_locus.len()]

        # get the tss status from the gff itself
        # (we are able to do this nicely from the split gff code earlier)
        new_line.append(gff_line[7])

        # check cpg status
        if cpg_collection.get_overlap(line_locus, "both"):
            new_line.append(1)
        else:
            new_line.append(0)

        # now compute the fractional cpg overlap
        overlapping_cpg_loci = cpg_collection.get_overlap(line_locus, "both")
        overlapping_bases = 0
        for locus in overlapping_cpg_loci:
            cpg_start = max(locus.start, line_locus.start)
            cpg_end = min(locus.end, line_locus.end)
            overlapping_bases += cpg_end - cpg_start
        overlap_fraction = float(overlapping_bases) / line_locus.len()

        new_line.append(round(overlap_fraction, 2))

        # now get the seq
        line_seq = utils.fetch_seq(genome_directory, chrom, start, stop,
                                   True).upper()
        if not line_seq:
            print("ERROR: COULD NOT FETCH SEQUENCE FOR REGION:")
            print(gff_line)
            print("INDEX: {} COORDS: {}:{}-{}".format(i, chrom, start, stop))
            sys.exit()

        gc_freq = float(line_seq.count("GC") +
                        line_seq.count("CG")) / len(line_seq)
        new_line.append(gc_freq)

        # this is where we add the ChIP-seq signal
        new_line += signal_vector

        # count E-box motifs (CANNTG); CACGTG is the canonical form
        ebox_match_list = re.findall("CA..TG", line_seq)
        if not ebox_match_list:
            new_line += [0] * 3
        else:
            total_count = len(ebox_match_list)
            canon_count = ebox_match_list.count("CACGTG")
            other_count = total_count - canon_count
            new_line += [canon_count, other_count, total_count]

        # now find the overlapping and proximal genes
        # here each overlapping gene the tss prox locus overlaps the peak

        if use_tads:
            tad_loci = tad_collection.get_overlap(line_locus, "both")

            tad_id_list = [tad_locus.id for tad_locus in tad_loci]
            tad_genes = []
            for tad_id in tad_id_list:
                tad_genes += tad_dict[tad_id]
            if not tad_genes:
                no_tad_count += 1
        else:
            tad_genes = []

        if tad_genes:
            # restrict gene assignment to genes that share a TAD with the peak
            overlapping_genes = [
                start_dict[locus.id]["name"]
                for locus in tss_prox_collection.get_overlap(line_locus, "both")
                if locus.id in tad_genes
            ]
            proximal_genes = [
                start_dict[locus.id]["name"]
                for locus in tss_distal_collection.get_overlap(line_locus, "both")
                if locus.id in tad_genes
            ]
        else:
            overlapping_genes = [
                start_dict[locus.id]["name"]
                for locus in tss_prox_collection.get_overlap(line_locus, "both")
            ]
            proximal_genes = [
                start_dict[locus.id]["name"]
                for locus in tss_distal_collection.get_overlap(line_locus, "both")
            ]

        overlapping_genes = utils.uniquify(overlapping_genes)
        # here the distal-window tss locus overlaps the peak;
        # overlap takes priority over proximal
        proximal_genes = [
            gene for gene in proximal_genes if gene not in overlapping_genes
        ]
        proximal_genes = utils.uniquify(proximal_genes)

        overlapping_string = ",".join(overlapping_genes)
        proximal_string = ",".join(proximal_genes)

        new_line += [overlapping_string, proximal_string]

        peak_table.append(new_line)

    print("Out of {} regions, {} were assigned to at least 1 tad".format(
        str(len(peak_table)),
        str(no_tad_count),
    ))
    return peak_table
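
A hedged sketch of invoking make_peak_table and writing its output; the parameter values and file names are assumptions, and start_dict is built as in the sketch after split_regions above.

# hypothetical call; param_dict needs at least 'cpg_path'
peak_table = make_peak_table(
    param_dict={"cpg_path": "annotation/cpg_islands.bed"},
    split_gff_path="peaks_split.gff",
    average_table_path="signal_averages.txt",
    start_dict=start_dict,
    gene_list=[],  # empty -> defaults to every gene in start_dict
    genome_directory="/genomes/hg19/fasta/",
    tss_window=1000,
    distal_window=50000,
)
utils.unparse_table(peak_table, "peak_table.txt", "\t")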