Beispiel #1
0
def make_signal_table(
    names_list, gff_file, mapped_folder, median_norm=False, output=""
):
    """Build a per-locus signal table with one signal column per sample.

    For every name in ``names_list`` the mapped file
    ``<mapped_folder>/<gff_name>/<gff_name>_<name>.txt`` is parsed as a
    tab-delimited table whose first row is skipped as a header. In each
    remaining row, column 1 is used as the locus ID and column 2 as the signal.

    Args:
        names_list: List of sample names; one signal column is emitted per name.
        gff_file: Path of the gff the samples were mapped to. Only its basename
            up to the first "." is used, to locate the mapped files.
        mapped_folder: Folder containing one sub-folder of mapped output per gff.
        median_norm: If true, each sample's signal is divided by the median of
            that sample's signal column.
        output: Optional path. When non-empty the table is also written there
            as a tab-delimited file.

    Returns:
        A list of rows. The first row is the header
        ``["GENE_ID", "locusLine"] + names_list``. Each following row is the
        first two columns of a mapped-table row followed by one signal value
        per sample. Only the header is returned when ``names_list`` is empty.

    Exits the process with status 1 if a sample's mapped file cannot be found.
    """
    signal_dict = {name: defaultdict(float) for name in names_list}

    # now start filling in the signal dict
    gff_name = os.path.basename(gff_file).split(".")[0]
    print(gff_name)

    # Stays empty when there are no samples, so the table below is header-only
    # instead of failing on an unbound name.
    mapped_table = []
    for name in names_list:
        print("MAKING SIGNAL DICT FOR %s" % (name))

        mapped_file = os.path.join(
            mapped_folder, gff_name, "{}_{}.txt".format(gff_name, name)
        )
        # A single check is enough: there is only one candidate path.
        if not utils.check_output(mapped_file, 0.02, 0.02):
            print("ERROR NO MAPPED FILE FOUND FOR {}".format(name))
            sys.exit(1)
        print("FOUND MAPPED FILE FOR {} AT {}".format(name, mapped_file))

        mapped_table = utils.parse_table(mapped_file, "\t")
        if median_norm:
            median_signal = numpy.median(
                [float(line[2]) for line in mapped_table[1:]]
            )
        else:
            median_signal = 1

        for line in mapped_table[1:]:
            signal_dict[name][line[1]] = float(line[2]) / median_signal

    # now make the signal table
    signal_table = [["GENE_ID", "locusLine"] + names_list]

    # The row order and the two leading columns come from the last sample's
    # mapped table. Loci missing from another sample read as 0.0 because the
    # per-sample dictionaries are defaultdict(float).
    for line in mapped_table[1:]:
        locus_id = line[1]
        signal_table.append(
            line[0:2] + [signal_dict[name][locus_id] for name in names_list]
        )

    if output:
        utils.unparse_table(signal_table, output, "\t")
    return signal_table
def call_merge_supers(data_file, super_file1, super_file2, name1, name2,
                      merge_name, genome, parent_folder):
    """Call ROSE2 on merged super enhancers.

    If the expected ROSE output table can already be parsed it is reused.
    Otherwise the two super-enhancer files are merged into one gff, a ROSE bash
    script is generated for it and executed, and the output is looked up.

    Args:
        data_file: Data table path forwarded to ``call_rose_merged``.
        super_file1: First super-enhancer file to merge.
        super_file2: Second super-enhancer file to merge.
        name1: Name of the first dataset; also names the ``<name1>_ROSE``
            folder in which the output is expected.
        name2: Name of the second dataset.
        merge_name: Label used in the merged gff and output file names.
        genome: Genome build; upper-cased for use in file names.
        parent_folder: Output folder. It is prefixed verbatim to the merged gff
            file name (no path separator is inserted).

    Returns:
        Path of the ROSE ``ENHANCER_TO_GENE`` table. If the expected file does
        not appear, this is whatever ``pipeline_utils.get_file`` returns for
        the ``_SuperEnhancers_ENHANCER_TO_GENE.txt`` pattern in the ROSE folder.

    Exits the process with status 1 if the ROSE folder holds no visible files.
    """
    merged_gff_file = "%s%s_%s_MERGED_REGIONS_-0_+0.gff" % (
        parent_folder,
        genome.upper(),
        merge_name,
    )

    # check to make sure this hasn't been done yet
    rose_output = os.path.join(
        parent_folder,
        "{}_ROSE".format(name1),
        "{}_{}_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt".
        format(genome.upper(), merge_name),
    )

    try:
        utils.parse_table(rose_output, "\t")
        print("ROSE OUTPUT ALREADY FOUND HERE {}".format(rose_output))
        return rose_output
    except (FileNotFoundError, IOError):
        print("MERGING ENHANCER REGIONS FROM {} and {}".format(
            super_file1, super_file2))
        merged_gff = merge_collections(super_file1, super_file2, name1, name2,
                                       merged_gff_file)

        # call rose on the merged collection
        rose_bash_file = call_rose_merged(data_file, merged_gff, name1, name2,
                                          parent_folder)
        print(rose_bash_file)

        # run the bash command
        os.system("bash {}".format(rose_bash_file))

        # check for and return output
        if utils.check_output(rose_output, 1, 10):
            return rose_output

        # try finding it w/ a different name
        # os.listdir raises if the ROSE folder itself does not exist
        rose_folder = os.path.join(parent_folder, "{}_ROSE".format(name1))
        rose_file_list = [
            x for x in os.listdir(rose_folder) if x[0] != "."
        ]  # no hidden files
        if not rose_file_list:
            print("No files found in {}".format(rose_folder))
            sys.exit(1)

        # Hand the located file back to the caller; previously the result was
        # dropped and the function returned None on this path.
        return pipeline_utils.get_file("_SuperEnhancers_ENHANCER_TO_GENE.txt",
                                       rose_file_list, rose_folder)
Beispiel #3
0
def call_r_waterfall(gene_table_path, output_folder, analysis_name, top):
    """Write and run a bash script that calls the waterfall R script.

    The script ``<output_folder>/<analysis_name>_R_plotting.sh`` is written and
    executed with ``bash``. Afterwards the function checks for the
    ``<analysis_name>_top_<top>.cls`` file in ``output_folder``.

    Args:
        gene_table_path: Gene table passed to the R script.
        output_folder: Folder for the bash script and the R output. It is
            passed to the R script with a trailing "/".
        analysis_name: Analysis name used in the script and output file names.
        top: Value passed to the R script and used in the .cls file name.

    Returns:
        None. The function only returns once the .cls output is detected.

    Exits the process with status 1 if the .cls output is not detected.
    """
    r_bash_file_path = os.path.join(output_folder,
                                    "{}_R_plotting.sh".format(analysis_name))
    with open(r_bash_file_path, "w") as r_bash_file:
        r_bash_file.write("#!/usr/bin/bash\n\n")

        r_script_path = os.path.join(ROOT_DIR, "scripts",
                                     "enhancerPromoter_waterfall.R")
        r_cmd = "Rscript {} {} {} {} {}".format(
            r_script_path,
            gene_table_path,
            "{}/".format(output_folder),
            analysis_name,
            top,
        )
        r_bash_file.write(r_cmd)

    # The stray "%" left over from %-style formatting is removed from the message.
    print("writing R plotting command to disk and calling {}".format(
        r_bash_file_path))
    os.system("bash {}".format(r_bash_file_path))

    # now check for the .cls output
    cls_path = os.path.join(output_folder,
                            "{}_top_{}.cls".format(analysis_name, str(top)))

    if not utils.check_output(cls_path, 0.5, 5):
        print(
            "ERROR: UNABLE TO SUCCESFULLY DETECT R SCRIPT OUTPUT AT {}".format(
                cls_path))
        sys.exit(1)
def call_r_script(genome, output_folder, analysis_name, signal_table_file):
    """Run the clustering/heatmap R script and return the cluster table path.

    The command is executed through ``os.system``. Afterwards the function
    checks for ``<genome>_<analysis_name>_cluster_table.txt`` in
    ``output_folder`` and exits the process if it is not detected.
    """
    expected_table = os.path.join(
        output_folder,
        "{}_{}_cluster_table.txt".format(genome, analysis_name),
    )

    script_path = os.path.join(ROOT_DIR, "scripts", "clusterEnhancer.R")
    # TODO: fix R script so it does not require '/'
    folder_arg = output_folder + '/'
    r_cmd = "Rscript {} {} {} {} {}".format(
        script_path, genome, folder_arg, analysis_name, signal_table_file)

    print("Calling command {}".format(r_cmd))
    os.system(r_cmd)

    print("Checking for cluster table output at {}".format(expected_table))
    if not utils.check_output(expected_table, 1, 30):
        print("ERROR: CLUSTERING TABLE FAILED TO GENERATE")
        sys.exit()

    return expected_table
Beispiel #5
0
def map_bams_batch(
    bam_list,
    gff_list,
    mapped_folder,
    overwrite=False,
    names_list=None,
    extension=200,
    rpm=True,
):
    """Map every bam to every gff with bamliquidator_batch.

    For each gff a folder named after it is created under ``mapped_folder``.
    Each bam is mapped into ``<gff_name>/<name>/`` and, in a second pass, the
    resulting ``matrix.txt`` is renamed to ``<gff_name>/<gff_name>_<name>.txt``.

    Args:
        bam_list: Paths of the bam files to map.
        gff_list: Paths of the gff files to map against.
        mapped_folder: Parent folder for all mapping output.
        overwrite: If true, always run the mapping. Otherwise the mapping is
            skipped when ``matrix.txt`` can already be opened.
        names_list: Names paired with ``bam_list`` by position (``zip``).
            When empty or None, each bam's basename up to the first "." is used.
        extension: Value passed to bamliquidator_batch's ``-e`` option.
        rpm: Accepted for interface compatibility; not used by this function.

    Returns:
        None.

    Exits the process with status 1 if a gff file cannot be opened. A missing
    mapping output in the second pass is only reported, not treated as fatal.
    """
    if not names_list:
        names_list = [os.path.basename(bam).split(".")[0] for bam in bam_list]

    for gff_file in gff_list:
        # check to make sure gff exists
        try:
            with open(gff_file, "r"):
                pass
        except IOError:
            print("ERROR: GFF FILE {} DOES NOT EXIST".format(gff_file))
            sys.exit(1)

        gff_name = os.path.basename(gff_file).split(".")[0]

        # see if the parent directory exists, if not make it
        mapped_folder = utils.format_folder(mapped_folder, True)
        outdir_root = utils.format_folder(os.path.join(mapped_folder, gff_name), True)

        for name, bam in zip(names_list, bam_list):
            print("mapping {} to {}".format(name, gff_file))

            # output for the bamliquidator command
            outdir = utils.format_folder(os.path.join(outdir_root, name), True)
            out_matrix_file = os.path.join(outdir, "matrix.txt")

            needs_mapping = overwrite
            if not overwrite:
                print("checking for outfile {}".format(out_matrix_file))
                try:
                    with open(out_matrix_file, "r"):
                        pass
                except IOError:
                    needs_mapping = True
                else:
                    print("File {} Already Exists, not mapping".format(out_matrix_file))

            if needs_mapping:
                map_cmd = "bamliquidator_batch --sense . -e {} --match_bamToGFF -r {} -o {} {}".format(
                    extension, gff_file, outdir, bam,
                )
                print(map_cmd)
                os.system(map_cmd)

    # now initiate another giant loop to check for output and rename it
    for gff_file in gff_list:
        # check to make sure gff exists
        try:
            with open(gff_file, "r"):
                pass
        except IOError:
            print("ERROR: GFF FILE {} DOES NOT EXIST".format(gff_file))
            sys.exit(1)

        gff_name = os.path.basename(gff_file).split(".")[0]

        # see if the parent directory exists, if not make it
        mapped_folder = utils.format_folder(mapped_folder, True)
        # the first outdir of the mapping
        outdir_root = utils.format_folder(os.path.join(mapped_folder, gff_name), True)

        for name in names_list:
            print("Checking output of {} mapping to {}".format(name, gff_file))

            outdir = utils.format_folder(os.path.join(outdir_root, name), True)
            matrix_file = os.path.join(outdir, "matrix.txt")

            # what we want the eventual outfile to look like
            out_matrix_file = os.path.join(
                outdir_root, "{}_{}.txt".format(gff_name, name)
            )

            # nothing to do when the renamed file is already in place
            try:
                with open(out_matrix_file, "r"):
                    pass
                continue
            except IOError:
                pass

            if utils.check_output(matrix_file, 0.1, 2):
                print(
                    "Renaming output {} as {}".format(matrix_file, out_matrix_file)
                )
                os.rename(matrix_file, out_matrix_file)
            else:
                print(
                    "ERROR: No output found for {} mapping to {}".format(
                        name, gff_file
                    )
                )
def map_merged_gff(data_file, name_dict, merged_gff_file, analysis_name,
                   output_folder, mask_file):
    """Call rose on the merged_gff_file for all datasets.

    The alphabetically first name in ``name_dict`` is used as the primary
    dataset. Every other dataset is passed as an extra map, followed by its
    background dataset when ``name_dict[name]["background"]`` is truthy. An
    existing region map is reused instead of calling ROSE again.

    Args:
        data_file: Data table path; loaded with
            ``pipeline_utils.load_data_table`` and forwarded to ``call_rose2``.
        name_dict: Mapping of dataset name to a dict with a "background" entry.
        merged_gff_file: Gff of merged regions to map against.
        analysis_name: Used to name the generated ``<analysis_name>_roseCall.sh``.
        output_folder: Parent folder; ROSE output goes into its "rose" sub-folder.
        mask_file: Forwarded to ``call_rose2`` as ``mask``.

    Returns:
        Path of the ``<gff_name>_0KB_STITCHED_ENHANCER_REGION_MAP.txt`` file
        inside the primary dataset's ROSE folder.

    Exits the process with status 1 if a listed background dataset is missing
    from the data table or if the region map is not produced.
    """
    data_dict = pipeline_utils.load_data_table(data_file)
    rose_parent_folder = os.path.join(output_folder, "rose")
    utils.format_folder(rose_parent_folder, True)
    gff_name = os.path.basename(merged_gff_file).split(".")[0]
    bash_file_name = os.path.join(output_folder, "rose",
                                  "{}_roseCall.sh".format(analysis_name))

    # names_list[0] is the primary dataset
    # extra_map has to hold all other datasets + their backgrounds
    names_list = sorted(name_dict)
    extra_map = []
    for name in names_list[1:]:
        if not name_dict[name]["background"]:
            extra_map.append(name)
            continue

        # the flag comes from name_dict, the background's name from the data table
        background_name = data_dict[name]["background"]
        if background_name not in data_dict:
            print(
                "ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET {} FOR {}"
                .format(background_name, name))
            sys.exit(1)
        extra_map += [name, background_name]

    print(extra_map)

    # first check to see if this has already been done
    merged_region_map = os.path.join(
        output_folder,
        "rose",
        "{}_ROSE".format(names_list[0]),
        "{}_0KB_STITCHED_ENHANCER_REGION_MAP.txt".format(gff_name),
    )
    print("LOOKING FOR REGION MAP AT {}".format(merged_region_map))

    if utils.check_output(merged_region_map, 1, 1):
        print("FOUND PREVIOUS REGION MAP")
        return merged_region_map

    bash_file_name = pipeline_utils.call_rose2(
        data_file,
        "",
        rose_parent_folder,
        [names_list[0]],
        extra_map,
        merged_gff_file,
        0,
        0,
        bash_file_name,
        mask=mask_file,
    )

    bash_command = "bash {}".format(bash_file_name)
    # Announce the command before running it, so the log shows what is in
    # progress while the call is still executing.
    print("Running enhancer mapping command:\n{}".format(bash_command))
    os.system(bash_command)

    if not utils.check_output(merged_region_map, 5, 60):
        print(
            "UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE {}.\nEXITING NOW"
            "".format(merged_gff_file))
        sys.exit(1)

    return merged_region_map
def launch_enhancer_mapping(
    data_file,
    name_dict,
    output_folder,
    rose_folder,
    stitch,
    tss_distance,
    enhancer_type,
    mask_file="",
):
    """Launches enhancer mapping if needed from enriched region files.

    Datasets whose ``name_dict[name]["enhancer_file"]`` is an empty string get
    a ROSE call generated from their "enriched_file" and run with ``bash``.
    The resulting enhancer table path is then stored back into ``name_dict``.

    Args:
        data_file: Data table path forwarded to ``pipeline_utils.call_rose2``.
        name_dict: Mapping of dataset name to a dict with "enhancer_file" and
            "enriched_file" entries. Updated in place.
        output_folder: Not used by this function.
        rose_folder: Folder that receives one ``<name>_ROSE`` folder per dataset.
        stitch: Forwarded to ``call_rose2``.
        tss_distance: Forwarded to ``call_rose2``.
        enhancer_type: One of "super", "stretch" or "superstretch"; selects
            which ROSE table is picked up.
        mask_file: Forwarded to ``call_rose2`` as ``mask``.

    Returns:
        ``name_dict`` (the same object), with "enhancer_file" filled in for
        every dataset ROSE was called for.

    Exits the process with status 1 if ``enhancer_type`` is not supported or a
    dataset's ROSE folder holds no visible files.
    """
    names_list = list(name_dict.keys())

    # check to see if everything is good, if so return and call it a day
    if all(len(name_dict[name]["enhancer_file"]) > 0 for name in names_list):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return name_dict

    # Resolve the table suffix before any ROSE call, so an unsupported type
    # fails immediately rather than with a NameError after ROSE has run.
    enhancer_strings = {
        "super": "AllEnhancers.table.txt",
        "stretch": "AllEnhancers_Length.table.txt",
        "superstretch": "AllEnhancers_SuperStretch.table.txt",
    }
    if enhancer_type not in enhancer_strings:
        print("ERROR: UNSUPPORTED ENHANCER TYPE {}".format(enhancer_type))
        sys.exit(1)
    enhancer_string = enhancer_strings[enhancer_type]

    # if not, have to call rose
    rose_output_folder = utils.format_folder(rose_folder, True)

    queue_list = []
    for name in names_list:
        # check to see if we need to call rose
        if name_dict[name]["enhancer_file"] != "":
            continue

        # get the enriched file
        enriched_file = name_dict[name]["enriched_file"]
        # call rose
        print("CALLING ROSE FOR {}".format(name))
        bash_file_name = pipeline_utils.call_rose2(
            data_file,
            "",
            rose_output_folder,
            [name],
            [],
            enriched_file,
            tss_distance,
            stitch,
            mask=mask_file,
        )
        print(bash_file_name)
        os.system("bash {}".format(bash_file_name))
        # add name to queue list
        queue_list.append(name)

    # now check for completion of datasets
    for name in queue_list:
        # check for the enhancer table
        enhancer_file = os.path.join(
            rose_output_folder,
            "{}_ROSE".format(name),
            "{}_peaks_{}".format(name, enhancer_string),
        )

        print("CHECKING FOR {} ROSE OUTPUT IN {}".format(name, enhancer_file))
        if utils.check_output(enhancer_file, 1, 10):
            print("FOUND ENHANCER OUTPUT FOR {}".format(name))
        else:
            # try finding it w/ a different name
            # os.listdir raises if the ROSE folder itself does not exist
            sample_rose_folder = os.path.join(rose_output_folder,
                                              "{}_ROSE".format(name))
            rose_file_list = [
                x for x in os.listdir(sample_rose_folder) if x[0] != "."
            ]  # no hidden files
            if not rose_file_list:
                print("No files found in {}".format(sample_rose_folder))
                sys.exit(1)
            enhancer_file = pipeline_utils.get_file(enhancer_string,
                                                    rose_file_list,
                                                    sample_rose_folder)

        name_dict[name]["enhancer_file"] = enhancer_file

    return name_dict
Beispiel #8
0
def main():
    """Main run call.

    Command-line entry point. In order, it:

    1. parses the arguments and creates the output, "gff" and "mappedGFF"
       folders;
    2. converts (.bed) or copies (.gff) every input file into the gff folder;
    3. builds the bam list (rankby bams followed by control bams);
    4. renumbers the input regions and writes the master gff;
    5. filters the regions by the chromosomes found in the bams, applies the
       optional mask, and stitches the regions;
    6. maps every unique bam to the stitched gff with bamliquidator_batch;
    7. builds the region map, collapses it, and calls the R script and the
       gene mapper on the resulting enhancer tables.

    Exits the process with status 1 on invalid control bam counts, an
    unsupported genome, an unsupported mask extension, or a failed mapping.
    """
    debug = False

    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help=
        ("Enter a comma separated list of .gff or .bed file of binding sites used to make "
         "enhancers"),
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="Enter a comma separated list of bams to rank by",
    )
    parser.add_argument("-o",
                        "--out",
                        dest="out",
                        required=True,
                        help="Enter an output folder")
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Provide a name for the analysis otherwise ROSE will guess",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help=
        ("Enter a comma separated list of control bams. Can either provide a single control "
         "bam for all rankby bams, or provide a control bam for each individual bam"
         ),
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=
        ("Enter a max linking distance for stitching. Default will determine optimal stitching"
         " parameter"),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mappedGFF"),
                                        True)

    # GETTING INPUT FILE(s)
    input_list = [
        input_file for input_file in args.input.split(",")
        if len(input_file) > 1
    ]

    # converting all input files into GFFs and moving into the GFF folder
    input_gf_list = []
    for input_file in input_list:
        # Each entry is handled on its own. Using args.input here (the whole
        # comma separated string) breaks as soon as several files are given.
        input_extension = input_file.split(".")[-1]
        if input_extension == "bed":
            # CONVERTING A BED TO GFF
            input_gff_name = os.path.basename(input_file)[0:-4]
            input_gff_file = os.path.join(gff_folder,
                                          "{}.gff".format(input_gff_name))
            utils.bed_to_gff(input_file, input_gff_file)
        else:
            if input_extension != "gff":
                print(
                    "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
                )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )

        input_gf_list.append(input_gff_file)

    # GETTING THE LIST OF bam_fileS TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control #or none!
    # bamlist should be all rankby bams followed by control bams
    rankby_bam_list = [bam for bam in args.rankby.split(",") if len(bam) > 0]
    if args.control:
        control_bam_list = [
            bam for bam in args.control.split(",") if len(bam) > 0
        ]

        if len(control_bam_list) == len(rankby_bam_list):
            # case where an equal number of backgrounds are given
            bam_file_list = rankby_bam_list + control_bam_list
        elif len(control_bam_list) == 1:
            # case where a universal background is applied
            bam_file_list = rankby_bam_list + control_bam_list * len(
                rankby_bam_list)
        else:
            print(
                "ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM"
                " FOR EACH SAMPLE")
            sys.exit(1)
    else:
        bam_file_list = rankby_bam_list

    # Stitch parameter; "" is passed through to region_stitching unchanged
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    remove_tss = tss_window != 0

    # GETTING THE GENOME
    genome = args.genome.upper()
    print("USING {} AS THE GENOME".format(genome))

    # GETTING THE CORRECT ANNOT FILE
    try:
        annot_file = rose2_utils.genome_dict[genome]
    except KeyError:
        print("ERROR: UNSUPPORTED GENOMES TYPE {}".format(genome))
        sys.exit(1)

    # FINDING THE ANALYSIS NAME
    if args.name:
        input_name = args.name
    else:
        input_name = os.path.basename(input_gf_list[0]).split(".")[0]
    print("USING {} AS THE ANALYSIS NAME".format(input_name))

    print("FORMATTING INPUT REGIONS")
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(input_gf_list) == 1:
        input_gff = utils.parse_table(input_gf_list[0], "\t")
    else:
        input_loci = []
        for gff_file in input_gf_list:
            print("\tprocessing {}".format(gff_file))
            gff = utils.parse_table(gff_file, "\t")
            gff_collection = utils.gff_to_locus_collection(gff, 50)
            input_loci += gff_collection.get_loci()

        input_collection = utils.LocusCollection(input_loci, 50)
        # stitches to produce unique regions
        input_collection = input_collection.stitch_collection()

        input_gff = utils.locus_collection_to_gff(input_collection)

    formatted_gff = []
    # now number things appropriately
    for i, line in enumerate(input_gff):
        # every region gets the id <input_name>_<n>, numbered from 1
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        line_id = "{}_{}".format(input_name, str(i + 1))  # 1 indexing

        new_line = [
            chrom,
            line_id,
            line_id,
            min(coords),
            max(coords),
            "",
            sense,
            "",
            line_id,
        ]
        formatted_gff.append(new_line)

    # name of the master input gff file
    master_gff_file = os.path.join(
        gff_folder, "{}_{}_ALL_-0_+0.gff".format(genome, input_name))
    utils.unparse_table(formatted_gff, master_gff_file, "\t")

    print("USING {} AS THE INPUT GFF".format(master_gff_file))

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(master_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        mask_extension = mask_file.split(".")[-1].upper()
        if mask_extension == "BED":
            # Uses the same snake_case helper as the input conversion above.
            # NOTE(review): assumes utils.bed_to_gff returns the gff table when
            # no output path is given - confirm against utils.
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_extension == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)
        mask_collection = utils.gff_to_locus_collection(mask_gff)

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print("FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
            len(reference_loci) - len(filtered_loci), mask_file))
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    print(stitch_window)
    print(type(stitch_window))

    # The gff, its name and the debug file share one stem; only the suffix
    # depends on whether TSS regions were excluded.
    tss_suffix = "_TSS_DISTAL" if remove_tss else ""
    stitched_gff_name = "{}_{}KB_STITCHED{}".format(
        input_name, str(stitch_window // 1000), tss_suffix)
    stitched_gff_file = os.path.join(gff_folder,
                                     "{}.gff".format(stitched_gff_name))
    debug_out_file = os.path.join(gff_folder,
                                  "{}.debug".format(stitched_gff_name))

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name))
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE STITCHED GFF
    # prevent redundant mapping
    bam_file_list_unique = utils.uniquify(list(bam_file_list))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:
        bam_file_name = os.path.basename(bam_file)

        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name,
                                                 bam_file_name))
        mapped_out1_file = os.path.join(mapped_out1_folder, "matrix.txt")

        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                stitched_gff_file, mapped_out1_file))
            continue

        cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
            stitched_gff_file,
            mapped_out1_folder,
            bam_file,
        )
        print(cmd1)

        os.system(cmd1)
        if utils.check_output(mapped_out1_file, 0.2, 5):
            print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                stitched_gff_file, bam_file_name))
        else:
            print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                stitched_gff_file, bam_file_name))
            sys.exit(1)

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("FINDING AVERAGE SIGNAL AMONGST BAMS")
    meta_output_file = collapse_region_map(output_file1,
                                           input_name + "_MERGED_SIGNAL",
                                           control_bams=args.control)

    # now try the merging
    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        meta_output_file,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print("CALLING GENE MAPPING")

    # for now don't use ranking bam to call top genes
    # All three tables go through the same command. The super-stretch call
    # previously passed only the output folder to -i and used a different
    # executable name from the other two.
    for enhancer_class in ("SuperEnhancers", "StretchEnhancers",
                           "SuperStretchEnhancers"):
        table_file = "{}_{}.table.txt".format(input_name, enhancer_class)
        cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
            genome, os.path.join(out_folder, table_file))
        print(cmd)
        os.system(cmd)
Beispiel #9
0
def main():
    """Parse command-line options and drive the bam plotting workflow.

    Steps, in order:

    1. Parse the command line and normalise list options that may be given
       either space separated or as one comma/colon separated token.
    2. Build the list of regions to plot (a GFF-style table) from a ``.bed``
       file, any other readable file (treated as ``.gff``), or a coordinate
       string such as ``chr1:+:1-1000``.
    3. Optionally stretch short regions to a minimum length.
    4. Call ``make_bam_plot_tables`` and ``call_r_plot``, write the returned
       R command into a bash script inside the temp folder and run it.
    5. Remove the temp folder unless ``--save-temp`` was given.

    Every validation failure prints a message and raises ``SystemExit`` with a
    non-zero status. Validation uses explicit checks rather than ``assert`` so
    it still runs under ``python -O``; this matters most for the guards in
    front of ``shutil.rmtree``.
    """
    # Genomes accepted by the check further down; help and error text are
    # generated from this tuple so they cannot drift out of sync with it.
    supported_genomes = (
        "HG18",
        "HG19",
        "HG19_RIBO",
        "HG38",
        "MM9",
        "MM10",
        "RN4",
        "RN6",
    )

    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a comma/space separated list of .bam files to be processed.",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or genomic region e.g. chr1:+:1-1000.",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help="specify a genome, {} are currently supported".format(
            ",".join(supported_genomes)
        ),
        required=True,
    )

    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options
    parser.add_argument(
        "--stretch-input",
        dest="stretch_input",
        default=None,
        type=int,
        help=(
            "Stretch the input regions to a minimum length in bp, e.g. 10000 (for"
            " 10kb)"
        ),
    )
    parser.add_argument(
        "-c",
        "--color",
        dest="color",
        default=None,
        nargs="*",
        help=(
            "Enter a colon or space separated list of colors e.g. "
            "255,0,0:255,125,0, default samples the rainbow"
        ),
    )
    parser.add_argument(
        "-s",
        "--sense",
        dest="sense",
        default="both",
        help="Map to '+','-' or 'both' strands. Default maps to both.",
    )
    parser.add_argument(
        "-e",
        "--extension",
        dest="extension",
        default=200,
        help="Extends reads by n bp. Default value is 200bp",
    )
    parser.add_argument(
        "-r",
        "--rpm",
        dest="rpm",
        action="store_true",
        default=False,
        help="Normalizes density to reads per million (rpm) Default is False",
    )
    parser.add_argument(
        "-y",
        "--yScale",
        dest="y_scale",
        default="relative",
        help=(
            "Choose either relative or uniform y axis scaling. options = "
            "'relative,uniform' Default is relative scaling"
        ),
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        default=None,
        nargs="*",
        help="Enter a comma or space separated list of names for your bams",
    )
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        default="MULTIPLE",
        help=(
            "Choose either all lines on a single plot or multiple plots. options "
            "= 'SINGLE,MULTIPLE,MERGE'"
        ),
    )
    parser.add_argument(
        "-t",
        "--title",
        dest="title",
        default="",
        help=(
            "Specify a title for the output plot(s), default will be the "
            "coordinate region"
        ),
    )
    parser.add_argument(
        "-q",
        "--skip-cache",
        dest="skip_cache",
        action="store_true",
        default=False,
        help="Toggles option to skip loading annotation cache file",
    )

    parser.add_argument(
        "--scale",
        dest="scale",
        default=None,
        nargs="*",
        help=(
            "Enter a comma or space separated list of scaling factors for your "
            "bams. Default is none"
        ),
    )
    parser.add_argument(
        "--bed",
        dest="bed",
        nargs="*",
        help="Add a comma-delimited or space-delimited list of bed files to plot",
    )
    parser.add_argument(
        "--multi-page",
        dest="multi",
        action="store_true",
        default=False,
        help="If flagged will create a new pdf for each region",
    )

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument(
        "--save-temp",
        dest="save",
        action="store_true",
        default=False,
        help="If flagged will save temporary files made by bamPlot",
    )

    args = parser.parse_args()

    print(args)

    # argparse enforces presence of the flags, but "-b" with no values still
    # yields an empty list, and empty strings are possible for the others.
    if not (args.bam and args.input and args.genome and args.output):
        parser.print_help()
        sys.exit(1)

    # Support a legacy mode where a ',' delimited multiple files
    bam_file_list = args.bam
    if len(bam_file_list) == 1:
        bam_file_list = bam_file_list[0].split(",")

    # Make sure these are actually files & readable (!)
    for filename in bam_file_list:
        if not os.access(filename, os.R_OK):
            print("ERROR: UNABLE TO READ BAM FILE {}".format(filename))
            sys.exit(1)

    # bringing in any beds
    if args.bed:
        bed_file_list = args.bed
        if len(bed_file_list) == 1:
            bed_file_list = bed_file_list[0].split(",")
        print(bed_file_list)
        bed_collection = make_bed_collection(bed_file_list)
    else:
        bed_collection = utils.LocusCollection([], 50)

    # Load the input for graphing. One of:
    # - A .gff
    # - A .bed
    # - a specific input region (e.g. chr10:.:93150000-93180000)

    valid_sense_options = {"+", "-", "."}
    if os.access(args.input, os.R_OK):
        if args.input.endswith(".bed"):
            # Uniquely graph every input of this bed
            parsed_input_bed = utils.parse_table(args.input, "\t")
            gff_name = os.path.basename(args.input)  # Graph title
            gff = None
            try:
                if parsed_input_bed[0][5] in valid_sense_options:
                    # This .bed might have a sense parameter
                    gff = [
                        [e[0], "", args.input, e[1], e[2], "", e[5], "", ""]
                        for e in parsed_input_bed
                    ]
            except IndexError:
                # Deliberate fall-through: an empty table or a row without a
                # sixth column leaves gff as None, handled just below.
                pass

            if gff is None:
                print(
                    "Your bed doesn't have a valid sense parameter. Defaulting to both "
                    "strands, '.'"
                )
                # We only take chr/start/stop and ignore everything else.
                gff = [
                    [e[0], "", args.input, e[1], e[2], "", ".", "", ""]
                    for e in parsed_input_bed
                ]
        else:
            # Default to .gff, since that's the original behavior
            gff = utils.parse_table(args.input, "\t")
            gff_name = os.path.basename(args.input).split(".")[0]
    else:
        # means a coordinate line has been given e.g. chr1:+:1-100
        chrom_line = args.input.split(":")
        try:
            chrom = chrom_line[0]
            sense = chrom_line[1]
            # A missing third field (IndexError) or a range that is not
            # exactly "start-end" (ValueError) is a malformed region too.
            start, end = chrom_line[2].split("-")
        except (IndexError, ValueError):
            print("Invalid input line or inaccessible file. Try: chr1:.:1-5000")
            sys.exit(1)
        if sense not in valid_sense_options:
            print(
                "ERROR: UNRECOGNIZED STRAND {} IN INPUT. Try: chr1:.:1-5000".format(
                    sense
                )
            )
            sys.exit(1)
        if chrom[0:3] != "chr":
            print("ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT")
            sys.exit(1)
        gff_line = [chrom, "", args.input, start, end, "", sense, "", ""]
        gff_name = "{}_{}_{}_{}".format(chrom, sense, start, end)
        gff = [gff_line]

    # Consider stretching the regions to a fixed minimum size
    if args.stretch_input:
        print(
            "Stretching inputs to a minimum of: {} bp".format(
                str(args.stretch_input)
            )
        )
        min_length = args.stretch_input
        stretch_gff = []
        for e in gff:
            difference = int(e[4]) - int(e[3])
            if difference < min_length:
                # Pad both ends equally; int() truncates odd remainders.
                pad = int((min_length - difference) / 2)
                stretch_gff.append(
                    [
                        e[0],
                        e[1],
                        e[2],
                        int(e[3]) - pad,
                        int(e[4]) + pad,
                        e[5],
                        e[6],
                        e[7],
                        e[8],
                    ]
                )
            else:
                stretch_gff.append(e)

        gff = stretch_gff

    # Sanity test the gff object: every strand column must be a known value
    if not all(e[6] in valid_sense_options for e in gff):
        print("ERROR: INPUT REGIONS CONTAIN AN UNRECOGNIZED STRAND VALUE")
        sys.exit(1)

    # bring in the genome
    genome = args.genome.upper()
    if genome not in supported_genomes:
        print(
            "ERROR: UNSUPPORTED GENOME TYPE {}. USE ONE OF {}".format(
                genome, ", ".join(supported_genomes)
            )
        )
        parser.print_help()
        sys.exit(1)

    # bring in the rest of the options

    # output
    root_folder = args.output
    try:
        os.listdir(root_folder)
    except OSError:
        print("ERROR: UNABLE TO FIND OUTPUT DIRECTORY {}".format(root_folder))
        sys.exit(1)

    # Get analysis title
    title = args.title if args.title else gff_name

    # make a temp folder
    temp_folder = os.path.join(root_folder, title)
    print("CREATING TEMP FOLDER {}".format(temp_folder))
    utils.format_folder(temp_folder, create=True)

    # colors
    if args.color:
        color_list = args.color
        if len(color_list) == 1:
            color_list = color_list[0].split(":")
        color_list = [x.split(",") for x in color_list]
        if len(color_list) < len(bam_file_list):
            print(
                "WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED"
            )
            # recycling the color list
            color_list += color_list * (len(bam_file_list) // len(color_list))
            color_list = color_list[: len(bam_file_list)]

    else:
        # cycles through the colors of the rainbow
        color_list = taste_the_rainbow(len(bam_file_list))

    # sense (this is the mapping strand option, not the region strand above)
    sense = args.sense

    extension = int(args.extension)

    rpm = args.rpm

    scale = args.scale
    if scale and len(scale) == 1:
        scale = scale[0].split(",")

    y_scale = args.y_scale.upper()

    # names
    if args.names:
        names = args.names
        if len(names) == 1:
            names = names[0].split(",")

        if len(names) != len(bam_file_list):
            print("ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND")
            parser.print_help()
            sys.exit(1)
    else:
        names = [os.path.basename(x) for x in bam_file_list]

    # plot style
    plot_style = args.plot.upper()
    if plot_style not in ("SINGLE", "MULTIPLE", "MERGE"):
        print("ERROR: PLOT STYLE {} NOT AN OPTION".format(plot_style))
        parser.print_help()
        sys.exit(1)

    # now run!
    # NOTE(review): n_bins is not assigned anywhere in this function;
    # presumably it is a module-level constant - confirm it exists.
    summary_table_file_name = make_bam_plot_tables(
        gff,
        genome,
        bam_file_list,
        color_list,
        n_bins,
        sense,
        extension,
        rpm,
        temp_folder,
        names,
        title,
        bed_collection,
        scale,
    )
    print("{} is the summary table".format(summary_table_file_name))

    # running the R command to plot
    multi = args.multi
    out_file = os.path.join(root_folder, "{}_plots.pdf".format(title))
    r_cmd = call_r_plot(
        summary_table_file_name, out_file, y_scale, plot_style, multi
    )

    # open a bash file
    bash_file_name = os.path.join(temp_folder, "{}_Rcmd.sh".format(title))
    with open(bash_file_name, "w") as bash_file:
        bash_file.write("#!/usr/bin/bash\n")
        bash_file.write(r_cmd)
    print("Wrote R command to {}".format(bash_file_name))
    os.system("bash {}".format(bash_file_name))

    # delete temp files
    if not args.save:
        if utils.check_output(out_file, 1, 10):
            # Recursive delete is dangerous: refuse obviously unsafe targets.
            # These are real checks (not asserts) so -O cannot strip them.
            if " " in temp_folder or temp_folder == "/":
                print(
                    "ERROR: REFUSING TO REMOVE UNSAFE TEMP FOLDER {}".format(
                        temp_folder
                    )
                )
                sys.exit(1)
            shutil.rmtree(temp_folder)
            print("Removing temp folder: {}".format(temp_folder))
        else:
            print("ERROR: NO OUTPUT FILE {} DETECTED".format(out_file))
Beispiel #10
0
def main():
    """Run the ROSE2 workflow from command-line arguments.

    Steps, in order:

    1. Parse arguments and create the output, ``gff`` and ``mapped_gff``
       folders.
    2. Convert a ``.bed`` input to GFF, or copy the input GFF into the gff
       folder (any other extension is assumed to be GFF, with a warning).
    3. Filter the input regions by the chromosomes found in the BAMs,
       optionally drop regions overlapping a mask, and stitch the rest.
    4. Write the stitched GFF, map every unique BAM to it with
       ``bamliquidator_batch`` and build the region signal table.
    5. Call the ``ROSE2_callSuper.R`` script, then ``ROSE2_geneMapper`` on the
       super, stretch and super-stretch enhancer tables.

    Raises ``SystemExit`` with a non-zero status when the genome is unknown,
    the mask file has an unsupported extension, or mapping a BAM fails.
    """
    debug = False
    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help="Enter a .gff or .bed file of binding sites used to make enhancers",
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-o", "--out", dest="out", required=True, help="Enter an output folder"
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-b",
        "--bams",
        dest="bams",
        required=False,
        help="Enter a comma separated list of additional bam files to map to",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=(
            "Enter a max linking distance for stitching. Default will determine optimal stitching"
            " parameter"
        ),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )

    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mapped_gff"), True)

    # GETTING INPUT FILE
    input_extension = args.input.split(".")[-1]
    if input_extension == "bed":
        # CONVERTING A BED TO GFF (strip the 4-character ".bed" suffix)
        input_gff_name = os.path.basename(args.input)[0:-4]
        input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
        utils.bed_to_gff(args.input, input_gff_file)
    else:
        if input_extension != "gff":
            print(
                "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
            )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    # GETTING THE LIST OF bam_fileS TO PROCESS
    # Duplicates are kept on purpose here (the same control may be listed
    # several times); only the mapping loop below de-duplicates.
    bam_file_list = [args.rankby]
    if args.control:
        bam_file_list.append(args.control)
    if args.bams:
        bam_file_list += args.bams.split(",")

    # optional args

    # Stitch parameter: "" is passed through to region_stitching unchanged
    stitch_window = "" if args.stitch == "" else int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    remove_tss = tss_window != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print("USING {} AS THE INPUT GFF".format(input_gff_file))
    input_name = os.path.basename(input_gff_file).split(".")[0]

    # GETTING THE GENOME
    genome = args.genome
    print("USING {} AS THE GENOME".format(genome))

    try:
        annot_file = rose2_utils.genome_dict[genome.upper()]
    except KeyError:
        # Fail with a readable message instead of a bare KeyError traceback.
        print("ERROR: UNSUPPORTED GENOME TYPE {}".format(genome))
        sys.exit(1)

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)
    print("STARTING WITH {} INPUT REGIONS".format(len(reference_collection)))
    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        print("USING MASK FILE {}".format(mask_file))
        mask_extension = mask_file.split(".")[-1].upper()
        if mask_extension == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_extension == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            # Without a parsed mask there is nothing to continue with; stop
            # here rather than failing later on an unbound variable.
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(str(len(mask_collection))))
        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus
            for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print(
            "FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
                str(len(reference_loci) - len(filtered_loci)), mask_file
            )
        )
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )
    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)
    # making sure start/stop ordering are correct (rows are edited in place)
    for line in stitched_gff:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitch_window)
    print(type(stitch_window))

    # All stitched artefacts share one base name; TSS exclusion adds a suffix.
    stitched_gff_name = "{}_{}KB_STITCHED".format(
        input_name, str(stitch_window // 1000)
    )
    if remove_tss:
        stitched_gff_name += "_TSS_DISTAL"
    stitched_gff_file = os.path.join(gff_folder, "{}.gff".format(stitched_gff_name))
    debug_out_file = os.path.join(gff_folder, "{}.debug".format(stitched_gff_name))

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name)
    )
    print("OUTPUT WILL BE WRITTEN TO  {}".format(output_file1))

    # MAPPING TO THE STITCHED GFF
    # prevent redundant mapping
    bam_file_list_unique = utils.uniquify(list(bam_file_list))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:
        bam_file_name = os.path.basename(bam_file)

        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name)
        )
        mapped_out1_file = os.path.join(mapped_out1_folder, "matrix.txt")

        # Reuse an existing mapping result when one is already on disk.
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print(
                "FOUND {} MAPPING DATA FOR BAM: {}".format(
                    stitched_gff_file, mapped_out1_file
                )
            )
            continue

        cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
            stitched_gff_file, mapped_out1_folder, bam_file,
        )
        print(cmd1)

        os.system(cmd1)
        if utils.check_output(mapped_out1_file, 0.2, 5):
            print(
                "SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name
                )
            )
        else:
            print(
                "ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name
                )
            )
            sys.exit(1)

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    control_name = os.path.basename(args.control) if args.control else "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        output_file1,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    # NOTE(review): the reason for this fixed 20 s pause is not visible here;
    # presumably it waits for the R outputs to land on disk - confirm.
    time.sleep(20)

    # Same gene-mapper invocation for each enhancer table, in this order.
    for table_kind in ("SuperEnhancers", "StretchEnhancers", "SuperStretchEnhancers"):
        table_path = os.path.join(
            out_folder, "{}_{}.table.txt".format(input_name, table_kind)
        )
        if args.control:
            cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
                genome, args.rankby, args.control, table_path
            )
        else:
            cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
                genome, args.rankby, table_path
            )
        os.system(cmd)
Beispiel #11
0
def tf_edge_delta_out(
    crc_folder,
    bam_list,
    analysis_name,
    edge_table_path_1,
    edge_table_path_2,
    group1_list,
    group2_list,
    output="",
):
    """Calculates changes in group out degree at each predicted motif occurrence (by subpeaks).

    The two edge tables are merged, turned into a GFF and mapped against the
    BAMs to obtain per-sample signal at each edge. Edges are kept when at
    least one group mean is above that group's median and both group means
    are positive; for those, log2(group1 mean / group2 mean) is recorded.
    Per-TF summary statistics of that log2 ratio are then written out.

    Intermediate tables (``*_EDGE_TABLE_signal.txt`` and its ``_filtered``
    variant) are reused if they already exist in ``crc_folder``.

    Args:
        crc_folder: Folder for intermediate and (by default) final output.
        bam_list: BAM files handed to ``pipeline_utils.map_regions``.
        analysis_name: Prefix for all generated file names.
        edge_table_path_1: First edge table to merge.
        edge_table_path_2: Second edge table to merge.
        group1_list: Sample names (signal-table column headers) of group 1.
        group2_list: Sample names (signal-table column headers) of group 2.
        output: Optional explicit output path; when empty the result goes to
            ``<crc_folder>/<analysis_name>_EDGE_DELTA_OUT.txt``.

    Returns:
        The path of the written out-degree table.
    """
    crc_folder = utils.format_folder(crc_folder, True)
    edge_path = merge_edge_tables(
        edge_table_path_1,
        edge_table_path_2,
        os.path.join(crc_folder, "{}_EDGE_TABLE.txt".format(analysis_name)),
    )

    # make a gff of the edge table
    # NOTE(review): assumes edge table columns are (name1, name2, chrom,
    # start, stop, ...) - confirm against merge_edge_tables.
    edge_table = utils.parse_table(edge_path, "\t")
    edge_gff = []
    for line in edge_table[1:]:
        edge_id = "{}_{}".format(line[0], line[1])
        edge_gff.append(
            [line[2], edge_id, "", line[3], line[4], "", ".", "", edge_id]
        )

    edge_gff_path = os.path.join(crc_folder,
                                 "{}_EDGE_TABLE.gff".format(analysis_name))
    utils.unparse_table(edge_gff, edge_gff_path, "\t")

    # direct the output to the crc folder
    signal_path = os.path.join(
        crc_folder, "{}_EDGE_TABLE_signal.txt".format(analysis_name))

    all_group_list = group1_list + group2_list
    if not utils.check_output(signal_path, 0, 0):
        signal_table_list = pipeline_utils.map_regions(
            bam_list,
            [edge_gff_path],
            crc_folder,
            crc_folder,
            all_group_list,
            True,
            signal_path,
            extend_reads_to=100,
        )
        print(signal_table_list)
    else:
        print("Found previous signal table at {}".format(signal_path))

    # now bring in the signal table as a dictionary using the locus line as the id
    print("making log2 group1 vs group2 signal table at edges")
    signal_table = utils.parse_table(signal_path, "\t")

    # figure out columns for group1 and group2
    header = signal_table[0]
    group1_columns = [header.index(name) for name in group1_list]
    group2_columns = [header.index(name) for name in group2_list]

    # Per-row group means, computed once and reused for both the medians and
    # the filtering step below.
    group_means = [
        (
            numpy.mean([float(line[col]) for col in group1_columns]),
            numpy.mean([float(line[col]) for col in group2_columns]),
        )
        for line in signal_table[1:]
    ]

    group1_median = numpy.median([means[0] for means in group_means])
    group2_median = numpy.median([means[1] for means in group_means])

    print("group1 median signal")
    print(group1_median)
    print("group2 median signal")
    print(group2_median)

    # now that we have the median, we can take edges where at least 1 edge is above the median
    # and both are above zero and generate a new table w/ the fold change
    signal_filtered_path = signal_path.replace(".txt", "_filtered.txt")
    if utils.check_output(signal_filtered_path, 0, 0):
        print("Found filtered signal table for edges at {}".format(
            signal_filtered_path))
        signal_table_filtered = utils.parse_table(signal_filtered_path, "\t")
    else:
        signal_table_filtered = [
            header + ["GROUP1_MEAN", "GROUP2_MEAN", "GROUP1_vs_GROUP2_LOG2"]
        ]
        for line, (group1_signal, group2_signal) in zip(signal_table[1:],
                                                        group_means):
            above_median = (group1_signal > group1_median
                            or group2_signal > group2_median)
            # Both means must be positive so the log2 ratio is defined.
            if above_median and min(group1_signal, group2_signal) > 0:
                delta = numpy.log2(group1_signal / group2_signal)
                signal_table_filtered.append(
                    line + [group1_signal, group2_signal, delta])

        utils.unparse_table(signal_table_filtered, signal_filtered_path, "\t")

    # Group the log2 deltas by TF in a single pass. The TF name is the part of
    # the first column before the first underscore; row order within each TF
    # is preserved.
    deltas_by_tf = {}
    for line in signal_table_filtered[1:]:
        deltas_by_tf.setdefault(line[0].split("_")[0], []).append(
            float(line[-1]))

    # now get a list of all TFs in the system
    tf_list = utils.uniquify(
        [line[0].split("_")[0] for line in signal_table_filtered[1:]])
    tf_list.sort()
    print(tf_list)

    out_degree_table = [[
        "TF_NAME",
        "EDGE_COUNT",
        "DELTA_MEAN",
        "DELTA_MEDIAN",
        "DELTA_STD",
        "DELTA_SEM",
    ]]

    for tf_name in tf_list:
        print(tf_name)
        edge_vector = deltas_by_tf[tf_name]
        out_degree_table.append([
            tf_name,
            len(edge_vector),
            round(numpy.mean(edge_vector), 4),
            round(numpy.median(edge_vector), 4),
            round(numpy.std(edge_vector), 4),
            round(stats.sem(edge_vector), 4),
        ])

    # set final output
    if output:
        output_path = output
    else:
        output_path = os.path.join(
            crc_folder, "{}_EDGE_DELTA_OUT.txt".format(analysis_name))

    utils.unparse_table(out_degree_table, output_path, "\t")
    print(output_path)
    return output_path
Beispiel #12
0
def main():
    """Main run method for enhancer promoter contribution tool.

    Reads every input from the command line, then in order:

    1. loads the input regions (converting .bed input with
       ``utils.bed_to_gff``) and keeps only regions whose chromosome has a
       file in ``--chrom-path``;
    2. splits the regions with ``split_regions`` and maps the bams (and the
       optional control bams) onto the split GFF with ``map_bams``;
    3. builds the average, peak and gene tables and writes each one under
       ``<output>/<analysis name>/``;
    4. for genome builds whose name contains "MM", rewrites the gene table
       with the names from ``mouse_convert_dict``;
    5. calls ``call_r_waterfall`` on the final gene table.

    Returns nothing; all results are files written to the output folder.
    """
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a space separated list of .bam files for the main factor",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or .bed file of regions to analyze",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=(
            "specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently "
            "supported"),
        required=True,
    )
    parser.add_argument(
        "-p",
        "--chrom-path",
        dest="chrom_path",
        type=str,
        help=("Provide path to a folder with a seperate fasta file for each "
              "chromosome"),
        required=True,
    )
    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=("specify a table where first column represents a list of active "
              "refseq genes"),
        required=False,
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs="*",
        help=("Enter a space separated list of .bam files for background. If "
              "flagged, will perform background subtraction"),
        required=False,
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        type=int,
        help="Define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000,
    )
    parser.add_argument(
        "-d",
        "--distal",
        dest="distal",
        type=int,
        help="Enter a window to assign distal enhancer signal. Default is 50kb",
        required=False,
        default=50000,
    )
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs="*",
        help="enter a space separated list of other bams to map to",
        required=False,
    )
    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=
        ("enter a root name for the analysis, otherwise will try to find the "
         "name from the input file"),
        required=False,
    )
    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=
        ("Run the analysis on the top N genes by total signal. Default is 5000"
         ),
        required=False,
        default=5000,
    )
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=
        ("Include a .bed of tad regions to restrict enhancer/gene association"
         ),
        required=False,
        default=None,
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        default=None,
        help=(
            "Mask a set of regions from analysis.  Provide a .bed or .gff of "
            "masking regions"),
    )

    args = parser.parse_args()

    print(args)

    # =====================================================================================
    # ===============================I. PARSING ARGUMENTS==================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===========I. DATA SUMMARY============\n#="
        "=====================================\n")

    # top analysis subset
    top = args.top

    # input genome
    genome = args.genome.upper()
    print("PERFORMING ANALYSIS ON {} GENOME BUILD".format(genome))

    # set of bams
    bam_file_list = args.bam

    # bring in the input path
    input_path = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysis_name = args.name
    else:
        analysis_name = os.path.basename(input_path).split(".")[0]

    print("USING {} AS ANALYSIS NAME".format(analysis_name))
    # setting up the output folder
    parent_folder = utils.format_folder(args.output, True)
    output_folder = utils.format_folder(
        os.path.join(parent_folder, analysis_name), True)

    print("WRITING OUTPUT TO {}".format(output_folder))

    if input_path.split(".")[-1] == "bed":
        # type is bed
        print("input in bed format, converting to gff")
        input_gff = utils.bed_to_gff(input_path)
    else:
        input_gff = utils.parse_table(input_path, "\t")

    # the tss window for proximal signal assignment
    tss_window = int(args.tss)

    # the distal window for assigning nearby enhancer signal
    distal_window = int(args.distal)

    # activity path
    if args.activity:
        activity_path = args.activity
        activity_table = utils.parse_table(activity_path, "\t")
        ref_col = 0
        if activity_table:
            # Probe an internal row for the refseq column so that a header
            # row cannot hide it; a one-line table only has row 0 to probe.
            if len(activity_table) > 1:
                probe_row = activity_table[1]
            else:
                probe_row = activity_table[0]
            for i, value in enumerate(probe_row):
                if "NM_" in str(value) or "NR_" in str(value):
                    ref_col = i  # as before, the last matching column wins

            # now check for header: look at the detected refseq column of the
            # first row (not at whatever column the search loop ended on)
            first_row = activity_table[0]
            first_value = str(
                first_row[ref_col]) if ref_col < len(first_row) else ""
            if "NM_" not in first_value and "NR_" not in first_value:
                print("REMOVING HEADER FROM GENE TABLE:")
                print(activity_table[0])
                activity_table.pop(0)

        gene_list = [line[ref_col] for line in activity_table
                     ]  # this needs to be REFSEQ NM ID
        print("IDENTIFIED {} ACTIVE GENES".format(len(gene_list)))

    else:
        gene_list = []

    # check if tads are being invoked
    if args.tads:
        print("LOADING TAD LOCATIONS FROM {}".format(args.tads))
        tads_path = args.tads
    else:
        tads_path = ""

    print("LOADING ANNOTATION DATA FOR GENOME {}".format(genome))

    genome_dir = args.chrom_path

    # making a chrom_dict that is a list of all chroms with sequence
    chrom_list = utils.uniquify(
        [name.split(".")[0] for name in os.listdir(genome_dir) if name])

    # important here to define the window
    start_dict, tss_collection, mouse_convert_dict = load_annot_file(
        genome,
        tss_window,
        gene_list,
    )

    print("FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES")

    print(chrom_list)
    # set membership keeps the filter linear in the number of regions
    good_chroms = set(chrom_list)
    filtered_gff = [line for line in input_gff if line[0] in good_chroms]

    print("{} of INITIAL {} REGIONS ARE IN GOOD CHROMOSOMES".format(
        str(len(filtered_gff)),
        str(len(input_gff)),
    ))

    # =====================================================================================
    # ================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
    # =====================================================================================

    print(
        "\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#="
        "=====================================\n")

    # now we need to split the input region
    print("SPLITTING THE INPUT GFF USING A WINDOW OF {}".format(tss_window))
    split_gff = split_regions(filtered_gff,
                              tss_collection,
                              mask_file=args.mask)
    print(len(filtered_gff))
    print(len(split_gff))

    split_gff_path = os.path.join(output_folder,
                                  "{}_SPLIT.gff".format(analysis_name))
    utils.unparse_table(split_gff, split_gff_path, "\t")
    print("WRITING TSS SPLIT GFF OUT TO {}".format(split_gff_path))

    # now you have to map the bams to the gff
    print("MAPPING TO THE SPLIT GFF")
    mapped_folder = utils.format_folder(
        os.path.join(output_folder, "bam_mapping"), True)

    signal_table = map_bams(bam_file_list, split_gff_path, analysis_name,
                            mapped_folder)
    signal_table_path = os.path.join(
        output_folder, "{}_signal_table.txt".format(analysis_name))
    utils.unparse_table(signal_table, signal_table_path, "\t")

    if args.control:
        control_bam_file_list = args.control
        control_signal_table = map_bams(
            control_bam_file_list,
            split_gff_path,
            analysis_name,
            mapped_folder,
        )
        control_signal_table_path = os.path.join(
            output_folder,
            "{}_control_signal_table.txt".format(analysis_name),
        )
        utils.unparse_table(control_signal_table, control_signal_table_path,
                            "\t")

    # now create the background subtracted summarized average table
    print("CREATING AN AVERAGE SIGNAL TABLE")
    average_table = make_average_table(
        output_folder,
        analysis_name,
        # a real flag rather than the bam list itself; True exactly when the
        # control table above was written
        use_background=bool(args.control),
    )
    average_table_path = os.path.join(
        output_folder, "{}_average_table.txt".format(analysis_name))
    utils.unparse_table(average_table, average_table_path, "\t")

    # now load up all of the cpg and other parameters to make the actual peak table

    # first check if this has already been done
    peak_table_path = os.path.join(output_folder,
                                   "{}_PEAK_TABLE.txt".format(analysis_name))
    if utils.check_output(peak_table_path, 0.1, 0.1):
        print("PEAK TABLE OUTPUT ALREADY EXISTS")
        peak_table = utils.parse_table(peak_table_path, "\t")
    else:
        # NOTE(review): param_dict is not defined anywhere in this function;
        # presumably it is a module-level global. If it is not, this branch
        # raises NameError -- confirm against the rest of the module.
        peak_table = make_peak_table(
            param_dict,
            split_gff_path,
            average_table_path,
            start_dict,
            gene_list,
            genome_dir,
            tss_window,
            distal_window,
            tads_path,
        )
        utils.unparse_table(peak_table, peak_table_path, "\t")

    gene_table = make_gene_table(peak_table, analysis_name)

    gene_table_path = os.path.join(output_folder,
                                   "{}_GENE_TABLE.txt".format(analysis_name))
    utils.unparse_table(gene_table, gene_table_path, "\t")

    # if mouse, need to convert genes over
    if genome.count("MM") == 1:
        print("CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA")
        converted_gene_table_path = os.path.join(
            output_folder,
            "{}_GENE_TABLE_CONVERTED.txt".format(analysis_name),
        )

        converted_gene_table = [gene_table[0]]
        for line in gene_table[1:]:
            # presumably mouse_convert_dict yields a falsy value for genes
            # without a homolog -- TODO confirm in load_annot_file
            converted_name = mouse_convert_dict[line[0]]
            if converted_name:
                converted_gene_table.append([converted_name] + line[1:])

        # Write once, after the loop: the file is produced even when no gene
        # converts, because its path is handed to the plotting step below.
        utils.unparse_table(converted_gene_table, converted_gene_table_path,
                            "\t")

        gene_table_path = converted_gene_table_path
        gene_table = converted_gene_table

    # =====================================================================================
    # ===================================III. PLOTTING ====================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#=="
        "====================================\n")

    # if there are fewer genes in the gene table than the top genes, only run on all
    gene_count = len(gene_table) - 1  # row 0 of the gene table is the header
    if gene_count < int(top):
        print(
            "WARNING: ONLY {} GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO "
            "RUN ANALYSIS ON TOP {}".format(str(gene_count), str(top)))
        top = 0

    # now call the R code
    print("CALLING R PLOTTING SCRIPTS")
    call_r_waterfall(gene_table_path, output_folder, analysis_name, top)
Beispiel #13
0
def map_bams(bam_file_list, split_gff_path, analysis_name, mapped_folder):
    """Map bams to a GFF and gather the mapped signal into one table.

    For every bam, ``bamliquidator_batch`` is run against ``split_gff_path``
    unless ``utils.check_output`` already finds
    ``<mapped_folder>/<analysis_name>_<bam file name>_MAPPED/matrix.txt``.

    Args:
        bam_file_list: Paths of the bam files to map; must not be empty.
        split_gff_path: Path of the GFF of regions to map against.
        analysis_name: Prefix used for the per-bam output folder names.
        mapped_folder: Folder that holds the per-bam output folders.

    Returns:
        With a single bam, the parsed ``matrix.txt`` table as is. With several
        bams, a table whose first row is
        ``["REGION_ID", "locusLine", <bam file names...>]`` and whose other
        rows are the first bam's matrix rows with column index 2 of every
        further bam's matrix appended (assumes every matrix lists the same
        regions in the same order -- only the row count is verified).

    Exits the process with status 1 if the bam list is empty, a mapping fails
    to produce its output, or the per-bam matrices differ in row count.
    """

    def mapped_dir(bam_file):
        # single place that derives the per-bam output folder, so mapping
        # and table building can never disagree on the name
        return os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(analysis_name, os.path.basename(bam_file)),
        )

    def load_matrix(bam_file):
        return utils.parse_table(
            os.path.join(mapped_dir(bam_file), "matrix.txt"), "\t")

    if not bam_file_list:
        print("ERROR: NO BAM FILES PROVIDED FOR MAPPING")
        sys.exit(1)

    print("MAPPING TO THE FOLLOWING BAMS:")

    for bam_file in bam_file_list:
        print(bam_file)
        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out_folder = mapped_dir(bam_file)
        mapped_out_file = os.path.join(mapped_out_folder, "matrix.txt")
        if utils.check_output(mapped_out_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                split_gff_path, mapped_out_file))
        else:
            # NOTE(review): the command is assembled as a shell string, so
            # paths containing spaces or shell metacharacters will break it.
            cmd = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                split_gff_path,
                mapped_out_folder,
                bam_file,
            )
            print(cmd)

            os.system(cmd)
            if utils.check_output(mapped_out_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    split_gff_path,
                    bam_file_name,
                ))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    split_gff_path,
                    bam_file_name,
                ))
                # non-zero status so calling shells/pipelines see the failure
                sys.exit(1)

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")

    # now we make a signal table
    if len(bam_file_list) > 1:
        # set up the table using the first bam: our own header followed by
        # the first matrix's data rows (its own header row is dropped)
        header = ["REGION_ID", "locusLine"
                  ] + [os.path.basename(name) for name in bam_file_list]
        first_table = load_matrix(bam_file_list[0])
        signal_table = [header] + [list(row) for row in first_table[1:]]

        for bam_file in bam_file_list[1:]:
            mapped_table = load_matrix(bam_file)
            if len(mapped_table) != len(signal_table):
                print(
                    "ERROR: MAPPED TABLE FOR BAM {} HAS {} ROWS, EXPECTED {}".
                    format(
                        os.path.basename(bam_file),
                        len(mapped_table),
                        len(signal_table),
                    ))
                sys.exit(1)

            # row i of every matrix lines up with row i of signal_table now
            # that both keep a header at index 0
            for i in range(1, len(mapped_table)):
                signal_table[i].append(mapped_table[i][2])
    else:
        signal_table = load_matrix(bam_file_list[0])

    return signal_table
def main():
    """Main run function for the dynamic enhancer analysis.

    Reads two ROSE output folders and two dataset names from the command
    line, then:

    1. merges the chosen enhancer calls of both datasets via
       ``call_merge_supers``;
    2. runs the delta R script and ``call_rose_gene_mapper`` on the merged
       regions;
    3. ranks the merged enhancers with ``assign_enhancer_rank``, runs the
       rank-plot R script and finishes with ``finish_rank_output``.

    Invalid ``--rose``/``--names`` values end the run through
    ``parser.error``; missing intermediate files print an error and exit
    with status 1. Returns nothing.
    """
    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project",
    )
    parser.add_argument(
        "-d",
        "--data",
        dest="data",
        required=True,
        help="Enter the data file for the project",
    )
    parser.add_argument(
        "-r",
        "--rose",
        dest="rose",
        required=True,
        help="Enter a comma separated list of rose folder",
    )
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        required=True,
        help="Enter the output folder for the project",
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        required=True,
        help="Enter a comma separated list of names to go with the datasets",
    )

    # additional args
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        action="store_true",
        default=False,
        help="If flagged, will plot differential regions",
    )
    parser.add_argument(
        "-a",
        "--all",
        dest="all",
        action="store_true",
        default=False,
        help=
        "If flagged, will run analysis for all enhancers and not just supers.",
    )
    parser.add_argument(
        "-m",
        "--median",
        dest="median",
        action="store_true",
        default=False,
        help="If flagged, will use median enhancer scaling",
    )
    parser.add_argument(
        "-e",
        "--enhancer-type",
        dest="enhancer_type",
        default="super",
        help=
        "specify type of enhancer to analyze: super, stretch, superStretch",
    )

    args = parser.parse_args()

    print(args)

    genome = args.genome.upper()
    data_file = args.data

    # the analysis compares exactly two datasets; reject anything else with
    # a usage message instead of a bare unpacking ValueError
    rose_folders = args.rose.split(",")
    if len(rose_folders) != 2:
        parser.error("--rose expects exactly two comma separated folders")
    rose_folder1, rose_folder2 = rose_folders

    names = args.names.split(",")
    if len(names) != 2:
        parser.error("--names expects exactly two comma separated names")
    name1, name2 = names

    parent_folder = utils.format_folder(args.output, True)

    merge_name = "{}_{}_merged".format(name1, name2)

    # option for median scaling
    median_scale = args.median

    plot_bam = args.plot
    super_only = not args.all

    if super_only and plot_bam:
        print(
            "Running dynamic enhancer analysis on all super enhancers in {} and {} and plotting "
            "output to {}".format(name1, name2, parent_folder))
    elif super_only:
        print(
            "Running dynamic enhancer analysis on all super enhancers in {} and {} and writing "
            "output to {}".format(name1, name2, parent_folder))
    elif plot_bam:
        print(
            "Running dynamic enhancer analysis on all enhancers in {} and {} and plotting output "
            "to {}. WARNING: Plotting all differential enhancers could take a while"
            .format(name1, name2, parent_folder))
    else:
        print(
            "Running dynamic enhancer analysis on all enhancers in {} and {} and writing output "
            "to {}.".format(name1, name2, parent_folder))

    # part 1
    print("PART1: analyzing ROSE output from {} and {}".format(name1, name2))
    # start with the all enhancer tables from the initial rose calls

    rose_folder1 = utils.format_folder(rose_folder1, False)
    rose_folder2 = utils.format_folder(rose_folder2, False)

    rose_dict1 = make_rose_dict(rose_folder1)
    rose_dict2 = make_rose_dict(rose_folder2)

    # choosing the type of enhancer to analyze
    enhancer_call_type = args.enhancer_type.lower()
    if super_only:
        print("ANALYZING ENHANCER TYPE: {}".format(enhancer_call_type.upper()))

    super_file1 = rose_dict1[enhancer_call_type]
    super_file2 = rose_dict2[enhancer_call_type]

    all_file1 = rose_dict1["AllEnhancer"]
    all_file2 = rose_dict2["AllEnhancer"]

    print("\tMERGING ENHANCERS AND CALLING ROSE")
    if super_only:
        if len(super_file1) == 0:
            print("ERROR: UNABLE TO FIND {} FILES IN {}".format(
                enhancer_call_type, rose_folder1))
            sys.exit(1)
        if len(super_file2) == 0:
            print("ERROR: UNABLE TO FIND {} FILES IN {}".format(
                enhancer_call_type, rose_folder2))
            sys.exit(1)
        merge_file1, merge_file2 = super_file1, super_file2
    else:
        merge_file1, merge_file2 = all_file1, all_file2

    # only the pair of input tables differs between the two modes
    call_merge_supers(
        data_file,
        merge_file1,
        merge_file2,
        name1,
        name2,
        merge_name,
        genome,
        parent_folder,
    )

    print("\tCALCULATING ENHANCER DELTA AND MAKING PLOTS")

    # part2 is the R script
    merged_gff_file = os.path.join(
        parent_folder,
        "{}_{}_MERGED_REGIONS_-0_+0.gff".format(genome, merge_name))
    rcmd = call_delta_r_script(
        merged_gff_file,
        parent_folder,
        data_file,
        name1,
        name2,
        all_file1,
        all_file2,
        median_scale,
    )
    print(rcmd)
    os.system(rcmd)

    # NOTE(review): fixed 30 s pause kept from the original; presumably it
    # gives the R script's output files time to appear -- confirm.
    time.sleep(30)
    call_rose_gene_mapper(merged_gff_file, genome, parent_folder, name1)

    # rank the genes

    # part 3
    # rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print("\tASSIGNING SUPER RANK TO MERGED ENHANCERS")

    gff_name = "{}_{}_MERGED_REGIONS_-0_+0".format(genome, merge_name)
    enhancer_to_gene_file = os.path.join(
        parent_folder,
        "{}_ROSE".format(name1),
        "{}_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt".format(
            gff_name),
    )
    if not utils.check_output(enhancer_to_gene_file):
        print("ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN")
        # non-zero status so calling shells/pipelines see the failure
        sys.exit(1)

    rank_output = os.path.join(
        parent_folder,
        "{}_ROSE".format(name1),
        "{}_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt".
        format(gff_name),
    )
    assign_enhancer_rank(enhancer_to_gene_file, all_file1, all_file2, name1,
                         name2, rank_output)

    # make the rank plot
    print("MAKING RANK PLOTS")
    if not utils.check_output(rank_output):
        print("ERROR: RANK PLOT SCRIPT FAILED TO RUN")
        sys.exit(1)

    rcmd = call_rank_r_script(rank_output, name1, name2, super_file1,
                              super_file2)
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)

    print("FINISHING OUTPUT")
    finish_rank_output(
        data_file,
        rank_output,
        genome,
        parent_folder,
        merge_name,
        name1,
        name2,
        1,
        100000,
        super_only,
        plot_bam,
    )