def call_merge_supers(data_file, super_file1, super_file2, name1, name2, merge_name, genome,
                      parent_folder):
    """Call ROSE2 on merged super enhancers.

    If the expected ENHANCER_TO_GENE table can already be parsed it is reused.
    Otherwise the two super-enhancer files are merged, the ROSE2 bash script is
    generated and executed, and the output table is looked up again.

    Args:
        data_file: Data table, forwarded to ``call_rose_merged``.
        super_file1: First super-enhancer file, forwarded to ``merge_collections``.
        super_file2: Second super-enhancer file, forwarded to ``merge_collections``.
        name1: Name of the first dataset; also names the ``<name1>_ROSE`` folder.
        name2: Name of the second dataset.
        merge_name: Label used in the merged GFF and ROSE output file names.
        genome: Genome build; upper-cased when building file names.
        parent_folder: Folder holding the merged GFF and the ROSE output folder.
            NOTE(review): the merged GFF path is built by plain string
            formatting, so this presumably has to end with a path separator --
            confirm against callers.

    Returns:
        The path of the ENHANCER_TO_GENE table when it exists, otherwise the
        value returned by ``pipeline_utils.get_file`` for the ROSE folder
        (presumably the matching file path -- confirm). The process exits when
        the ROSE folder contains no visible files.
    """
    merged_gff_file = "%s%s_%s_MERGED_REGIONS_-0_+0.gff" % (
        parent_folder,
        genome.upper(),
        merge_name,
    )

    rose_folder = os.path.join(parent_folder, "{}_ROSE".format(name1))
    rose_output = os.path.join(
        rose_folder,
        "{}_{}_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt".format(
            genome.upper(), merge_name
        ),
    )

    # check to make sure this hasn't been done yet: parsing doubles as an existence test
    try:
        utils.parse_table(rose_output, "\t")
    except (FileNotFoundError, IOError):
        print("MERGING ENHANCER REGIONS FROM {} and {}".format(super_file1, super_file2))
    else:
        print("ROSE OUTPUT ALREADY FOUND HERE {}".format(rose_output))
        return rose_output

    merged_gff = merge_collections(super_file1, super_file2, name1, name2, merged_gff_file)

    # call rose on the merged collection
    rose_bash_file = call_rose_merged(data_file, merged_gff, name1, name2, parent_folder)
    print(rose_bash_file)

    # run the bash command
    os.system("bash {}".format(rose_bash_file))

    # check for and return output
    if utils.check_output(rose_output, 1, 10):
        return rose_output

    # try finding it w/ a different name
    rose_file_list = [x for x in os.listdir(rose_folder) if x[0] != "."]  # no hidden files
    if not rose_file_list:
        print("No files found in {}".format(rose_folder))
        sys.exit()

    # Previously the lookup result was discarded and the caller received None.
    return pipeline_utils.get_file(
        "_SuperEnhancers_ENHANCER_TO_GENE.txt", rose_file_list, rose_folder
    )
def make_bed_collection(bed_file_list):
    """Build a single locus collection out of several BED files.

    Each locus gets, as its ID, the base name of the BED file it came from
    (the text before the first "."). Rows are skipped when they have fewer
    than three columns, when the first column does not start with "chr", or
    when a ``ValueError`` is raised while converting the row into a locus.

    Args:
        bed_file_list: Iterable of BED file paths.

    Returns:
        A ``utils.LocusCollection`` built from all accepted rows with a window
        argument of 50.
    """
    print("MAKING BED COLLECTION FOR:")
    collected = []
    for bed_path in bed_file_list:
        source_name = os.path.basename(bed_path).split(".")[0]
        print(source_name)
        for row in utils.parse_table(bed_path, "\t"):
            if len(row) < 3 or row[0][0:3] != "chr":
                continue
            try:
                first, second = int(row[1]), int(row[2])
                collected.append(
                    utils.Locus(row[0], min(first, second), max(first, second), ".", source_name)
                )
            except ValueError:
                # best effort: rows without usable integer coordinates are dropped
                continue

    print("IDENTIFIED {} BED REGIONS".format(str(len(collected))))
    return utils.LocusCollection(collected, 50)
def collapse_region_map(region_map_file, name="", control_bams=False):
    """Collapse the signal columns of a region map into one averaged column.

    The first six columns of every row are kept. In the header row the signal
    columns are replaced by ``name`` (``"MERGED_SIGNAL"`` when ``name`` is
    empty). In data rows they are replaced by a mean:

    * without controls, the mean of all signal columns;
    * with controls, the signal columns are split in half, the second half is
      subtracted pairwise from the first half, negative differences are
      clamped to 0, and the mean of those differences is used.

    Args:
        region_map_file: Path of the tab-delimited region map.
        name: Header label for the collapsed column.
        control_bams: Whether the second half of the signal columns are controls.

    Returns:
        The output path, which is ``region_map_file`` with every occurrence of
        "REGION" replaced by "META".
    """

    def _meta_signal(values):
        # plain average unless the second half of the columns are controls
        if not control_bams:
            return numpy.mean(values)
        half = len(values) // 2
        # min signal is 0
        corrected = [
            max(0, signal - control)
            for signal, control in zip(values[:half], values[half:])
        ]
        return numpy.mean(corrected)

    region_map = utils.parse_table(region_map_file, "\t")

    if region_map:
        # new header
        if len(name) == 0:
            name = "MERGED_SIGNAL"
        region_map[0] = region_map[0][0:6] + [name]

    for row_index in range(1, len(region_map)):
        row = region_map[row_index]
        signals = [float(value) for value in row[6:]]
        region_map[row_index] = list(row[0:6]) + [_meta_signal(signals)]

    output_file = region_map_file.replace("REGION", "META")
    utils.unparse_table(region_map, output_file, "\t")
    return output_file
def make_enhancer_signal_table(name_dict, merged_region_map, median_dict, analysis_name, genome,
                               output_folder):
    """Write a median-normalized enhancer signal table.

    Each output row is a region (first six columns copied from the region map)
    followed by one value per dataset, in sorted dataset-name order. A
    dataset's value is its signal column, minus the following control column
    when ``name_dict[name]["background"] is True``, clamped at 0 and divided
    by ``median_dict[name]``.

    Args:
        name_dict: Mapping of dataset name to a dict with a "background" flag.
        merged_region_map: Path of the tab-delimited merged region map.
        median_dict: Mapping of dataset name to the divisor for its signal.
        analysis_name: Label used in the output file name.
        genome: Genome label used in the output file name.
        output_folder: Folder the table is written to.

    Returns:
        Path of the written table. The process exits if a background dataset's
        columns are missing from a row.
    """
    region_map = utils.parse_table(merged_region_map, "\t")

    names_list = sorted(name_dict.keys())
    fixed_columns = ["REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"]
    signal_table = [fixed_columns + names_list]

    print("len of {} for names_list".format(len(names_list)))
    print(names_list)

    for row in region_map[1:]:
        out_row = row[0:6]
        # datasets occupy consecutive columns starting at 6; a dataset with
        # background uses two columns (signal, control), otherwise one
        column = 6
        for name in names_list:
            if name_dict[name]["background"] is True:
                enhancer_index, control_index = column, column + 1
                column += 2
                try:
                    signal = float(row[enhancer_index]) - float(row[control_index])
                except IndexError:
                    print(row)
                    print(len(row))
                    print(enhancer_index)
                    print(control_index)
                    sys.exit()
            else:
                signal = float(row[column])
                column += 1

            if signal < 0:
                signal = 0
            out_row.append(signal / median_dict[name])

        signal_table.append(out_row)

    output_file = os.path.join(
        output_folder, "{}_{}_signal_table.txt".format(genome, analysis_name)
    )
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO {}".format(output_file))
    utils.unparse_table(signal_table, output_file, "\t")
    return output_file
def make_signal_table(names_list, gff_file, mapped_folder, median_norm=False, output=""):
    """Build a per-locus signal table with one column per sample.

    For every name in ``names_list`` the mapped file
    ``<mapped_folder>/<gff_name>/<gff_name>_<name>.txt`` is parsed; column 1 of
    each data row is used as the locus ID and column 2 as the signal. The
    output rows follow the data rows of the last sample's mapped file.

    Args:
        names_list: Sample names, one output column each. May be empty, in
            which case only the header row is produced.
        gff_file: GFF path; its base name up to the first "." locates the
            mapped files.
        mapped_folder: Folder containing the per-GFF mapping output.
        median_norm: Divide each sample's signal by that sample's median signal.
        output: Optional path; when non-empty the table is also written there.

    Returns:
        The signal table as a list of rows, header first. The process exits
        when a mapped file cannot be found.
    """
    signal_dict = {name: defaultdict(float) for name in names_list}

    gff_name = os.path.basename(gff_file).split(".")[0]
    print(gff_name)

    # Data rows of the most recently parsed mapped table; stays empty when no
    # samples are given (this used to raise UnboundLocalError).
    reference_rows = []
    for name in names_list:
        print("MAKING SIGNAL DICT FOR %s" % (name))

        mapped_file = os.path.join(
            mapped_folder, gff_name, "{}_{}.txt".format(gff_name, name)
        )
        # NOTE(review): the original "fallback" lookup rebuilt this exact same
        # path, so it only ever acted as a second poll. That behavior is kept;
        # the alternate location it was meant to probe is not visible here.
        if utils.check_output(mapped_file, 0.02, 0.02) or utils.check_output(
            mapped_file, 0.02, 0.02
        ):
            print("FOUND MAPPED FILE FOR {} AT {}".format(name, mapped_file))
        else:
            print("ERROR NO MAPPED FILE FOUND FOR {}".format(name))
            sys.exit()

        mapped_table = utils.parse_table(mapped_file, "\t")
        data_rows = mapped_table[1:]

        if median_norm:
            median_signal = numpy.median([float(row[2]) for row in data_rows])
        else:
            median_signal = 1

        sample_signal = signal_dict[name]
        for row in data_rows:
            sample_signal[row[1]] = float(row[2]) / median_signal

        reference_rows = data_rows

    # now make the signal table
    signal_table = [["GENE_ID", "locusLine"] + names_list]
    for row in reference_rows:
        locus_id = row[1]
        signal_table.append(
            row[0:2] + [signal_dict[name][locus_id] for name in names_list]
        )

    if output:
        utils.unparse_table(signal_table, output, "\t")
    return signal_table
def format_data_table(data_file):
    """Normalize the data table to nine columns and rewrite it in place.

    The first row of the parsed file is replaced by the standard header. Data
    rows are dropped when they have fewer than three columns or repeat the
    header (first field "FILE_PATH"). Rows with an empty value in any of the
    first three columns are reported and dropped. Rows with exactly three
    columns are also dropped. Rows with four to eight columns are padded with
    empty strings up to eight columns and get "NA" as the ninth; longer rows
    are truncated to nine columns. Finally the genome column is lower-cased
    and an empty color becomes "0,0,0".

    Args:
        data_file: Path of the tab-delimited data table; it is overwritten.

    Returns:
        The reformatted table as a list of rows, header first.
    """
    print("reformatting data table")
    data_table = utils.parse_table(data_file, "\t")

    header = [
        "FILE_PATH",
        "UNIQUE_ID",
        "GENOME",
        "NAME",
        "BACKGROUND",
        "ENRICHED_REGION",
        "ENRICHED_MACS",
        "COLOR",
        "FASTQ_FILE",
    ]
    new_data_table = [header]

    for row in data_table[1:]:
        # too short, or a header line that is out of place
        if len(row) < 3 or row[0] == "FILE_PATH":
            continue

        # the first 3 columns must be filled in
        if any(len(field) == 0 for field in row[0:3]):
            print("ERROR required fields missing in line")
            print(row)
            continue

        column_count = len(row)
        if column_count >= 9:
            new_data_table.append(row[0:9])
        elif column_count > 3:
            new_data_table.append(row + (8 - column_count) * [""] + ["NA"])

    # lower case all of the genomes and default blank colors to black
    for row in new_data_table[1:]:
        row[2] = row[2].lower()
        if len(row[7]) == 0:
            row[7] = "0,0,0"

    utils.unparse_table(new_data_table, data_file, "\t")
    return new_data_table
def assign_enhancer_rank(enhancer_to_gene_file, enhancer_file1, enhancer_file2, name1, name2,
                         rank_output=""):
    """Append per-dataset enhancer ranks to an enhancer-to-gene table.

    For every data row a locus is built from columns 1-3 (ID from column 0).
    Its rank in each dataset is the smallest rank among the overlapping
    enhancers of that dataset, or the size of that dataset's collection when
    nothing overlaps (i.e. dead last).

    Args:
        enhancer_to_gene_file: Path of the tab-delimited enhancer-to-gene table.
        enhancer_file1: Enhancer table of the first dataset.
        enhancer_file2: Enhancer table of the second dataset.
        name1: Name of the first dataset; used in the new header column.
        name2: Name of the second dataset; used in the new header column.
        rank_output: Optional output path.

    Returns:
        The updated table when ``rank_output`` is empty; otherwise the table is
        written to ``rank_output`` and ``None`` is returned.
    """

    def _best_rank(collection, rank_lookup, locus):
        # lowest rank among overlapping enhancers, else last place
        hits = collection.get_overlap(locus, "both")
        if len(hits) == 0:
            return len(collection)
        return min([rank_lookup[hit.id]["rank"] for hit in hits])

    enhancer_to_gene = utils.parse_table(enhancer_to_gene_file, "\t")

    collection1 = make_se_collection(enhancer_file1, name1, False)
    collection2 = make_se_collection(enhancer_file2, name2, False)

    ranks1 = make_se_dict(enhancer_file1, name1, False)
    ranks2 = make_se_dict(enhancer_file2, name2, False)

    # the table is updated in place: two extra columns per row
    enhancer_to_gene[0] += ["{}_rank".format(name1), "{}_rank".format(name2)]
    for row in enhancer_to_gene[1:]:
        locus = utils.Locus(row[1], row[2], row[3], ".", row[0])
        rank1 = _best_rank(collection1, ranks1, locus)
        rank2 = _best_rank(collection2, ranks2, locus)
        row += [rank1, rank2]

    if len(rank_output) != 0:
        utils.unparse_table(enhancer_to_gene, rank_output, "\t")
        return None
    return enhancer_to_gene
def make_se_collection(enhancer_file, name, super_only=True):
    """Build a locus collection from an enhancer table.

    Rows whose first field starts with "#" or "R" are treated as comments or
    header and skipped. With ``super_only`` set, reading stops at the first
    row whose last column is 0.

    Args:
        enhancer_file: Path of the tab-delimited enhancer table.
        name: Prefix for locus IDs, which are ``<name>_<first column>``.
        super_only: Stop at the first row flagged 0 in the last column.

    Returns:
        A ``utils.LocusCollection`` of the accepted rows with a window
        argument of 50.
    """
    loci = []
    for row in utils.parse_table(enhancer_file, "\t"):
        if row[0][0] in ("#", "R"):
            continue
        if super_only and int(row[-1]) == 0:
            break
        locus_id = "{}_{}".format(name, row[0])
        loci.append(utils.Locus(row[1], row[2], row[3], ".", locus_id))
    return utils.LocusCollection(loci, 50)
def get_median_signal(enhancer_file, name, data_file):
    """Return the median enhancer signal of one dataset.

    The first six rows of the enhancer table are skipped. When the dataset's
    "background" entry is itself a key of the data table, column 7 is
    subtracted from column 6 before taking the median; otherwise column 6 is
    used on its own.

    Args:
        enhancer_file: Path of the tab-delimited enhancer table.
        name: Dataset name to look up in the data table.
        data_file: Data table passed to ``pipeline_utils.load_data_table``.

    Returns:
        ``numpy.median`` of the per-row signal values.
    """
    data_dict = pipeline_utils.load_data_table(data_file)
    rows = utils.parse_table(enhancer_file, "\t")[6:]

    has_background = data_dict[name]["background"] in data_dict
    if has_background:
        values = [float(row[6]) - float(row[7]) for row in rows]
    else:
        values = [float(row[6]) for row in rows]

    return numpy.median(values)
def filter_gff(gff_file, chrom_list):
    """Keep only the GFF rows whose chromosome is in ``chrom_list``.

    Membership is tested against a set built once from ``chrom_list``. The
    previous ``chrom_list.count(chrom) == 1`` test dropped every row of a
    chromosome that happened to be listed more than once, and rescanned the
    whole list for each row.

    Args:
        gff_file: Path of the tab-delimited GFF.
        chrom_list: Iterable of chromosome names to keep; duplicates are
            allowed.

    Returns:
        The list of retained GFF rows in file order. The distinct excluded
        chromosome names are printed when there are any.
    """
    gff = utils.parse_table(gff_file, "\t")
    allowed_chroms = set(chrom_list)

    filtered_gff = []
    exclude_list = []
    for line in gff:
        if line[0] in allowed_chroms:
            filtered_gff.append(line)
        else:
            exclude_list.append(line[0])

    exclude_list = utils.uniquify(exclude_list)
    if len(exclude_list) > 0:
        print("EXCLUDED GFF REGIONS FROM THE FALLING CHROMS: {}".format(
            ",".join(exclude_list)))

    return filtered_gff
def make_median_dict(name_dict):
    """Compute the median enhancer signal for every dataset.

    For each dataset the table at ``name_dict[name]["enhancer_file"]`` is
    parsed and its first six rows are skipped (assumed to be header). When
    ``name_dict[name]["background"] is True`` column 7 is subtracted from
    column 6; otherwise column 6 is used on its own.

    Args:
        name_dict: Mapping of dataset name to a dict with "enhancer_file" and
            "background" entries.

    Returns:
        Dict mapping each dataset name to the ``numpy.median`` of its values.
    """
    median_dict = {}
    for name, info in name_dict.items():
        rows = utils.parse_table(info["enhancer_file"], "\t")[6:]
        if info["background"] is True:
            values = [float(row[6]) - float(row[7]) for row in rows]
        else:
            values = [float(row[6]) for row in rows]
        median_dict[name] = numpy.median(values)
    return median_dict
def load_annot_file(genome, tss_window, gene_list=None):
    """Load the annotation for a genome build.

    Builds a start dict and a TSS locus collection for a set of refseq IDs,
    plus a conversion dict read from ``HMD_HumanPhenotype.rpt`` that maps
    column 4 of that file to column 0.

    Args:
        genome: Genome build name (case-insensitive); must be one of the
            builds in the internal table (HG18, HG19, HG19_RIBO, HG38, MM9,
            MM10, RN4, RN6).
        tss_window: Window passed as both flanks to ``utils.make_tss_locus``.
        gene_list: Optional list of gene IDs. When omitted or empty, every key
            of the start dict is used. The default is ``None`` rather than a
            shared mutable list.

    Returns:
        Tuple ``(start_dict, tss_collection, mouse_convert_dict)``.

    Raises:
        KeyError: If ``genome`` is not a known build.
    """
    if gene_list is None:
        gene_list = []

    annotation_folder = os.path.join(ROOT_DIR, "annotation")
    genome_dict = {
        "HG18": os.path.join(annotation_folder, "hg18_refseq.ucsc"),
        "MM9": os.path.join(annotation_folder, "mm9_refseq.ucsc"),
        "MM10": os.path.join(annotation_folder, "mm10_refseq.ucsc"),
        "HG19": os.path.join(annotation_folder, "hg19_refseq.ucsc"),
        "HG19_RIBO": os.path.join(annotation_folder, "hg19_refseq.ucsc"),
        "RN4": os.path.join(annotation_folder, "rn4_refseq.ucsc"),
        "RN6": os.path.join(annotation_folder, "rn6_refseq.ucsc"),
        "HG38": os.path.join(annotation_folder, "hg38_refseq.ucsc"),
    }

    # making a dictionary for mouse to human conversion
    mouse_convert_file = os.path.join(annotation_folder, "HMD_HumanPhenotype.rpt")
    mouse_convert_dict = defaultdict(str)
    for line in utils.parse_table(mouse_convert_file, "\t"):
        mouse_convert_dict[line[4]] = line[0]

    annot_file = genome_dict[genome.upper()]

    start_dict = utils.make_start_dict(annot_file, gene_list)
    if not gene_list:
        gene_list = [*start_dict]

    tss_loci = [
        utils.make_tss_locus(gene, start_dict, tss_window, tss_window)
        for gene in gene_list
    ]
    tss_collection = utils.LocusCollection(tss_loci, 50)

    return start_dict, tss_collection, mouse_convert_dict
def make_se_dict(enhancer_file, name, super_only=True):
    """Make an attribute dict for enhancers keyed by unique ID.

    Rows starting with "#" are skipped. A row starting with "R" is treated as
    the header and must contain an "isSuper" column. Every other row yields
    an entry ``{"rank": <second-to-last column as int>}`` under the key
    ``<name>_<first column>``. With ``super_only`` set, only rows whose
    "isSuper" column equals 1 are kept.

    Args:
        enhancer_file: Path of the tab-delimited enhancer table.
        name: Prefix for the enhancer IDs.
        super_only: Keep only rows flagged as super enhancers.

    Returns:
        Dict mapping enhancer ID to ``{"rank": int}``.
    """
    se_dict = {}
    for row in utils.parse_table(enhancer_file, "\t"):
        marker = row[0][0]
        if marker == "#":
            continue
        if marker == "R":
            # header row: remember where the isSuper flag lives
            sup_column = row.index("isSuper")
            continue
        if super_only and int(row[sup_column]) != 1:
            continue
        se_dict["{}_{}".format(name, row[0])] = {"rank": int(row[-2])}
    return se_dict
def make_se_collection(enhancer_file, name, top=0):
    """Build a locus collection from the first rows of a super table.

    Rows whose first field starts with "#" or "R" are skipped. Reading stops
    once ``top`` data rows have been collected; with the default ``top=0``
    the stop condition never triggers and every data row is used.

    Args:
        enhancer_file: Path of the tab-delimited enhancer table.
        name: Prefix for locus IDs, which are ``<name>_<first column>``.
        top: Number of data rows to take.

    Returns:
        A ``utils.LocusCollection`` of the collected rows with a window
        argument of 50.
    """
    super_loci = []
    for row in utils.parse_table(enhancer_file, "\t"):
        if row[0][0] in ("#", "R"):
            continue
        locus_id = "{}_{}".format(name, row[0])
        super_loci.append(utils.Locus(row[1], row[2], row[3], ".", locus_id))
        if len(super_loci) == top:
            break
    return utils.LocusCollection(super_loci, 50)
def load_data_table(data_file):
    """Load the master data table into a dict keyed by dataset name.

    ``data_file`` is parsed as a tab-delimited file when it is a string,
    otherwise it is copied into a list of rows. If any row does not have
    exactly nine columns the table is rebuilt with ``format_data_table``.

    For every data row an entry keyed by column 3 (the name) is filled with
    the folder, unique ID, genome, sam/ylf paths, enriched region, background,
    enriched MACS file, color (double quotes removed), fastq and bam path.
    The bam is searched in the folder named by column 0: exactly one visible
    file must contain the unique ID once, end in ".bam" and not contain
    "bai". The bam and its ".bai" index must both be openable, otherwise the
    bam entry is an empty string.

    Args:
        data_file: Path of the data table, or an iterable of rows.

    Returns:
        ``defaultdict(dict)`` mapping dataset name to its attribute dict.
    """

    def _can_open(path):
        # existence/readability probe used for both the bam and its index
        try:
            open(path, "r").close()
        except (IOError, FileNotFoundError):
            return False
        return True

    if isinstance(data_file, str):
        data_table = utils.parse_table(data_file, "\t")
    else:
        data_table = list(data_file)

    # first check to make sure the table is formatted correctly; a single
    # malformed row triggers a reformat of the whole table
    malformed = next((row for row in data_table if len(row) != 9), None)
    if malformed is not None:
        print("this line did not pass")
        print(malformed)
        data_table = format_data_table(data_file)

    data_dict = defaultdict(dict)
    for row in data_table[1:]:
        folder, unique_id, genome, name = row[0], row[1], row[2], row[3]

        entry = data_dict[name]
        entry["folder"] = utils.format_folder(folder, False)
        entry["uniqueID"] = unique_id
        entry["genome"] = genome.upper()
        entry["sam"] = "".join([folder, unique_id, ".", genome, ".bwt.sam"])
        entry["ylf"] = "".join([folder, unique_id, ".", genome, ".bwt.ylf"])
        entry["enriched"] = row[5]
        entry["background"] = row[4]
        entry["enrichedMacs"] = row[6]
        entry["color"] = row[7].replace('"', "")
        entry["fastq"] = row[8]

        # look in the bam folder for all bams that might fit the bill
        bam_folder = str(folder)
        visible_files = [
            entry_name
            for entry_name in os.listdir(bam_folder)
            if len(entry_name) > 0 and entry_name[0] != "."
        ]
        candidates = [
            entry_name
            for entry_name in visible_files
            if entry_name.count(unique_id) == 1
            and entry_name.split(".")[-1] == "bam"
            and entry_name.count("bai") == 0
        ]

        full_bam_path = ""
        if not candidates:
            print(
                "UNABLE TO FIND A BAM FILE IN {} WITH UNIQUE ID {}".format(
                    bam_folder, unique_id
                )
            )
        elif len(candidates) > 1:
            print(
                "MUTLIPLE BAM FILES IN {} WITH UNIQUE ID {}. NO BAM ASISGNED".format(
                    bam_folder, unique_id
                )
            )
            print(candidates)
        else:
            full_bam_path = os.path.abspath(os.path.join(bam_folder, candidates[0]))
            full_bai_path = full_bam_path + ".bai"

        if full_bam_path:
            if not _can_open(full_bam_path):
                print("ERROR: BAM FILE {} DOES NOT EXIST".format(full_bam_path))
                full_bam_path = ""
            if not _can_open(full_bai_path):
                print(
                    "ERROR: BAM FILE {} DOES NOT HAVE BAI INDEX".format(full_bam_path)
                )
                full_bam_path = ""

        entry["bam"] = full_bam_path

    return data_dict
def main():
    """Main run call.

    Parses the command line, converts/copies the input regions into the gff
    folder, filters them by the chromosomes found in the bams, optionally
    masks them, stitches them, maps every bam to the stitched GFF with
    ``bamliquidator_batch``, builds the region map, calls the
    ``ROSE2_callSuper.R`` script and finally runs ``ROSE2_geneMapper`` on the
    super, stretch and super-stretch enhancer tables.

    Exits the process when a mask file has an unsupported extension or when
    mapping a bam fails.
    """
    debug = False

    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help="Enter a .gff or .bed file of binding sites used to make enhancers",
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="bam_file to rank enhancer by",
    )
    parser.add_argument(
        "-o", "--out", dest="out", required=True, help="Enter an output folder"
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-b",
        "--bams",
        dest="bams",
        required=False,
        help="Enter a comma separated list of additional bam files to map to",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        # the help text used to be a copy of the --rankby description
        help="bam_file to use as control",
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=(
            "Enter a max linking distance for stitching. Default will determine optimal stitching"
            " parameter"
        ),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mapped_gff"), True)

    # GETTING INPUT FILE
    input_extension = args.input.split(".")[-1]
    if input_extension == "bed":
        # CONVERTING A BED TO GFF
        input_gff_name = args.input.split("/")[-1][0:-4]
        input_gff_file = os.path.join(gff_folder, "{}.gff".format(input_gff_name))
        utils.bed_to_gff(args.input, input_gff_file)
    else:
        if input_extension != "gff":
            print(
                "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
            )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        input_gff_file = args.input
        copyfile(
            input_gff_file, os.path.join(gff_folder, os.path.basename(input_gff_file))
        )

    # GETTING THE LIST OF bam_fileS TO PROCESS
    # duplicates are kept on purpose here: the same control bam may be listed
    # repeatedly and map_collection receives the full list
    if args.control:
        bam_file_list = [args.rankby, args.control]
    else:
        bam_file_list = [args.rankby]

    if args.bams:
        bam_file_list += args.bams.split(",")

    # optional args

    # Stitch parameter: an empty string is handed to region_stitching, which
    # returns the stitch window that is actually used
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    remove_tss = tss_window != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print("USING {} AS THE INPUT GFF".format(input_gff_file))
    input_name = os.path.basename(input_gff_file).split(".")[0]

    # GETTING THE GENOME
    genome = args.genome
    print("USING {} AS THE GENOME".format(genome))
    annot_file = rose2_utils.genome_dict[genome.upper()]

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)
    print("STARTING WITH {} INPUT REGIONS".format(len(reference_collection)))
    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        print("USING MASK FILE {}".format(mask_file))

        mask_extension = mask_file.split(".")[-1].upper()
        if mask_extension == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_extension == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            # Stop here: continuing used to crash with a NameError on mask_gff.
            # A non-zero status matches the exit status of that crash.
            sys.exit(1)

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(str(len(mask_collection))))

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus
            for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print(
            "FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
                str(len(reference_loci) - len(filtered_loci)), mask_file
            )
        )
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)
    # making sure start/stop ordering are correct
    for line in stitched_gff:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitch_window)
    print(type(stitch_window))

    # the three output names only differ by the TSS suffix and the extension
    tss_suffix = "_TSS_DISTAL" if remove_tss else ""
    stitched_gff_name = "{}_{}KB_STITCHED{}".format(
        input_name, str(stitch_window // 1000), tss_suffix
    )
    stitched_gff_file = os.path.join(gff_folder, "{}.gff".format(stitched_gff_name))
    debug_out_file = os.path.join(gff_folder, "{}.debug".format(stitched_gff_name))

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name)
    )
    print("OUTPUT WILL BE WRITTEN TO {}".format(output_file1))

    # MAPPING TO THE STITCHED GFF
    # prevent redundant mapping
    bam_file_list_unique = utils.uniquify(list(bam_file_list))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:
        bam_file_name = os.path.basename(bam_file)

        mapped_out1_folder = os.path.join(
            mapped_folder, "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name)
        )
        mapped_out1_file = os.path.join(mapped_out1_folder, "matrix.txt")

        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print(
                "FOUND {} MAPPING DATA FOR BAM: {}".format(
                    stitched_gff_file, mapped_out1_file
                )
            )
            continue

        cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
            stitched_gff_file,
            mapped_out1_folder,
            bam_file,
        )
        print(cmd1)
        os.system(cmd1)

        if utils.check_output(mapped_out1_file, 0.2, 5):
            print(
                "SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name
                )
            )
        else:
            print(
                "ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name
                )
            )
            sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    if args.control:
        control_name = os.path.basename(args.control)
    else:
        control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        output_file1,
        input_name,
        control_name,
    )
    print(cmd)
    os.system(cmd)

    # calling the gene mapper once per enhancer table, after a fixed pause
    time.sleep(20)
    for table_kind in ("SuperEnhancers", "StretchEnhancers", "SuperStretchEnhancers"):
        table_path = os.path.join(
            out_folder, "{}_{}.table.txt".format(input_name, table_kind)
        )
        if args.control:
            cmd = "ROSE2_geneMapper -g {} -r {} -c {} -i {}".format(
                genome, args.rankby, args.control, table_path
            )
        else:
            cmd = "ROSE2_geneMapper -g {} -r {} -i {}".format(
                genome, args.rankby, table_path
            )
        os.system(cmd)
def map_bams(bam_file_list, split_gff_path, analysis_name, mapped_folder):
    """Map bams to a GFF and assemble the combined signal table.

    Every bam is mapped with ``bamliquidator_batch`` into
    ``<mapped_folder>/<analysis_name>_<bam base name>_MAPPED`` unless its
    ``matrix.txt`` is already there. The process exits if a mapping does not
    produce its ``matrix.txt``.

    With a single bam the parsed ``matrix.txt`` is returned as is. With
    several bams the table starts with a header row
    (``REGION_ID``, ``locusLine``, one column per bam base name), followed by
    the data rows of the first bam's matrix; column 2 of every further
    matrix is then appended to the row at the same index.

    The multi-bam branch previously replaced the header with the first
    matrix's data rows, so the row indices were shifted by one and the last
    append raised ``IndexError``.

    Args:
        bam_file_list: Paths of the bams to map; must not be empty.
        split_gff_path: GFF to map against.
        analysis_name: Prefix of the per-bam output folders.
        mapped_folder: Parent folder of the per-bam output folders.

    Returns:
        The signal table as a list of rows.
    """

    def _mapped_out_folder(bam_file):
        # per-bam bamliquidator output folder
        return os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(analysis_name, os.path.basename(bam_file)),
        )

    def _load_matrix(bam_file):
        # parsed matrix.txt for one bam
        return utils.parse_table(
            os.path.join(_mapped_out_folder(bam_file), "matrix.txt"), "\t"
        )

    print("MAPPING TO THE FOLLOWING BAMS:")

    for bam_file in bam_file_list:
        print(bam_file)
        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out_folder = _mapped_out_folder(bam_file)
        mapped_out_file = os.path.join(mapped_out_folder, "matrix.txt")
        if utils.check_output(mapped_out_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                split_gff_path, mapped_out_file))
            continue

        cmd = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
            split_gff_path,
            mapped_out_folder,
            bam_file,
        )
        print(cmd)
        os.system(cmd)
        if utils.check_output(mapped_out_file, 0.2, 5):
            print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                split_gff_path,
                bam_file_name,
            ))
        else:
            print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                split_gff_path,
                bam_file_name,
            ))
            sys.exit()

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")

    # now we make a signal table
    if len(bam_file_list) <= 1:
        return _load_matrix(bam_file_list[0])

    # header first, then the first bam's data rows, so that signal_table[i]
    # lines up with mapped_table[i] of every other bam
    signal_table = [
        ["REGION_ID", "locusLine"]
        + [os.path.basename(bam_file) for bam_file in bam_file_list],
    ]
    signal_table += _load_matrix(bam_file_list[0])[1:]

    for bam_file in bam_file_list[1:]:
        mapped_table = _load_matrix(bam_file)
        for i in range(1, len(mapped_table)):
            signal_table[i].append(mapped_table[i][2])

    return signal_table
def finish_rank_output(
    data_file,
    rank_output,
    genome,
    merge_folder,
    merge_name,
    name1,
    name2,
    cut_off=1.5,
    window=100000,
    super_only=True,
    plot_bam=True,
):
    """Finish rank output.

    Cleans up the rank table and writes, under ``<merge_folder>/output``:

    * the formatted rank table (header columns -4 and -3 renamed to
      "DELTA RANK" and "IS_SUPER", "_lociStitched" removed from IDs);
    * GFFs of gained and lost regions, plain and extended by ``window`` on
      both sides (in a ``gff`` subfolder);
    * one BED holding gained, conserved and lost regions, each section
      introduced by its own track line;
    * a gene table with one row per (gene, region) pair and its status;
    * copies of the first ``*.pdf`` and ``*RANK_PLOT.png`` found in
      ``<merge_folder>/<name1>_ROSE``.

    A region is "gained" (status ``name2``) when column 6 exceeds ``cut_off``,
    "lost" (status ``name1``) when it is below ``-cut_off`` and "CONSERVED"
    otherwise. NOTE(review): column 6 is presumably the fold change between
    the two datasets -- confirm against the producer of the rank table.

    When ``plot_bam`` is set, ``bamPlot_turbo`` is run on the gained and lost
    GFFs (plain and windowed) that are non-empty.

    Args:
        data_file: Data table passed to ``pipeline_utils.load_data_table``.
        rank_output: Path of the tab-delimited rank table.
        genome: Genome build; upper-cased for file names and track lines.
        merge_folder: Folder holding the ROSE output and the ``output`` folder.
        merge_name: Label used in the output file names.
        name1: Name of the first dataset.
        name2: Name of the second dataset.
        cut_off: Threshold on column 6; converted with ``float``.
        window: Flank in bp for the windowed GFFs; converted with ``int``.
        super_only: Selects "SUPERS"/"SEs" versus "ENHANCERS" wording.
        plot_bam: Whether to run the ``bamPlot_turbo`` commands.

    Returns:
        None.

    Raises:
        IndexError: If no PDF or rank-plot PNG exists in the ROSE folder.
    """
    data_dict = pipeline_utils.load_data_table(data_file)

    # making sure window and cut_off are int/float
    cut_off = float(cut_off)
    window = int(window)
    window_kb = str(window // 1000)
    genome = genome.upper()

    # make the output folder
    output_folder = utils.format_folder(os.path.join(merge_folder, "output"), True)

    # bring in the old rank table
    rank_enhancer_table = utils.parse_table(rank_output, "\t")

    # make a new formatted table
    header = rank_enhancer_table[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formatted_rank_table = [header]

    # the gffs
    gained_gff = []
    lost_gff = []
    gained_window_gff = []
    lost_window_gff = []

    # wording that differs between the super-only and all-enhancer runs
    if super_only:
        enhancer_type = "SUPERS"
        track_label = "SEs"
        track_description = "super enhancers"
    else:
        enhancer_type = "ENHANCERS"
        track_label = "enhancers"
        track_description = "enhancers"

    # the beds, each starting with its own track line
    gained_bed = [[
        'track name="{} {} only {}" description="{} {} that are found only in '
        '{} vs {}" itemRGB=On color=255,0,0'.format(
            genome, name2, track_label, genome, track_description, name2, name1)
    ]]
    conserved_bed = [[
        'track name="{} {} and {} {}" description="{} {} that are found in both'
        ' {} vs {}" itemRGB=On color=0,0,0'.format(
            genome, name1, name2, track_label, genome, track_description, name1, name2)
    ]]
    lost_bed = [[
        'track name="{} {} only {}" description="{} {} that are found only in '
        '{} vs {}" itemRGB=On color=0,255,0'.format(
            genome, name1, track_label, genome, track_description, name1, name2)
    ]]

    # the genes
    gene_table = [[
        "GENE",
        "ENHANCER_ID",
        "ENHANCER_CHROM",
        "ENHANCER_START",
        "ENHANCER_STOP",
        header[6],
        header[7],
        header[8],
        "STATUS",
    ]]

    for line in rank_enhancer_table[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formatted_rank_table.append(line)

        # getting the genes from the three gene columns
        gene_list = []
        for gene_column in (9, 10, 11):
            gene_list += line[gene_column].split(",")
        gene_list = [x for x in gene_list if len(x) > 0]
        gene_list = utils.uniquify(gene_list)
        gene_string = ",".join(gene_list)

        bed_line = [line[1], line[2], line[3], line[0], line[-4]]

        delta = float(line[6])
        if delta > cut_off:
            gene_status = name2
            target_gff, target_window_gff, target_bed = (
                gained_gff, gained_window_gff, gained_bed)
        elif delta < (-1 * cut_off):
            gene_status = name1
            target_gff, target_window_gff, target_bed = (
                lost_gff, lost_window_gff, lost_bed)
        else:
            gene_status = "CONSERVED"
            target_gff, target_window_gff, target_bed = None, None, conserved_bed

        # only gained/lost regions get GFF lines
        if target_gff is not None:
            target_gff.append([
                line[1], line[0], "", line[2], line[3], "", ".", "", gene_string,
            ])
            target_window_gff.append([
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                gene_string,
            ])
        target_bed.append(bed_line)

        # now fill in the gene Table
        for gene in gene_list:
            gene_table.append([
                gene,
                line[0],
                line[1],
                line[2],
                line[3],
                line[6],
                line[7],
                line[8],
                gene_status,
            ])

    # concat the bed
    full_bed = gained_bed + conserved_bed + lost_bed

    # start writing the output
    # there's the two gffs, the bed, the formatted table, the gene table

    # formatted table
    formatted_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_RANK_TABLE.txt".format(genome, merge_name, enhancer_type),
    )
    utils.unparse_table(formatted_rank_table, formatted_filename, "\t")

    # gffs
    # Joined rather than concatenated so the path stays correct whether or not
    # output_folder ends with a separator.
    gff_folder = utils.format_folder(os.path.join(output_folder, "gff"), True)
    gff_filename_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name2.upper(),
                                            enhancer_type),
    )
    gff_filename_window_gained = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome, merge_name, name2.upper(), enhancer_type, window_kb, window_kb),
    )
    gff_filename_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-0_+0.gff".format(genome, merge_name, name1.upper(),
                                            enhancer_type),
    )
    gff_filename_window_lost = os.path.join(
        gff_folder,
        "{}_{}_{}_ONLY_{}_-{}KB_+{}KB.gff".format(
            genome, merge_name, name1.upper(), enhancer_type, window_kb, window_kb),
    )

    utils.unparse_table(gained_gff, gff_filename_gained, "\t")
    utils.unparse_table(gained_window_gff, gff_filename_window_gained, "\t")
    utils.unparse_table(lost_gff, gff_filename_lost, "\t")
    utils.unparse_table(lost_window_gff, gff_filename_window_lost, "\t")

    # bed
    bed_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}.bed".format(genome, merge_name, enhancer_type))
    utils.unparse_table(full_bed, bed_filename, "\t")

    # gene_table
    gene_filename = os.path.join(
        output_folder,
        "{}_{}_MERGED_{}_GENE_TABLE.txt".format(genome, merge_name, enhancer_type),
    )
    utils.unparse_table(gene_table, gene_filename, "\t")

    # finally, move all of the plots to the output folder
    rose_folder = os.path.join(merge_folder, "{}_ROSE".format(name1))
    copyfile(
        glob.glob(os.path.join(rose_folder, "*.pdf"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_DELTA.pdf".format(genome, merge_name, enhancer_type),
        ),
    )
    copyfile(
        glob.glob(os.path.join(rose_folder, "*RANK_PLOT.png"))[0],
        os.path.join(
            output_folder,
            "{}_{}_MERGED_{}_RANK_PLOT.png".format(genome, merge_name, enhancer_type),
        ),
    )

    # now execute the bamPlot_turbo commands
    if plot_bam:
        bam1 = data_dict[name1]["bam"]
        bam2 = data_dict[name2]["bam"]
        bam_string = "{} {}".format(bam1, bam2)
        name_string = "{} {}".format(name1, name2)
        color_string = "0,0,0:100,100,100"

        # (gff to plot, plot title) in execution order
        plot_jobs = []
        if len(gained_gff) > 0:
            plot_jobs.append((gff_filename_gained, "{}_ONLY_SE".format(name2)))
            plot_jobs.append((
                gff_filename_window_gained,
                "{}_ONLY_SE_{}KB_WINDOW".format(name2, window_kb),
            ))
        if len(lost_gff) > 0:
            plot_jobs.append((gff_filename_lost, "{}_ONLY_SE".format(name1)))
            plot_jobs.append((
                gff_filename_window_lost,
                "{}_ONLY_SE_{}KB_WINDOW".format(name1, window_kb),
            ))

        for gff_path, plot_title in plot_jobs:
            cmd = (
                "bamPlot_turbo -g {} -b {} -i {} -o {} -n {} -c {} -t {} -r -y UNIFORM -p "
                "MULTIPLE".format(
                    genome,
                    bam_string,
                    gff_path,
                    output_folder,
                    name_string,
                    color_string,
                    plot_title,
                ))
            os.system(cmd)

    return
def make_average_table(output_folder, analysis_name, use_background=False):
    """Build a per-region table of average (background-subtracted) signal.

    Reads ``<analysis_name>_signal_table.txt`` from ``output_folder`` and, when
    ``use_background`` is truthy, ``<analysis_name>_control_signal_table.txt``
    from the same folder. With background, each control value is subtracted
    from the signal value in the same column and negative differences are
    set to 0 before averaging, i.e. no negative regions are allowed.

    Args:
        output_folder: Folder containing the signal table(s).
        analysis_name: Root name used to build the table file names and the
            name of the output signal column.
        use_background: Any truthy value enables control subtraction.

    Returns:
        A list of rows. The first row is the header
        ``["GENE_ID", "locusLine", "<analysis_name>_signal"]``; every other
        row holds the first two columns of the matching signal-table row
        followed by the mean signal rounded to 4 decimals.

    Raises:
        SystemExit: With status 1 when the control table does not have the
            same number of columns or rows as the signal table, or when the
            first two columns of a row differ between the two tables.
    """
    signal_table_path = os.path.join(
        output_folder,
        "{}_signal_table.txt".format(analysis_name),
    )
    signal_table = utils.parse_table(signal_table_path, "\t")

    average_table = [[
        "GENE_ID", "locusLine", "{}_signal".format(analysis_name)
    ]]

    # first the easy case with no background
    if not use_background:
        for line in signal_table[1:]:
            new_line = line[0:2]
            avg_signal = round(numpy.mean([float(x) for x in line[2:]]), 4)
            new_line.append(avg_signal)
            average_table.append(new_line)
        return average_table

    # now the condition w/ background
    control_table_path = os.path.join(
        output_folder,
        "{}_control_signal_table.txt".format(analysis_name),
    )
    control_table = utils.parse_table(control_table_path, "\t")

    # checking to make sure the # of backgrounds = number of signal bams
    # otherwise stop with a failing exit status
    if len(signal_table[0]) != len(control_table[0]):
        print("ERROR: MUST PROVIDE SAME NUMBER OF CONTROL BAMS")
        sys.exit(1)

    if len(signal_table) != len(control_table):
        print("ERROR: MAPPED FILES ARE NOT THE SAME LENGTH")
        sys.exit(1)

    for i in range(1, len(signal_table)):
        signal_line = signal_table[i]
        control_line = control_table[i]
        # both tables must describe the same region on the same row
        if signal_line[0:2] != control_line[0:2]:
            print("ERROR: REGIONS ON LINE {} DO NOT CORRESPOND".format(
                str(i)))
            sys.exit(1)

        new_line = signal_line[0:2]
        signal_values = [float(x) for x in signal_line[2:]]
        control_values = [float(x) for x in control_line[2:]]

        # subtract column by column and make negative numbers 0
        subtracted_values = [
            max(0, signal_value - control_values[col])
            for col, signal_value in enumerate(signal_values)
        ]
        avg_signal = round(numpy.mean(subtracted_values), 4)
        new_line.append(avg_signal)
        average_table.append(new_line)

    return average_table
def map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output,
        ref_name,
):
    """Write a table of factor density for every stitched locus.

    One row is produced per stitched locus that is not on chrY, ordered by
    decreasing locus length. For each bam in ``bam_file_list`` a column named
    after the bam's basename is appended; its value is the summed signal of
    all mapped regions that overlap the locus, read from
    ``<mapped_folder>/<ref_name>_<bam basename>_MAPPED/matrix.txt``.

    Args:
        stitched_collection: Collection whose ``get_loci()`` yields the
            stitched loci to tabulate.
        reference_collection: Collection queried with ``get_overlap`` to sum
            the size of the constituent regions inside each stitched locus.
        bam_file_list: Paths of the bam files whose mapped matrices are read.
        mapped_folder: Folder holding the per-bam mapping output folders.
        output: Path the finished table is written to (tab separated).
        ref_name: Name prefix of the per-bam mapping output folders.

    Returns:
        None. The table is written to ``output``.
    """
    print("FORMATTING TABLE")
    # strip out any loci that are in chrY. Built as a filtered copy: removing
    # items from a list while iterating over it skips the element that
    # follows every removal.
    loci = [
        locus for locus in stitched_collection.get_loci()
        if locus.chr != "chrY"
    ]

    locus_table = [[
        "REGION_ID", "CHROM", "START", "STOP", "NUM_LOCI", "CONSTITUENT_SIZE"
    ]]

    loci_len_list = [locus.len() for locus in loci]
    len_order = utils.order(loci_len_list, decreasing=True)

    ticker = 0
    for i in len_order:
        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        ref_overlapping_loci = reference_collection.get_overlap(locus, "both")
        ref_enrich_size = sum(
            ref_locus.len() for ref_locus in ref_overlapping_loci)

        # the stitched id is expected to start with the number of stitched
        # loci; fall back to 1 when it does not parse as an integer
        try:
            stitch_count = int(locus.id.split("_")[0])
        except ValueError:
            stitch_count = 1
        coords = [int(x) for x in locus.coords()]

        locus_table.append([
            locus.id,
            locus.chr,
            min(coords),
            max(coords),
            stitch_count,
            ref_enrich_size,
        ])

    print("GETTING MAPPED DATA")
    print("USING A bam_file LIST:")
    print(bam_file_list)
    for bam_file in bam_file_list:
        bam_file_name = os.path.basename(bam_file)

        print("GETTING MAPPING DATA FOR {}".format(bam_file))
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        mapped_gff_file = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(ref_name, bam_file_name),
            "matrix.txt",
        )
        print("OPENING {}".format(mapped_gff_file))

        mapped_gff = utils.parse_table(mapped_gff_file, "\t")

        signal_dict = defaultdict(float)
        print("MAKING SIGNAL DICT FOR {}".format(bam_file))
        mapped_loci = []
        for line in mapped_gff[1:]:
            # column 1 is parsed as "<chrom>(...):<start>-<end>"
            chrom = line[1].split("(")[0]
            coord_range = line[1].split(":")[-1].split("-")
            start = int(coord_range[0])
            end = int(coord_range[1])
            mapped_loci.append(utils.Locus(chrom, start, end, ".", line[0]))
            try:
                # matrix value times region length gives the region's signal
                signal_dict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print("WARNING NO SIGNAL FOR LINE:")
                print(line)
                continue

        mapped_collection = utils.LocusCollection(mapped_loci, 500)
        locus_table[0].append(bam_file_name)

        for row in locus_table[1:]:
            signal = 0.0
            line_locus = utils.Locus(row[1], row[2], row[3], ".")
            overlapping_regions = mapped_collection.get_overlap(line_locus,
                                                                sense="both")
            for region in overlapping_regions:
                signal += signal_dict[region.id]
            row.append(signal)

    utils.unparse_table(locus_table, output, "\t")
def main():
    """Main run method for enhancer promoter contribution tool.

    Parses the command line, then runs three stages:

    I.   Argument parsing and set-up of the output folders, input regions,
         optional active gene list and annotation data.
    II.  Splitting the input regions around TSSs, mapping the bams (and
         optional control bams) to the split regions, and writing the signal,
         average, peak and gene tables into the analysis output folder.
    III. Calling the R waterfall plotting script on the gene table.

    Returns:
        None. All results are written to the output folder.
    """
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a space separated list of .bam files for the main factor",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or .bed file of regions to analyze",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=(
            "specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently "
            "supported"),
        required=True,
    )
    parser.add_argument(
        "-p",
        "--chrom-path",
        dest="chrom_path",
        type=str,
        help=("Provide path to a folder with a seperate fasta file for each "
              "chromosome"),
        required=True,
    )

    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=("specify a table where first column represents a list of active "
              "refseq genes"),
        required=False,
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs="*",
        help=("Enter a space separated list of .bam files for background. If "
              "flagged, will perform background subtraction"),
        required=False,
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        type=int,
        help="Define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000,
    )
    parser.add_argument(
        "-d",
        "--distal",
        dest="distal",
        type=int,
        help="Enter a window to assign distal enhancer signal. Default is 50kb",
        required=False,
        default=50000,
    )
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs="*",
        help="enter a space separated list of other bams to map to",
        required=False,
    )
    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=("enter a root name for the analysis, otherwise will try to find the "
              "name from the input file"),
        required=False,
    )
    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=("Run the analysis on the top N genes by total signal. Default is 5000"),
        required=False,
        default=5000,
    )
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=("Include a .bed of tad regions to restrict enhancer/gene association"),
        required=False,
        default=None,
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        default=None,
        help=(
            "Mask a set of regions from analysis. Provide a .bed or .gff of "
            "masking regions"),
    )

    args = parser.parse_args()

    print(args)

    # =====================================================================================
    # ===============================I. PARSING ARGUMENTS==================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===========I. DATA SUMMARY============\n#="
        "=====================================\n")

    # top analysis subset
    top = args.top

    # input genome
    genome = args.genome.upper()
    print("PERFORMING ANALYSIS ON {} GENOME BUILD".format(genome))

    # set of bams
    bam_file_list = args.bam

    # bring in the input path
    input_path = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysis_name = args.name
    else:
        analysis_name = os.path.basename(input_path).split(".")[0]

    print("USING {} AS ANALYSIS NAME".format(analysis_name))

    # setting up the output folder
    parent_folder = utils.format_folder(args.output, True)
    output_folder = utils.format_folder(
        os.path.join(parent_folder, analysis_name), True)

    print("WRITING OUTPUT TO {}".format(output_folder))

    if input_path.split(".")[-1] == "bed":
        # type is bed
        print("input in bed format, converting to gff")
        input_gff = utils.bed_to_gff(input_path)
    else:
        input_gff = utils.parse_table(input_path, "\t")

    # the tss window for proximal signal assignment
    tss_window = int(args.tss)

    # the distal window for assigning nearby enhancer signal
    distal_window = int(args.distal)

    # activity path
    if args.activity:
        activity_path = args.activity
        activity_table = utils.parse_table(activity_path, "\t")
        ref_col = 0
        # try to find the column for refseq id; scan the second row so that
        # a header on the first row cannot hide the column
        for col, value in enumerate(activity_table[1]):
            if str(value).count("NM_") or str(value).count("NR_"):
                ref_col = col

        # now check for header, looking at the RefSeq column itself rather
        # than whichever column the scan above happened to end on
        first_ref_value = str(activity_table[0][ref_col])
        if not first_ref_value.count("NM_") and not first_ref_value.count(
                "NR_"):
            print("REMOVING HEADER FROM GENE TABLE:")
            print(activity_table[0])
            activity_table.pop(0)

        # this needs to be REFSEQ NM ID
        gene_list = [line[ref_col] for line in activity_table]

        print("IDENTIFIED {} ACTIVE GENES".format(len(gene_list)))
    else:
        gene_list = []

    # check if tads are being invoked
    if args.tads:
        print("LOADING TAD LOCATIONS FROM {}".format(args.tads))
        tads_path = args.tads
    else:
        tads_path = ""

    print("LOADING ANNOTATION DATA FOR GENOME {}".format(genome))

    genome_dir = args.chrom_path

    # making a chrom_dict that is a list of all chroms with sequence
    chrom_list = utils.uniquify(
        [name.split(".")[0] for name in os.listdir(genome_dir) if name])

    # important here to define the window
    start_dict, tss_collection, mouse_convert_dict = load_annot_file(
        genome,
        tss_window,
        gene_list,
    )

    print("FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES")

    print(chrom_list)
    filtered_gff = [line for line in input_gff if line[0] in chrom_list]

    print("{} of INITIAL {} REGIONS ARE IN GOOD CHROMOSOMES".format(
        str(len(filtered_gff)),
        str(len(input_gff)),
    ))

    # =====================================================================================
    # ================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
    # =====================================================================================

    print(
        "\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#="
        "=====================================\n")

    # now we need to split the input region
    print("SPLITTING THE INPUT GFF USING A WINDOW OF {}".format(tss_window))
    split_gff = split_regions(filtered_gff,
                              tss_collection,
                              mask_file=args.mask)
    print(len(filtered_gff))
    print(len(split_gff))

    split_gff_path = os.path.join(output_folder,
                                  "{}_SPLIT.gff".format(analysis_name))
    utils.unparse_table(split_gff, split_gff_path, "\t")
    print("WRITING TSS SPLIT GFF OUT TO {}".format(split_gff_path))

    # now you have to map the bams to the gff
    print("MAPPING TO THE SPLIT GFF")
    mapped_folder = utils.format_folder(
        os.path.join(output_folder, "bam_mapping"), True)

    signal_table = map_bams(bam_file_list, split_gff_path, analysis_name,
                            mapped_folder)
    signal_table_path = os.path.join(
        output_folder, "{}_signal_table.txt".format(analysis_name))
    utils.unparse_table(signal_table, signal_table_path, "\t")

    if args.control:
        control_bam_file_list = args.control
        control_signal_table = map_bams(
            control_bam_file_list,
            split_gff_path,
            analysis_name,
            mapped_folder,
        )
        control_signal_table_path = os.path.join(
            output_folder,
            "{}_control_signal_table.txt".format(analysis_name),
        )
        utils.unparse_table(control_signal_table, control_signal_table_path,
                            "\t")

    # now create the background subtracted summarized average table
    print("CREATING AN AVERAGE SIGNAL TABLE")
    average_table = make_average_table(
        output_folder,
        analysis_name,
        # a plain flag: background is used only when control bams were given
        use_background=bool(args.control),
    )
    average_table_path = os.path.join(
        output_folder, "{}_average_table.txt".format(analysis_name))
    utils.unparse_table(average_table, average_table_path, "\t")

    # now load up all of the cpg and other parameters to make the actual peak table

    # first check if this has already been done
    peak_table_path = os.path.join(output_folder,
                                   "{}_PEAK_TABLE.txt".format(analysis_name))
    if utils.check_output(peak_table_path, 0.1, 0.1):
        print("PEAK TABLE OUTPUT ALREADY EXISTS")
        peak_table = utils.parse_table(peak_table_path, "\t")
    else:
        # NOTE(review): param_dict is not assigned anywhere in this function;
        # presumably it is a module-level global - confirm.
        peak_table = make_peak_table(
            param_dict,
            split_gff_path,
            average_table_path,
            start_dict,
            gene_list,
            genome_dir,
            tss_window,
            distal_window,
            tads_path,
        )
        utils.unparse_table(peak_table, peak_table_path, "\t")

    gene_table = make_gene_table(peak_table, analysis_name)

    gene_table_path = os.path.join(output_folder,
                                   "{}_GENE_TABLE.txt".format(analysis_name))
    utils.unparse_table(gene_table, gene_table_path, "\t")

    # if mouse, need to convert genes over
    if genome.count("MM") == 1:
        print("CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA")
        converted_gene_table_path = os.path.join(
            output_folder,
            "{}_GENE_TABLE_CONVERTED.txt".format(analysis_name),
        )

        # keep the header row, then only the genes that have a converted name
        converted_gene_table = [gene_table[0]]
        for line in gene_table[1:]:
            converted_name = mouse_convert_dict[line[0]]
            if converted_name:
                converted_gene_table.append([converted_name] + line[1:])

        utils.unparse_table(converted_gene_table, converted_gene_table_path,
                            "\t")

        gene_table_path = converted_gene_table_path
        gene_table = converted_gene_table

    # =====================================================================================
    # ===================================III. PLOTTING ====================================
    # =====================================================================================

    print(
        "\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#=="
        "====================================\n")

    # if there are fewer genes in the gene table than the top genes, only run on all
    # (the first row of the gene table is its header, so it is not a gene)
    gene_count = len(gene_table) - 1
    if gene_count < int(top):
        print(
            "WARNING: ONLY {} GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO "
            "RUN ANALYSIS ON TOP {}".format(str(gene_count), str(top)))
        top = 0

    # now call the R code
    print("CALLING R PLOTTING SCRIPTS")
    call_r_waterfall(gene_table_path, output_folder, analysis_name, top)
def make_peak_table(
        param_dict,
        split_gff_path,
        average_table_path,
        start_dict,
        gene_list,
        genome_directory,
        tss_window,
        distal_window,
        tads_path="",
):
    """Makes the final peak table with ebox info.

    One row is produced per region of the split gff, in file order. Row ``i``
    of the gff is paired with row ``i + 1`` of the average signal table (the
    signal table carries a header row).

    Args:
        param_dict: Mapping that provides ``"cpg_path"``, the path of a tab
            separated table of CpG islands.
        split_gff_path: Path of the TSS-split gff; column 7 of each row is
            copied into the ``TSS`` column.
        average_table_path: Path of the average signal table.
        start_dict: Mapping from gene id to a record with a ``"name"`` entry;
            also handed to ``utils.make_tss_locus``.
        gene_list: Gene ids to build TSS loci for. When empty, every key of
            ``start_dict`` is used.
        genome_directory: Folder handed to ``utils.fetch_seq`` to read the
            region sequence.
        tss_window: Flank used on both sides for the proximal TSS loci.
        distal_window: Flank used on both sides for the distal TSS loci.
        tads_path: Optional path of TAD regions. When given, gene assignment
            for a region is restricted to genes whose proximal TSS locus
            shares a TAD with the region; regions with no such genes fall
            back to unrestricted assignment.

    Returns:
        A list of rows, header first, with the columns named in the header.

    Raises:
        SystemExit: With status 1 when no sequence is returned for a region.
    """
    peak_table = [[
        "REGION_ID",
        "CHROM",
        "START",
        "STOP",
        "LENGTH",
        "TSS",
        "CPG",
        "CPG_FRACTION",
        "GC_FREQ",
        "SIGNAL",
        "CANON_EBOX_COUNT",
        "NON_CANON_EBOX_COUNT",
        "TOTAL_EBOX_COUNT",
        "OVERLAPPING_GENES",
        "PROXIMAL_GENES",
    ]]

    print("LOADING PEAK REGIONS")
    peak_gff = utils.parse_table(split_gff_path, "\t")

    print("LOADING BINDING DATA")
    signal_table = utils.parse_table(average_table_path, "\t")

    print("LOADING CPGS ISLANDS")
    cpg_bed = utils.parse_table(param_dict["cpg_path"], "\t")
    cpg_loci = [
        utils.Locus(line[0], line[1], line[2], ".", line[-1])
        for line in cpg_bed
    ]
    cpg_collection = utils.LocusCollection(cpg_loci, 50)

    print("MAKING TSS COLLECTIONS")
    if not gene_list:
        gene_list = [*start_dict]

    tss_prox_loci = []
    tss_distal_loci = []
    for ref_id in gene_list:
        tss_prox_loci.append(
            utils.make_tss_locus(ref_id, start_dict, tss_window, tss_window))
        tss_distal_loci.append(
            utils.make_tss_locus(
                ref_id,
                start_dict,
                distal_window,
                distal_window,
            ))

    # make a 1kb flanking and 50kb flanking collection
    tss_prox_collection = utils.LocusCollection(tss_prox_loci, 50)
    tss_distal_collection = utils.LocusCollection(tss_distal_loci, 50)

    if tads_path:
        print("LOADING TADS FROM {}".format(tads_path))
        tad_collection = utils.import_bound_region(tads_path, "tad")
        use_tads = True

        # building a tad dict keyed by tad ID w/ genes in that tad provided
        tad_dict = defaultdict(list)
        for tss_locus in tss_prox_loci:
            overlapping_tads = tad_collection.get_overlap(tss_locus, "both")
            for tad_locus in overlapping_tads:
                tad_dict[tad_locus.id].append(tss_locus.id)
    else:
        use_tads = False

    print("CLASSIFYING PEAKS")
    # number of regions that share a TAD with at least one gene
    tad_assigned_count = 0
    for i, gff_line in enumerate(peak_gff):
        if not i % 1000:
            print(i)

        # getting the particulars of the region
        peak_id = gff_line[1]
        chrom = gff_line[0]
        start = int(gff_line[3])
        stop = int(gff_line[4])
        line_locus = utils.Locus(chrom, start, stop, ".", peak_id)

        # getting the mapped signal (skip the signal table's header row)
        signal_line = signal_table[(i + 1)]
        signal_vector = [float(x) for x in signal_line[2:]]

        # setting up the new line
        new_line = [peak_id, chrom, start, stop, line_locus.len()]

        # get the tss status from the gff itself
        # (we are able to do this nicely from the split gff code earlier)
        new_line.append(gff_line[7])

        # check cpg status; the same overlap query also feeds the fraction
        overlapping_cpg_loci = cpg_collection.get_overlap(line_locus, "both")
        new_line.append(1 if overlapping_cpg_loci else 0)

        # now do fractional cpg overlap
        overlapping_bases = 0
        for locus in overlapping_cpg_loci:
            cpg_start = max(locus.start, line_locus.start)
            cpg_end = min(locus.end, line_locus.end)
            overlapping_bases += cpg_end - cpg_start
        overlap_fraction = float(overlapping_bases) / line_locus.len()

        new_line.append(round(overlap_fraction, 2))

        # now get the seq
        line_seq = utils.fetch_seq(genome_directory, chrom, start, stop,
                                   True).upper()
        if not line_seq:
            print("UH OH")
            print(line_seq)
            print(gff_line)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            sys.exit(1)

        # occurrences of "GC" plus occurrences of "CG", per base of sequence
        gc_freq = float(line_seq.count("GC") +
                        line_seq.count("CG")) / len(line_seq)
        new_line.append(gc_freq)

        # this is where we add the ChIP-seq signal
        new_line += signal_vector

        # e-boxes are CA..TG; CACGTG is the canonical one
        ebox_match_list = re.findall("CA..TG", line_seq)
        total_count = len(ebox_match_list)
        canon_count = ebox_match_list.count("CACGTG")
        other_count = total_count - canon_count
        new_line += [canon_count, other_count, total_count]

        # now find the overlapping and proximal genes
        # here each overlapping gene the tss prox locus overlaps the peak
        tad_gene_set = set()
        if use_tads:
            for tad_locus in tad_collection.get_overlap(line_locus, "both"):
                tad_gene_set.update(tad_dict[tad_locus.id])
            if tad_gene_set:
                tad_assigned_count += 1

        prox_hits = tss_prox_collection.get_overlap(line_locus, "both")
        distal_hits = tss_distal_collection.get_overlap(line_locus, "both")
        if tad_gene_set:
            # restrict to genes that share a tad with the region
            prox_hits = [
                locus for locus in prox_hits if locus.id in tad_gene_set
            ]
            distal_hits = [
                locus for locus in distal_hits if locus.id in tad_gene_set
            ]

        overlapping_genes = utils.uniquify(
            [start_dict[locus.id]["name"] for locus in prox_hits])

        # here the tss 50kb locus overlaps the peak
        # overlap takes priority over proximal
        proximal_genes = [
            start_dict[locus.id]["name"] for locus in distal_hits
        ]
        proximal_genes = [
            gene for gene in proximal_genes if gene not in overlapping_genes
        ]
        proximal_genes = utils.uniquify(proximal_genes)

        overlapping_string = ",".join(overlapping_genes)
        proximal_string = ",".join(proximal_genes)

        new_line += [overlapping_string, proximal_string]

        peak_table.append(new_line)

    # the header row of peak_table is not a region
    print("Out of {} regions, {} were assigned to at least 1 tad".format(
        str(len(peak_table) - 1),
        str(tad_assigned_count),
    ))

    return peak_table
def main():
    """Main run call.

    Parses the command line and runs the ROSE2 workflow: collects the input
    region files as gffs, builds one master gff of uniquely named regions,
    filters and optionally masks the regions, stitches them, maps every bam
    to the stitched regions with ``bamliquidator_batch``, builds the region
    signal map, then calls the R super-enhancer script and the gene mapper on
    the super, stretch and super-stretch enhancer tables.

    Returns:
        None. All results are written below the output folder.

    Raises:
        SystemExit: With status 1 when the number of control bams is neither
            one nor equal to the number of rankby bams, when the genome build
            is not supported, when the mask file is neither .bed nor .gff, or
            when mapping a bam produces no output.
    """
    debug = False

    parser = argparse.ArgumentParser()
    # required flags
    parser.add_argument(
        "-i",
        "--i",
        dest="input",
        required=True,
        help=("Enter a comma separated list of .gff or .bed file of binding sites used to make "
              "enhancers"),
    )
    parser.add_argument(
        "-r",
        "--rankby",
        dest="rankby",
        required=True,
        help="Enter a comma separated list of bams to rank by",
    )
    parser.add_argument("-o",
                        "--out",
                        dest="out",
                        required=True,
                        help="Enter an output folder")
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        required=True,
        help="Enter the genome build (MM9,MM8,HG18,HG19)",
    )

    # optional flags
    parser.add_argument(
        "-n",
        "--name",
        dest="name",
        required=False,
        help="Provide a name for the analysis otherwise ROSE will guess",
    )
    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        required=False,
        help=("Enter a comma separated list of control bams. Can either provide a single control "
              "bam for all rankby bams, or provide a control bam for each individual bam"),
    )
    parser.add_argument(
        "-s",
        "--stitch",
        dest="stitch",
        default="",
        help=("Enter a max linking distance for stitching. Default will determine optimal stitching"
              " parameter"),
    )
    parser.add_argument(
        "-t",
        "--tss",
        dest="tss",
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion",
    )
    parser.add_argument(
        "--mask",
        dest="mask",
        required=False,
        help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions",
    )

    # RETRIEVING FLAGS
    args = parser.parse_args()

    # making the out folder if it doesn't exist
    out_folder = utils.format_folder(args.out, True)

    # figuring out folder schema
    gff_folder = utils.format_folder(os.path.join(out_folder, "gff"), True)
    mapped_folder = utils.format_folder(os.path.join(out_folder, "mappedGFF"),
                                        True)

    # GETTING INPUT FILE(s)
    input_list = [
        input_file for input_file in args.input.split(",")
        if len(input_file) > 1
    ]

    # converting all input files into GFFs and moving into the GFF folder
    input_gf_list = []
    for input_file in input_list:
        # each entry of the comma separated list is handled on its own
        extension = input_file.split(".")[-1]
        if extension == "bed":
            # CONVERTING A BED TO GFF
            input_gff_name = os.path.basename(input_file)[0:-4]
            input_gff_file = os.path.join(gff_folder,
                                          "{}.gff".format(input_gff_name))
            utils.bed_to_gff(input_file, input_gff_file)
        else:
            if extension != "gff":
                print(
                    "WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT"
                )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            input_gff_file = input_file
            copyfile(
                input_gff_file,
                os.path.join(gff_folder, os.path.basename(input_gff_file)),
            )

        input_gf_list.append(input_gff_file)

    # GETTING THE LIST OF bam_fileS TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control #or none!
    # bamlist should be all rankby bams followed by control bams
    rankby_bam_list = [bam for bam in args.rankby.split(",") if len(bam) > 0]
    if args.control:
        control_bam_list = [
            bam for bam in args.control.split(",") if len(bam) > 0
        ]

        if len(control_bam_list) == len(rankby_bam_list):
            # case where an equal number of backgrounds are given
            bam_file_list = rankby_bam_list + control_bam_list
        elif len(control_bam_list) == 1:
            # case where a universal background is applied
            bam_file_list = rankby_bam_list + control_bam_list * len(
                rankby_bam_list)
        else:
            print(
                "ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM"
                " FOR EACH SAMPLE")
            sys.exit(1)
    else:
        bam_file_list = rankby_bam_list

    # Stitch parameter
    if args.stitch == "":
        stitch_window = ""
    else:
        stitch_window = int(args.stitch)

    # tss args
    tss_window = int(args.tss)
    remove_tss = tss_window != 0

    # GETTING THE GENOME
    genome = args.genome.upper()
    print("USING {} AS THE GENOME".format(genome))

    # GETTING THE CORRECT ANNOT FILE
    try:
        annot_file = rose2_utils.genome_dict[genome]
    except KeyError:
        print("ERROR: UNSUPPORTED GENOMES TYPE {}".format(genome))
        sys.exit(1)

    # FINDING THE ANALYSIS NAME
    if args.name:
        input_name = args.name
    else:
        input_name = os.path.basename(input_gf_list[0]).split(".")[0]
    print("USING {} AS THE ANALYSIS NAME".format(input_name))

    print("FORMATTING INPUT REGIONS")
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(input_gf_list) == 1:
        input_gff = utils.parse_table(input_gf_list[0], "\t")
    else:
        input_loci = []
        for gff_file in input_gf_list:
            print("\tprocessing {}".format(gff_file))
            gff = utils.parse_table(gff_file, "\t")
            gff_collection = utils.gff_to_locus_collection(gff, 50)
            input_loci += gff_collection.get_loci()

        input_collection = utils.LocusCollection(input_loci, 50)
        # stitches to produce unique regions
        input_collection = input_collection.stitch_collection()

        input_gff = utils.locus_collection_to_gff(input_collection)

    formatted_gff = []
    # now number things appropriately
    for i, line in enumerate(input_gff):
        # use the coordinates to make a new id input_name_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        line_id = "{}_{}".format(input_name, str(i + 1))  # 1 indexing

        new_line = [
            chrom,
            line_id,
            line_id,
            min(coords),
            max(coords),
            "",
            sense,
            "",
            line_id,
        ]
        formatted_gff.append(new_line)

    # name of the master input gff file
    master_gff_file = os.path.join(
        gff_folder, "{}_{}_ALL_-0_+0.gff".format(genome, input_name))
    utils.unparse_table(formatted_gff, master_gff_file, "\t")

    print("USING {} AS THE INPUT GFF".format(master_gff_file))

    # GET CHROMS FOUND IN THE BAMS
    print("GETTING CHROMS IN bam_fileS")
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print("LOADING AND FILTERING THE GFF")
    input_gff = rose2_utils.filter_gff(master_gff_file, bam_chrom_list)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print("LOADING IN GFF REGIONS")
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print("CHECKING REFERENCE COLLECTION:")
    rose2_utils.check_ref_collection(reference_collection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if args.mask:
        mask_file = args.mask
        mask_extension = mask_file.split(".")[-1].upper()
        # if it's a bed file
        if mask_extension == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_extension == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)
        mask_collection = utils.gff_to_locus_collection(mask_gff)

        # now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci
            if len(mask_collection.get_overlap(locus, "both")) == 0
        ]
        print("FILTERED OUT {} LOCI THAT WERE MASKED IN {}".format(
            len(reference_loci) - len(filtered_loci), mask_file))
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # NOW STITCH REGIONS
    print("STITCHING REGIONS TOGETHER")
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # NOW MAKE A STITCHED COLLECTION GFF
    print("MAKING GFF FROM STITCHED COLLECTION")
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    print(stitch_window)
    print(type(stitch_window))
    # the three output names only differ by extension and the TSS suffix
    stitched_gff_name = "{}_{}KB_STITCHED".format(input_name,
                                                  str(stitch_window // 1000))
    if remove_tss:
        stitched_gff_name += "_TSS_DISTAL"
    stitched_gff_file = os.path.join(gff_folder,
                                     "{}.gff".format(stitched_gff_name))
    debug_out_file = os.path.join(gff_folder,
                                  "{}.debug".format(stitched_gff_name))

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print("WRITING DEBUG OUTPUT TO DISK AS {}".format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, "\t")

    # WRITE THE GFF TO DISK
    print("WRITING STITCHED GFF TO DISK AS {}".format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, "\t")

    # SETTING UP THE OVERALL OUTPUT FILE
    output_file1 = os.path.join(
        out_folder, "{}_ENHANCER_REGION_MAP.txt".format(stitched_gff_name))
    print("OUTPUT WILL BE WRITTEN TO {}".format(output_file1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # prevent redundant mapping
    bam_file_list_unique = utils.uniquify(list(bam_file_list))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:
        bam_file_name = os.path.basename(bam_file)

        # MAPPING TO THE STITCHED GFF
        mapped_out1_folder = os.path.join(
            mapped_folder,
            "{}_{}_MAPPED".format(stitched_gff_name, bam_file_name))
        mapped_out1_file = os.path.join(mapped_out1_folder, "matrix.txt")
        if utils.check_output(mapped_out1_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                stitched_gff_file, mapped_out1_file))
        else:
            cmd1 = "bamliquidator_batch --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                stitched_gff_file,
                mapped_out1_folder,
                bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out1_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}".format(
                    stitched_gff_file, bam_file_name))
                sys.exit(1)

    print("BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS")
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file1,
        ref_name=stitched_gff_name,
    )

    print("FINDING AVERAGE SIGNAL AMONGST BAMS")
    meta_output_file = collapse_region_map(output_file1,
                                           input_name + "_MERGED_SIGNAL",
                                           control_bams=args.control)

    # now try the merging

    print("CALLING AND PLOTTING SUPER-ENHANCERS")

    control_name = "NONE"
    cmd = "Rscript {} {} {} {} {}".format(
        os.path.join(ROOT_DIR, "scripts", "ROSE2_callSuper.R"),
        out_folder + "/",  # TODO: fix R script so it does not require '/'
        meta_output_file,
        input_name,
        control_name,
    )
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print("CALLING GENE MAPPING")

    # for now don't use ranking bam to call top genes; the same command is
    # run on each of the three enhancer tables
    enhancer_table_files = [
        "{}_SuperEnhancers.table.txt".format(input_name),
        "{}_StretchEnhancers.table.txt".format(input_name),
        "{}_SuperStretchEnhancers.table.txt".format(input_name),
    ]
    for table_file in enhancer_table_files:
        cmd = "ROSE2_geneMapper -g {} -i {} -f".format(
            genome, os.path.join(out_folder, table_file))
        print(cmd)
        os.system(cmd)
def split_regions(input_gff, tss_collection, mask_file=None):
    """Split GFF regions at the boundaries of the TSS loci they overlap.

    Every row of ``input_gff`` is converted to a locus (chromosome from column
    0, coordinates from columns 3 and 4). Rows overlapping a mask region are
    dropped. Rows that overlap nothing in ``tss_collection`` are kept whole
    with column 7 set to 0. Rows that do overlap are cut at every coordinate
    of the overlapping TSS loci; each resulting piece becomes a new row whose
    column 7 is 1 when the piece overlaps a TSS locus and 0 otherwise.

    Args:
        input_gff: iterable of GFF rows (indexable sequences with at least
            eight columns). Rows without TSS overlap are modified in place
            (``row[7] = 0``) and reused in the output.
        tss_collection: object exposing ``get_overlap(locus)``.
        mask_file: optional path to a ``.bed`` or ``.gff`` file of regions to
            exclude.

    Returns:
        A list of GFF rows.

    Raises:
        ValueError: if ``mask_file`` is given but is neither a ``.bed`` nor a
            ``.gff`` file.
    """
    # create mask regions collection
    mask_collection = None
    if mask_file:
        print("USING MASK FILE {}".format(mask_file))
        mask_extension = mask_file.split(".")[-1].upper()
        if mask_extension == "BED":
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_extension == "GFF":
            mask_gff = utils.parse_table(mask_file, "\t")
        else:
            # Fail here with a clear message instead of continuing with an
            # undefined mask table.
            raise ValueError(
                "MASK MUST BE A .gff or .bed FILE: {}".format(mask_file)
            )

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print("LOADING {} MASK REGIONS".format(len(mask_collection)))

    split_gff = []
    for line in input_gff:
        chrom = line[0]
        region_id = line[1]
        line_locus = utils.Locus(chrom, line[3], line[4], ".")

        # mask regions
        if mask_collection is not None and mask_collection.get_overlap(
            line_locus, "both"
        ):
            continue

        overlapping_loci = tss_collection.get_overlap(line_locus)
        if not overlapping_loci:
            # no TSS overlap: keep the region whole and flag it as non-TSS
            line[7] = 0
            split_gff.append(line)
            continue

        # case where a tss overlap
        # identify the parts of the line locus that are contained
        local_tss_collection = utils.LocusCollection(overlapping_loci, 50)

        # Candidate cut points: the region's own coordinates plus those of
        # every overlapping TSS locus.
        overlapping_coords = list(line_locus.coords())
        for tss_locus in overlapping_loci:
            overlapping_coords += tss_locus.coords()
        overlapping_coords = utils.uniquify(overlapping_coords)
        overlapping_coords.sort()

        # you need to hack and slash add 1 to the last coordinate of the
        # overlapping_coords (each piece's stop is "next coordinate - 1")
        overlapping_coords[-1] += 1

        region_ticker = 1
        for lower, upper in zip(overlapping_coords, overlapping_coords[1:]):
            start = int(lower)
            stop = int(upper) - 1
            if (stop - start) < 50:
                # this eliminates really tiny regions
                continue

            # The locus used for the overlap tests starts at start + 1, while
            # the emitted row keeps the unshifted start.
            split_locus = utils.Locus(chrom, start + 1, stop, ".")
            if not line_locus.overlaps(split_locus):
                # piece lies outside the original region
                continue

            new_id = "{}_{}".format(region_id, region_ticker)
            tss_status = 1 if local_tss_collection.get_overlap(split_locus) else 0
            split_gff.append(
                [chrom, new_id, new_id, start, stop, "", ".", tss_status, new_id]
            )
            region_ticker += 1

    return split_gff
def main():
    """Command-line entry point: build plot tables for the input and plot them.

    Parses the command line, normalises the bam/bed/colour/name/scale lists
    (each accepts either several space-separated values or one delimited
    string), loads the regions to plot from a ``.bed`` file, a GFF-style table
    or a single ``chrom:sense:start-end`` string, calls
    ``make_bam_plot_tables`` and finally runs the R command returned by
    ``call_r_plot`` through a bash script written into a temporary folder.
    The temporary folder is removed afterwards unless ``--save-temp`` is set
    or the expected PDF was not detected.

    Every validation failure prints a message and exits with status 1.
    """
    parser = argparse.ArgumentParser()

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs="*",
        help="Enter a comma/space separated list of .bam files to be processed.",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or genomic region e.g. chr1:+:1-1000.",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported",
        required=True,
    )

    # output flag
    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        help="Enter the output folder.",
        required=True,
    )

    # additional options
    parser.add_argument(
        "--stretch-input",
        dest="stretch_input",
        default=None,
        type=int,
        help=(
            "Stretch the input regions to a minimum length in bp, e.g. 10000 (for"
            " 10kb)"
        ),
    )
    parser.add_argument(
        "-c",
        "--color",
        dest="color",
        default=None,
        nargs="*",
        help=(
            "Enter a colon or space separated list of colors e.g. "
            "255,0,0:255,125,0, default samples the rainbow"
        ),
    )
    parser.add_argument(
        "-s",
        "--sense",
        dest="sense",
        default="both",
        help="Map to '+','-' or 'both' strands. Default maps to both.",
    )
    parser.add_argument(
        "-e",
        "--extension",
        dest="extension",
        default=200,
        help="Extends reads by n bp. Default value is 200bp",
    )
    parser.add_argument(
        "-r",
        "--rpm",
        dest="rpm",
        action="store_true",
        default=False,
        help="Normalizes density to reads per million (rpm) Default is False",
    )
    parser.add_argument(
        "-y",
        "--yScale",
        dest="y_scale",
        default="relative",
        help=(
            "Choose either relative or uniform y axis scaling. options = "
            "'relative,uniform' Default is relative scaling"
        ),
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        default=None,
        nargs="*",
        help="Enter a comma or space separated list of names for your bams",
    )
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        default="MULTIPLE",
        help=(
            "Choose either all lines on a single plot or multiple plots. options "
            "= 'SINGLE,MULTIPLE,MERGE'"
        ),
    )
    parser.add_argument(
        "-t",
        "--title",
        dest="title",
        default="",
        help=(
            "Specify a title for the output plot(s), default will be the "
            "coordinate region"
        ),
    )
    parser.add_argument(
        "-q",
        "--skip-cache",
        dest="skip_cache",
        action="store_true",
        default=False,
        help="Toggles option to skip loading annotation cache file",
    )
    parser.add_argument(
        "--scale",
        dest="scale",
        default=None,
        nargs="*",
        help=(
            "Enter a comma or space separated list of scaling factors for your "
            "bams. Default is none"
        ),
    )
    parser.add_argument(
        "--bed",
        dest="bed",
        nargs="*",
        help="Add a comma-delimited or space-delimited list of bed files to plot",
    )
    parser.add_argument(
        "--multi-page",
        dest="multi",
        action="store_true",
        default=False,
        help="If flagged will create a new pdf for each region",
    )

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument(
        "--save-temp",
        dest="save",
        action="store_true",
        default=False,
        help="If flagged will save temporary files made by bamPlot",
    )

    args = parser.parse_args()

    print(args)

    # All four are declared required, but "-b" with zero values still yields an
    # empty list, so the check is kept.
    if not (args.bam and args.input and args.genome and args.output):
        parser.print_help()
        sys.exit(1)

    # Support a legacy mode where a ',' delimited multiple files
    bam_file_list = args.bam
    if len(bam_file_list) == 1:
        bam_file_list = bam_file_list[0].split(",")

    # Make sure these are actually files & readable (!)
    # Explicit check rather than assert, which is stripped under "python -O".
    for filename in bam_file_list:
        if not os.access(filename, os.R_OK):
            print("ERROR: UNABLE TO READ BAM FILE {}".format(filename))
            sys.exit(1)

    # bringing in any beds
    if args.bed:
        bed_file_list = args.bed
        if len(bed_file_list) == 1:
            bed_file_list = bed_file_list[0].split(",")
        print(bed_file_list)
        bed_collection = make_bed_collection(bed_file_list)
    else:
        bed_collection = utils.LocusCollection([], 50)

    # Load the input for graphing. One of:
    # - A .gff
    # - A .bed
    # - a specific input region (e.g. chr10:.:93150000-93180000)
    valid_sense_options = {"+", "-", "."}
    if os.access(args.input, os.R_OK):
        if args.input.endswith(".bed"):
            # Uniquely graph every input of this bed
            parsed_input_bed = utils.parse_table(args.input, "\t")
            gff_name = os.path.basename(args.input)  # Graph title
            gff = None
            try:
                if parsed_input_bed[0][5] in valid_sense_options:
                    # This .bed might have a sense parameter
                    gff = [
                        [e[0], "", args.input, e[1], e[2], "", e[5], "", ""]
                        for e in parsed_input_bed
                    ]
            except IndexError:
                # Some row has fewer than six columns: fall back to '.' below.
                pass

            if gff is None:
                print(
                    "Your bed doesn't have a valid sense parameter. Defaulting to both "
                    "strands, '.'"
                )
                # We only take chr/start/stop and ignore everything else.
                gff = [
                    [e[0], "", args.input, e[1], e[2], "", ".", "", ""]
                    for e in parsed_input_bed
                ]
        else:
            # Default to .gff, since that's the original behavior
            gff = utils.parse_table(args.input, "\t")
            gff_name = os.path.basename(args.input).split(".")[0]
    else:
        # means a coordinate line has been given e.g. chr1:+:1-100
        chrom_line = args.input.split(":")
        try:
            chrom = chrom_line[0]
            region_sense = chrom_line[1]
            # A missing third field or a range without exactly one '-' is
            # reported like any other malformed coordinate line.
            start, end = chrom_line[2].split("-")
        except (IndexError, ValueError):
            print("Invalid input line or inaccessible file. Try: chr1:.:1-5000")
            sys.exit(1)
        if region_sense not in valid_sense_options:
            print("ERROR: SENSE MUST BE ONE OF '+', '-' OR '.'")
            sys.exit(1)
        if chrom[0:3] != "chr":
            print("ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT")
            sys.exit(1)
        gff_line = [chrom, "", args.input, start, end, "", region_sense, "", ""]
        gff_name = "{}_{}_{}_{}".format(chrom, region_sense, start, end)
        gff = [gff_line]

    # Consider stretching the regions to a fixed minimum size
    if args.stretch_input:
        print(
            "Stretching inputs to a minimum of: {} bp".format(
                str(args.stretch_input)
            )
        )
        min_length = args.stretch_input
        stretch_gff = []
        for e in gff:
            difference = int(e[4]) - int(e[3])
            if difference < min_length:
                # pad both sides equally (rounded down)
                pad = int((min_length - difference) / 2)
                stretch_gff.append(
                    [
                        e[0],
                        e[1],
                        e[2],
                        int(e[3]) - pad,
                        int(e[4]) + pad,
                        e[5],
                        e[6],
                        e[7],
                        e[8],
                    ]
                )
            else:
                stretch_gff.append(e)

        gff = stretch_gff

    # Sanity test the gff object: all strands must be sane
    if not all(e[6] in valid_sense_options for e in gff):
        print("ERROR: INPUT REGIONS CONTAIN AN INVALID STRAND (USE '+', '-' OR '.')")
        sys.exit(1)

    # bring in the genome
    genome = args.genome.upper()
    supported_genomes = (
        "HG18",
        "HG19",
        "HG19_RIBO",
        "HG38",
        "MM9",
        "MM10",
        "RN4",
        "RN6",
    )
    if genome not in supported_genomes:
        print(
            "ERROR: UNSUPPORTED GENOME TYPE {}. USE HG19,HG18, RN4, MM9, or MM10".format(
                genome,
            )
        )
        parser.print_help()
        sys.exit(1)

    # bring in the rest of the options

    # output
    root_folder = args.output
    try:
        os.listdir(root_folder)
    except OSError:
        print("ERROR: UNABLE TO FIND OUTPUT DIRECTORY {}".format(root_folder))
        sys.exit(1)

    # Get analysis title
    title = args.title if args.title else gff_name

    # make a temp folder
    temp_folder = os.path.join(root_folder, title)
    print("CREATING TEMP FOLDER {}".format(temp_folder))
    utils.format_folder(temp_folder, create=True)

    # colors
    if args.color:
        color_list = args.color
        if len(color_list) == 1:
            color_list = color_list[0].split(":")
        color_list = [x.split(",") for x in color_list]
        if len(color_list) < len(bam_file_list):
            print(
                "WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED"
            )
            # recycling the color list
            color_list += color_list * (len(bam_file_list) // len(color_list))
            color_list = color_list[: len(bam_file_list)]
    else:
        # cycles through the colors of the rainbow
        color_list = taste_the_rainbow(len(bam_file_list))

    # sense
    sense = args.sense

    extension = int(args.extension)

    rpm = args.rpm

    scale = args.scale
    if scale:
        if len(scale) == 1:
            scale = scale[0].split(",")
        # One factor per bam is needed downstream; extra factors are tolerated
        # as before, too few would only surface later as an IndexError.
        if len(scale) < len(bam_file_list):
            print("ERROR: FEWER SCALING FACTORS THAN BAMS SPECIFIED")
            parser.print_help()
            sys.exit(1)

    y_scale = args.y_scale.upper()

    # names
    if args.names:
        names = args.names
        if len(names) == 1:
            names = names[0].split(",")

        if len(names) != len(bam_file_list):
            print("ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND")
            parser.print_help()
            sys.exit(1)
    else:
        names = [os.path.basename(x) for x in bam_file_list]

    # plot style
    plot_style = args.plot.upper()
    if plot_style not in ("SINGLE", "MULTIPLE", "MERGE"):
        print("ERROR: PLOT STYLE {} NOT AN OPTION".format(plot_style))
        parser.print_help()
        sys.exit(1)

    # now run!
    # NOTE(review): n_bins is not defined in this function; presumably it is a
    # module-level constant - confirm.
    summary_table_file_name = make_bam_plot_tables(
        gff,
        genome,
        bam_file_list,
        color_list,
        n_bins,
        sense,
        extension,
        rpm,
        temp_folder,
        names,
        title,
        bed_collection,
        scale,
    )
    print("{} is the summary table".format(summary_table_file_name))

    # running the R command to plot
    multi = args.multi
    out_file = os.path.join(root_folder, "{}_plots.pdf".format(title))
    r_cmd = call_r_plot(
        summary_table_file_name, out_file, y_scale, plot_style, multi
    )

    # open a bash file
    bash_file_name = os.path.join(temp_folder, "{}_Rcmd.sh".format(title))
    with open(bash_file_name, "w") as bash_file:
        bash_file.write("#!/usr/bin/bash\n")
        bash_file.write(r_cmd)
    print("Wrote R command to {}".format(bash_file_name))
    os.system("bash {}".format(bash_file_name))

    # delete temp files
    if not args.save:
        if utils.check_output(out_file, 1, 10):
            # This is super dangerous (!), so the sanity checks must not be
            # asserts: those disappear under "python -O".
            if " " in temp_folder or temp_folder == "/":
                print(
                    "ERROR: REFUSING TO REMOVE UNSAFE TEMP FOLDER {}".format(
                        temp_folder
                    )
                )
                sys.exit(1)
            shutil.rmtree(temp_folder)
            print("Removing temp folder: {}".format(temp_folder))
        else:
            print("ERROR: NO OUTPUT FILE {} DETECTED".format(out_file))
def _count_mapped_reads(bam_file):
    """Return the sum of the third column of ``samtools idxstats`` output.

    Raises:
        RuntimeError: if samtools cannot be started or exits with an error.
    """
    try:
        # Argument list instead of a shell string, so paths containing spaces
        # or shell metacharacters are passed through unchanged.
        idx_pipe = subprocess.Popen(
            ["samtools", "idxstats", bam_file],
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
    except OSError as err:
        raise RuntimeError(
            "unable to run 'samtools idxstats' on {}: {}".format(bam_file, err)
        ) from err

    idx_stdout, idx_stderr = idx_pipe.communicate()
    if idx_pipe.returncode != 0:
        raise RuntimeError(
            "'samtools idxstats {}' failed with exit status {}: {}".format(
                bam_file,
                idx_pipe.returncode,
                idx_stderr.decode("utf-8", "replace").strip(),
            )
        )

    raw_count = 0
    for stats_line in idx_stdout.decode("utf-8").split("\n"):
        if stats_line:  # skip the empty string after the final newline
            raw_count += int(stats_line.split("\t")[2])
    return raw_count


def make_bam_plot_tables(
    gff,
    genome,
    bam_file_list,
    color_list,
    n_bins,
    sense,
    extension,
    rpm,
    out_folder,
    names,
    title,
    bed_collection,
    scale=None,
):
    """Makes a plot table for each line of the gff mapped against all the bams.

    For every gff row the annotation and bed diagram tables are written via
    ``map_gff_line_to_annot`` / ``map_gff_line_to_bed``, one row per bam is
    produced by ``map_bam_to_gff_line`` and written to a ``*_plotTemp.txt``
    table, and a summary table listing all per-region file paths is written
    to ``<out_folder>/<title>_summary.txt``.

    Args:
        gff: parsed gff rows, or a path to a tab-separated gff file.
        genome: genome name passed to ``load_annot_file``.
        bam_file_list: bam file paths.
        color_list: one colour per bam (indexed in parallel with the bams).
        n_bins: number of bins; forwarded to the mapping helpers and used to
            build the ``bin_<n>`` header columns.
        sense: strand option forwarded to ``map_bam_to_gff_line``.
        extension: read extension forwarded to ``map_bam_to_gff_line``.
        rpm: when true the per-bam factor is
            ``mapped_reads / 1000000 / scale``; otherwise it is ``1 / scale``.
        out_folder: folder receiving all generated tables.
        names: one display name per bam (indexed in parallel with the bams).
        title: used for the summary table file name.
        bed_collection: forwarded to ``map_gff_line_to_bed``.
        scale: optional scaling factors, one per bam; extra entries are
            ignored. Defaults to 1 for every bam.

    Returns:
        The path of the summary table.

    Raises:
        ValueError: if fewer scaling factors than bams are supplied.
        RuntimeError: if ``rpm`` is set and ``samtools idxstats`` fails.
    """
    # load in the gff
    if isinstance(gff, str):
        gff = utils.parse_table(gff, "\t")

    # load in the annotation
    print("loading in annotation for {}".format(genome))
    gene_dict, tx_collection = load_annot_file(genome)

    # make an MMR dict so MMRs are only computed once
    print("Getting information about read depth in bams")
    if scale:
        print("Applying scaling factors")
        scale_list = [float(x) for x in scale]
        if len(scale_list) < len(bam_file_list):
            raise ValueError(
                "{} scaling factors supplied for {} bams".format(
                    len(scale_list), len(bam_file_list)
                )
            )
    else:
        scale_list = [1] * len(bam_file_list)

    # now iterate through the bam files
    mmr_dict = {}
    for bam_file, read_scale_factor in zip(bam_file_list, scale_list):
        if rpm:
            # millionMappedReads; the read count is only needed in this mode,
            # so samtools is not required otherwise
            raw_count = _count_mapped_reads(bam_file)
            mmr = round(raw_count / 1000000 / read_scale_factor, 4)
        else:
            mmr = round(1 / read_scale_factor, 4)
        mmr_dict[bam_file] = mmr

    # go line by line in the gff
    summary_table = [
        [
            "DIAGRAM_TABLE",
            "NAME_TABLE",
            "BED_DIAGRAM_TABLE",
            "BED_NAME_TABLE",
            "PLOT_TABLE",
            "CHROM",
            "ID",
            "SENSE",
            "START",
            "END",
        ]
    ]
    bin_columns = ["bin_" + str(n) for n in range(1, int(n_bins) + 1, 1)]

    for ticker, gff_line in enumerate(gff, 1):
        # unique prefix shared by every temp file written for this region
        gff_string = "line_{}_{}_{}_{}_{}_{}".format(
            ticker,
            gff_line[0],
            gff_line[1],
            gff_line[6],
            gff_line[3],
            gff_line[4],
        )
        print("writing the gene diagram table for region {}".format(gff_line[1]))
        # the annotation diagram is always drawn for both strands
        map_gff_line_to_annot(
            gff_line,
            out_folder,
            n_bins,
            gene_dict,
            tx_collection,
            sense="both",
            header=gff_string,
        )

        map_gff_line_to_bed(
            gff_line,
            out_folder,
            n_bins,
            bed_collection,
            header=gff_string,
        )

        out_table = [
            ["BAM", "GENE_ID", "NAME", "LOCUSLINE", "COLOR1", "COLOR2", "COLOR3"]
            + bin_columns
        ]

        for i, bam_file in enumerate(bam_file_list):
            name = names[i]
            color = color_list[i]
            print(
                "getting data for location {} in dataset {}".format(
                    gff_line[1], bam_file
                )
            )
            mmr = mmr_dict[bam_file]
            new_line = map_bam_to_gff_line(
                bam_file,
                mmr,
                name,
                gff_line,
                color,
                n_bins,
                sense,
                extension,
            )
            out_table.append(new_line)

        # get the gene name
        if gff_line[1] in gene_dict:
            gene_name = gene_dict[gff_line[1]].common_name()
        else:
            gene_name = gff_line[1]

        plot_table = os.path.join(out_folder, "{}_plotTemp.txt".format(gff_string))
        utils.unparse_table(out_table, plot_table, "\t")

        # Paths of the other per-region tables; they are only recorded here,
        # presumably written by the mapping helpers above - confirm.
        diagram_table = os.path.join(
            out_folder, "{}_diagramTemp.txt".format(gff_string)
        )
        name_table = os.path.join(out_folder, "{}_nameTemp.txt".format(gff_string))
        bed_name_table = os.path.join(
            out_folder, "{}_bedNameTemp.txt".format(gff_string)
        )
        bed_diagram_table = os.path.join(
            out_folder, "{}_bedDiagramTemp.txt".format(gff_string)
        )
        summary_table.append(
            [
                diagram_table,
                name_table,
                bed_diagram_table,
                bed_name_table,
                plot_table,
                gff_line[0],
                gene_name,
                gff_line[6],
                gff_line[3],
                gff_line[4],
            ]
        )

    summary_table_file_name = os.path.join(out_folder, "{}_summary.txt".format(title))
    utils.unparse_table(summary_table, summary_table_file_name, "\t")
    return summary_table_file_name
def tf_edge_delta_out(
    crc_folder,
    bam_list,
    analysis_name,
    edge_table_path_1,
    edge_table_path_2,
    group1_list,
    group2_list,
    output="",
):
    """Calculates changes in group out degree at each predicted motif occurrence (by subpeaks).

    The two edge tables are merged, converted to a gff and mapped against the
    bams (unless a signal table already exists). For every edge the mean
    signal of each group is computed; edges where at least one group mean is
    above that group's median and both means are positive are kept together
    with their log2(group1 / group2) ratio. The kept edges are then summarised
    per TF (the part of the edge id before the first ``_``).

    Args:
        crc_folder: working folder; all intermediate tables are written here.
        bam_list: bams passed to ``pipeline_utils.map_regions``.
        analysis_name: prefix for every generated file name.
        edge_table_path_1: first edge table to merge.
        edge_table_path_2: second edge table to merge.
        group1_list: signal-table column names forming group 1.
        group2_list: signal-table column names forming group 2.
        output: optional output path; defaults to
            ``<crc_folder>/<analysis_name>_EDGE_DELTA_OUT.txt``.

    Returns:
        The path of the per-TF out-degree table.
    """
    crc_folder = utils.format_folder(crc_folder, True)
    edge_path = merge_edge_tables(
        edge_table_path_1,
        edge_table_path_2,
        os.path.join(crc_folder, "{}_EDGE_TABLE.txt".format(analysis_name)),
    )

    # make a gff of the edge table
    edge_table = utils.parse_table(edge_path, "\t")
    edge_gff = []
    for line in edge_table[1:]:
        edge_id = "{}_{}".format(line[0], line[1])
        edge_gff.append(
            [line[2], edge_id, "", line[3], line[4], "", ".", "", edge_id]
        )

    edge_gff_path = os.path.join(crc_folder, "{}_EDGE_TABLE.gff".format(analysis_name))
    utils.unparse_table(edge_gff, edge_gff_path, "\t")

    # direct the output to the crc folder
    signal_path = os.path.join(
        crc_folder, "{}_EDGE_TABLE_signal.txt".format(analysis_name))

    all_group_list = group1_list + group2_list
    if not utils.check_output(signal_path, 0, 0):
        signal_table_list = pipeline_utils.map_regions(
            bam_list,
            [edge_gff_path],
            crc_folder,
            crc_folder,
            all_group_list,
            True,
            signal_path,
            extend_reads_to=100,
        )
        print(signal_table_list)
    else:
        print("Found previous signal table at {}".format(signal_path))

    # now bring in the signal table as a dictionary using the locus line as the id
    print("making log2 group1 vs group2 signal table at edges")
    signal_table = utils.parse_table(signal_path, "\t")

    # figure out columns for group1 and group2
    group1_columns = [signal_table[0].index(name) for name in group1_list]
    group2_columns = [signal_table[0].index(name) for name in group2_list]

    # mean signal of each group for every edge, computed once and reused for
    # both the medians and the filtering below
    group_means = [
        (
            numpy.mean([float(line[col]) for col in group1_columns]),
            numpy.mean([float(line[col]) for col in group2_columns]),
        )
        for line in signal_table[1:]
    ]

    group1_median = numpy.median([means[0] for means in group_means])
    group2_median = numpy.median([means[1] for means in group_means])

    print("group1 median signal")
    print(group1_median)
    print("group2 median signal")
    print(group2_median)

    # now that we have the median, we can take edges where at least 1 edge is above the median
    # and both are above zero and generate a new table w/ the fold change

    # Only the extension is touched: a ".txt" elsewhere in the path (e.g. in a
    # folder name) must stay as it is.
    signal_root, signal_extension = os.path.splitext(signal_path)
    signal_filtered_path = "{}_filtered{}".format(signal_root, signal_extension)
    if utils.check_output(signal_filtered_path, 0, 0):
        print("Found filtered signal table for edges at {}".format(
            signal_filtered_path))
        signal_table_filtered = utils.parse_table(signal_filtered_path, "\t")
    else:
        signal_table_filtered = [
            signal_table[0] +
            ["GROUP1_MEAN", "GROUP2_MEAN", "GROUP1_vs_GROUP2_LOG2"]
        ]
        for line, (group1_signal, group2_signal) in zip(
                signal_table[1:], group_means):
            above_median = (group1_signal > group1_median
                            or group2_signal > group2_median)
            if above_median and min(group1_signal, group2_signal) > 0:
                delta = numpy.log2(group1_signal / group2_signal)
                signal_table_filtered.append(
                    line + [group1_signal, group2_signal, delta])

        utils.unparse_table(signal_table_filtered, signal_filtered_path, "\t")

    # now get a list of all TFs in the system, collecting each TF's deltas in
    # one pass over the filtered table (edge order is preserved per TF)
    deltas_by_tf = {}
    for line in signal_table_filtered[1:]:
        tf_name = line[0].split("_")[0]
        deltas_by_tf.setdefault(tf_name, []).append(float(line[-1]))

    tf_list = sorted(deltas_by_tf)
    print(tf_list)

    out_degree_table = [[
        "TF_NAME",
        "EDGE_COUNT",
        "DELTA_MEAN",
        "DELTA_MEDIAN",
        "DELTA_STD",
        "DELTA_SEM",
    ]]

    for tf_name in tf_list:
        print(tf_name)
        edge_vector = deltas_by_tf[tf_name]

        out_degree_table.append([
            tf_name,
            len(edge_vector),
            round(numpy.mean(edge_vector), 4),
            round(numpy.median(edge_vector), 4),
            round(numpy.std(edge_vector), 4),
            round(stats.sem(edge_vector), 4),
        ])

    # set final output
    if not output:
        output_path = os.path.join(
            crc_folder, "{}_EDGE_DELTA_OUT.txt".format(analysis_name))
    else:
        output_path = output

    utils.unparse_table(out_degree_table, output_path, "\t")
    print(output_path)
    return output_path