def map_enhancer_to_gene_top(rank_by_bam_file, control_bam_file, genome, annot_file, enhancer_file,
                             transcribed_file=None, unique_genes=True, search_window=50000,
                             no_format_table=False):
    """Maps genes to enhancers.

    If unique_genes, reduces to gene name only. Otherwise, gives one row per refseq ID.
    """
    start_dict = utils.make_start_dict(annot_file)
    enhancer_name = enhancer_file.split('/')[-1].split('.')[0]
    enhancer_table = utils.parse_table(enhancer_file, '\t')

    if transcribed_file:
        transcribed_table = utils.parse_table(transcribed_file, '\t')
        transcribed_genes = [line[1] for line in transcribed_table]
    else:
        transcribed_genes = list(start_dict.keys())

    print('MAKING TRANSCRIPT COLLECTION')
    transcribed_collection = utils.make_transcript_collection(
        annot_file, 0, 0, 500, transcribed_genes)

    print('MAKING TSS COLLECTION')
    tss_loci = []
    for gene_id in transcribed_genes:
        tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 0, 0))

    # This turns the tss_loci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really matter
    tss_collection = utils.LocusCollection(tss_loci, 50)

    gene_dict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # Dictionaries to hold ranks and super_status of genes near enhancers
    rank_dict = defaultdict(list)
    super_dict = defaultdict(list)

    # List of all genes that appear in this analysis
    overall_gene_list = []

    # Find the header
    for line in enhancer_table:
        if line[0][0] != '#':
            header = line
            print('This is the header')
            print(header)
            break

    # Set up the output tables
    # First by enhancer
    if no_format_table:
        enhancer_to_gene_table = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']
        ]
    else:
        enhancer_to_gene_table = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]
        ]

    # Next make the gene to enhancer table
    gene_to_enhancer_table = [
        [
            'GENE_NAME',
            'REFSEQ_ID',
            'PROXIMAL_ENHANCERS',
            'ENHANCER_RANKS',
            'IS_SUPER',
            'ENHANCER_SIGNAL',
        ]
    ]

    for line in enhancer_table:
        # Skip comment lines and the header row
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancer_string = '{}:{}-{}'.format(line[1], line[2], line[3])
        enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # Overlapping genes are transcribed genes whose transcript is directly in the
        # stitched locus
        overlapping_loci = transcribed_collection.get_overlap(enhancer_locus, 'both')
        overlapping_genes = []
        for overlap_locus in overlapping_loci:
            overlapping_genes.append(overlap_locus.id)

        # Proximal genes are transcribed genes whose TSS is within search_window (50kb by
        # default) of the boundary of the stitched locus
        proximal_loci = tss_collection.get_overlap(
            utils.make_search_locus(enhancer_locus, search_window, search_window),
            'both',
        )
        proximal_genes = []
        for prox_locus in proximal_loci:
            proximal_genes.append(prox_locus.id)

        distal_loci = tss_collection.get_overlap(
            utils.make_search_locus(enhancer_locus, 1000000, 1000000),
            'both',
        )
        distal_genes = []
        for prox_locus in distal_loci:
            distal_genes.append(prox_locus.id)

        overlapping_genes = utils.uniquify(overlapping_genes)
        proximal_genes = utils.uniquify(proximal_genes)
        distal_genes = utils.uniquify(distal_genes)
        all_enhancer_genes = overlapping_genes + proximal_genes + distal_genes

        # These checks make sure each gene list is unique
        # Technically it is possible for a gene to be overlapping but not proximal since the gene
        # could be longer than the 50kb window, but we'll let that slide here
        for ref_id in overlapping_genes:
            if proximal_genes.count(ref_id) == 1:
                proximal_genes.remove(ref_id)

        for ref_id in proximal_genes:
            if distal_genes.count(ref_id) == 1:
                distal_genes.remove(ref_id)

        # Now find the closest gene
        if not all_enhancer_genes:
            closest_gene = ''
        else:
            # Get the enhancer center
            enhancer_center = (int(line[2]) + int(line[3])) / 2

            # Get absolute distance to enhancer center
            dist_list = [
                abs(enhancer_center - start_dict[gene_id]['start'][0])
                for gene_id in all_enhancer_genes
            ]

            # Get the ID and convert to name
            closest_gene = start_dict[
                all_enhancer_genes[dist_list.index(min(dist_list))]
            ]['name']

        # Now write the row for the enhancer table
        if no_format_table:
            new_enhancer_line = list(line)
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes]))
            )
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes]))
            )
            new_enhancer_line.append(closest_gene)
        else:
            new_enhancer_line = line[0:9]
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes]))
            )
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes]))
            )
            new_enhancer_line.append(closest_gene)
            new_enhancer_line += line[-2:]

        enhancer_to_gene_table.append(new_enhancer_line)

        # Now grab all overlapping and proximal genes for the gene ordered table
        overall_gene_list += overlapping_genes
        for ref_id in overlapping_genes:
            gene_dict['overlapping'][ref_id].append(enhancer_string)
            rank_dict[ref_id].append(int(line[-2]))
            super_dict[ref_id].append(int(line[-1]))

        overall_gene_list += proximal_genes
        for ref_id in proximal_genes:
            gene_dict['proximal'][ref_id].append(enhancer_string)
            rank_dict[ref_id].append(int(line[-2]))
            super_dict[ref_id].append(int(line[-1]))

    # End loop through enhancers
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overall_gene_list = utils.uniquify(overall_gene_list)

    # Get the chrom lists from the various bams here
    cmd = 'samtools idxstats {}'.format(rank_by_bam_file)
    idx_stats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    idx_stats = idx_stats.communicate()
    bam_chrom_list = [
        line.split('\t')[0] for line in idx_stats[0].decode('utf-8').split('\n')[0:-2]
    ]

    if control_bam_file:
        cmd = 'samtools idxstats {}'.format(control_bam_file)
        idx_stats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        idx_stats = idx_stats.communicate()
        bam_chrom_list_control = [
            line.split('\t')[0] for line in idx_stats[0].decode('utf-8').split('\n')[0:-2]
        ]
        bam_chrom_list = [
            chrom for chrom in bam_chrom_list if bam_chrom_list_control.count(chrom) != 0
        ]

    # Now make sure no genes have a bad chrom
    overall_gene_list = [
        gene for gene in overall_gene_list if bam_chrom_list.count(start_dict[gene]['chr']) != 0
    ]

    # Now make an enhancer collection of all transcripts
    enhancer_gene_collection = utils.make_transcript_collection(
        annot_file, 5000, 5000, 500, overall_gene_list,
    )

    enhancer_gene_gff = utils.locus_collection_to_gff(enhancer_gene_collection)

    # Dump the gff to file
    enhancer_folder = utils.get_parent_folder(enhancer_file)
    gff_root_name = "{}_TSS_ENHANCER_GENES_-5000_+5000".format(genome)
    enhancer_gene_gff_file = "{}{}_{}.gff".format(enhancer_folder, enhancer_name, gff_root_name)
    utils.unparse_table(enhancer_gene_gff, enhancer_gene_gff_file, '\t')

    # Now we need to run bam_to_gff
    # Try to use the bamliquidator_batch script on the cluster; otherwise, fail over to a local
    # copy (in PATH); otherwise fail
    bamliquidator_path = 'bamliquidator_batch'

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')

    # Map density at genes in the +/- 5kb TSS region
    # First on the rank_by bam
    bam_name = rank_by_bam_file.split('/')[-1]
    mapped_rank_by_folder = "{}{}_{}_{}/".format(
        enhancer_folder, enhancer_name, gff_root_name, bam_name,
    )
    mapped_rank_by_file = "{}{}_{}_{}/matrix.txt".format(
        enhancer_folder, enhancer_name, gff_root_name, bam_name,
    )
    cmd = '{} --sense . -e 200 --match_bamToGFF -r {} -o {} {}'.format(
        bamliquidator_path, enhancer_gene_gff_file, mapped_rank_by_folder, rank_by_bam_file,
    )
    print("Mapping rankby bam {}".format(rank_by_bam_file))
    print(cmd)
    os.system(cmd)

    # Check for completion
    if utils.check_output(mapped_rank_by_file, 0.2, 5):
        print(
            "SUCCESSFULLY MAPPED TO {} FROM BAM: {}"
            "".format(enhancer_gene_gff_file, rank_by_bam_file)
        )
    else:
        print(
            "ERROR: FAILED TO MAP {} FROM BAM: {}"
            "".format(enhancer_gene_gff_file, rank_by_bam_file)
        )
        sys.exit()

    # Next on the control bam if it exists
    if control_bam_file:
        control_name = control_bam_file.split('/')[-1]
        mapped_control_folder = "{}{}_{}_{}/".format(
            enhancer_folder, enhancer_name, gff_root_name, control_name,
        )
        mapped_control_file = "{}{}_{}_{}/matrix.txt".format(
            enhancer_folder, enhancer_name, gff_root_name, control_name,
        )
        cmd = '{} --sense . -e 200 --match_bamToGFF -r {} -o {} {}'.format(
            bamliquidator_path, enhancer_gene_gff_file, mapped_control_folder, control_bam_file,
        )
        print("Mapping control bam {}".format(control_bam_file))
        print(cmd)
        os.system(cmd)

        # Check for completion
        if utils.check_output(mapped_control_file, 0.2, 5):
            print(
                "SUCCESSFULLY MAPPED TO {} FROM BAM: {}"
                "".format(enhancer_gene_gff_file, control_bam_file)
            )
        else:
            print(
                "ERROR: FAILED TO MAP {} FROM BAM: {}"
                "".format(enhancer_gene_gff_file, control_bam_file)
            )
            sys.exit()

    # Now get the appropriate output files
    if control_bam_file:
        print(
            "CHECKING FOR MAPPED OUTPUT AT {} AND {}"
            "".format(mapped_rank_by_file, mapped_control_file)
        )
        if (utils.check_output(mapped_rank_by_file, 1, 1)
                and utils.check_output(mapped_control_file, 1, 1)):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signal_dict = make_signal_dict(mapped_rank_by_file, mapped_control_file)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT {}".format(mapped_rank_by_file))
        if utils.check_output(mapped_rank_by_file, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signal_dict = make_signal_dict(mapped_rank_by_file)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # Use enhancer rank to order
    rank_order = utils.order([min(rank_dict[x]) for x in overall_gene_list])

    used_names = []

    # Make a new dict to hold TSS signal by max per gene_name
    gene_name_sig_dict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rank_order:
        ref_id = overall_gene_list[i]
        gene_name = start_dict[ref_id]['name']
        if used_names.count(gene_name) and unique_genes:
            continue
        else:
            used_names.append(gene_name)

        prox_enhancers = gene_dict['overlapping'][ref_id] + gene_dict['proximal'][ref_id]

        super_status = max(super_dict[ref_id])
        enhancer_ranks = ','.join([str(x) for x in rank_dict[ref_id]])

        enhancer_signal = signal_dict[ref_id]
        gene_name_sig_dict[gene_name].append(enhancer_signal)

        new_line = [
            gene_name,
            ref_id,
            ','.join(prox_enhancers),
            enhancer_ranks,
            super_status,
            enhancer_signal,
        ]
        gene_to_enhancer_table.append(new_line)

    print('MAKING ENHANCER TO TOP GENE TABLE')

    if no_format_table:
        enhancer_to_top_gene_table = [
            enhancer_to_gene_table[0] + ['TOP_GENE', 'TSS_SIGNAL']
        ]
    else:
        enhancer_to_top_gene_table = [
            enhancer_to_gene_table[0][0:12]
            + ['TOP_GENE', 'TSS_SIGNAL']
            + enhancer_to_gene_table[0][-2:]
        ]

    for line in enhancer_to_gene_table[1:]:
        gene_list = []
        if no_format_table:
            gene_list += line[-3].split(',')
            gene_list += line[-2].split(',')
        else:
            gene_list += line[10].split(',')
            gene_list += line[11].split(',')

        gene_list = utils.uniquify([x for x in gene_list if x])
        if gene_list:
            try:
                sig_vector = [max(gene_name_sig_dict[x]) for x in gene_list]
                max_index = sig_vector.index(max(sig_vector))
                max_gene = gene_list[max_index]
                max_sig = sig_vector[max_index]
                if max_sig == 0.0:
                    max_gene = 'NONE'
                    max_sig = 'NONE'
            except ValueError:
                if len(gene_list) == 1:
                    max_gene = gene_list[0]
                    max_sig = 'NONE'
                else:
                    max_gene = 'NONE'
                    max_sig = 'NONE'
        else:
            max_gene = 'NONE'
            max_sig = 'NONE'

        if no_format_table:
            new_line = line + [max_gene, max_sig]
        else:
            new_line = line[0:12] + [max_gene, max_sig] + line[-2:]
        enhancer_to_top_gene_table.append(new_line)

    # Resort enhancer_to_gene_table
    if no_format_table:
        return enhancer_to_gene_table, enhancer_to_top_gene_table, gene_to_enhancer_table
    else:
        enhancer_order = utils.order(
            [int(line[-2]) for line in enhancer_to_gene_table[1:]]
        )
        sorted_table = [enhancer_to_gene_table[0]]
        sorted_top_gene_table = [enhancer_to_top_gene_table[0]]
        for i in enhancer_order:
            sorted_table.append(enhancer_to_gene_table[(i + 1)])
            sorted_top_gene_table.append(enhancer_to_top_gene_table[(i + 1)])

        return sorted_table, sorted_top_gene_table, gene_to_enhancer_table
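

# Illustrative usage sketch (not part of the original module): how map_enhancer_to_gene_top
# might be called and its three returned tables written out. The file paths below are
# hypothetical placeholders; the enhancer table is assumed to be a ROSE AllEnhancers table
# whose last two columns are the enhancer rank and super status.
def _example_map_enhancer_to_gene_top():
    enhancer_table, top_gene_table, gene_table = map_enhancer_to_gene_top(
        rank_by_bam_file='data/h3k27ac_treated.bam',    # hypothetical ranking bam
        control_bam_file='data/input_control.bam',      # hypothetical control bam
        genome='HG19',
        annot_file='annotation/hg19_refseq.ucsc',
        enhancer_file='rose_out/sample_AllEnhancers.table.txt',
        search_window=50000,
    )
    # Persist the three views of the mapping
    utils.unparse_table(enhancer_table, 'rose_out/sample_ENHANCER_TO_GENE.txt', '\t')
    utils.unparse_table(top_gene_table, 'rose_out/sample_ENHANCER_TO_TOP_GENE.txt', '\t')
    utils.unparse_table(gene_table, 'rose_out/sample_GENE_TO_ENHANCER.txt', '\t')
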
def map_enhancer_to_gene(annot_file, enhancer_file, transcribed_file=None, unique_genes=True,
                         search_window=50000, no_format_table=False):
    """Maps genes to enhancers.

    If unique_genes, reduces to gene name only. Otherwise, gives one row per refseq ID.
    """
    start_dict = utils.make_start_dict(annot_file)
    enhancer_table = utils.parse_table(enhancer_file, '\t')

    if transcribed_file:
        transcribed_table = utils.parse_table(transcribed_file, '\t')
        transcribed_genes = [line[1] for line in transcribed_table]
    else:
        transcribed_genes = list(start_dict.keys())

    print('MAKING TRANSCRIPT COLLECTION')
    transcribed_collection = utils.make_transcript_collection(
        annot_file, 0, 0, 500, transcribed_genes,
    )

    print('MAKING TSS COLLECTION')
    tss_loci = []
    for gene_id in transcribed_genes:
        tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 0, 0))

    # This turns the tss_loci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really matter
    tss_collection = utils.LocusCollection(tss_loci, 50)

    gene_dict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # Dictionaries to hold ranks and super_status of genes near enhancers
    rank_dict = defaultdict(list)
    super_dict = defaultdict(list)

    # List of all genes that appear in this analysis
    overall_gene_list = []

    # Find the header
    for line in enhancer_table:
        if line[0][0] != '#':
            header = line
            print('This is the header')
            print(header)
            break

    # Set up the output tables
    # First by enhancer
    if no_format_table:
        enhancer_to_gene_table = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']
        ]
    else:
        enhancer_to_gene_table = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]
        ]

    # Next by gene (no TSS signal column here; that is only produced by
    # map_enhancer_to_gene_top, which maps signal from the BAM files)
    gene_to_enhancer_table = [
        [
            'GENE_NAME',
            'REFSEQ_ID',
            'PROXIMAL_ENHANCERS',
            'ENHANCER_RANKS',
            'IS_SUPER',
        ]
    ]

    for line in enhancer_table:
        # Skip comment lines and the header row
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancer_string = '{}:{}-{}'.format(line[1], line[2], line[3])
        enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # Overlapping genes are transcribed genes whose transcript is directly in the
        # stitched locus
        overlapping_loci = transcribed_collection.get_overlap(enhancer_locus, 'both')
        overlapping_genes = []
        for overlap_locus in overlapping_loci:
            overlapping_genes.append(overlap_locus.id)

        # Proximal genes are transcribed genes whose TSS is within search_window (50kb by
        # default) of the boundary of the stitched locus
        proximal_loci = tss_collection.get_overlap(
            utils.make_search_locus(enhancer_locus, search_window, search_window),
            'both',
        )
        proximal_genes = []
        for prox_locus in proximal_loci:
            proximal_genes.append(prox_locus.id)

        distal_loci = tss_collection.get_overlap(
            utils.make_search_locus(enhancer_locus, 1000000, 1000000),
            'both',
        )
        distal_genes = []
        for prox_locus in distal_loci:
            distal_genes.append(prox_locus.id)

        overlapping_genes = utils.uniquify(overlapping_genes)
        proximal_genes = utils.uniquify(proximal_genes)
        distal_genes = utils.uniquify(distal_genes)
        all_enhancer_genes = overlapping_genes + proximal_genes + distal_genes

        # These checks make sure each gene list is unique
        # Technically it is possible for a gene to be overlapping but not proximal since the gene
        # could be longer than the 50kb window, but we'll let that slide here
        for ref_id in overlapping_genes:
            if proximal_genes.count(ref_id) == 1:
                proximal_genes.remove(ref_id)

        for ref_id in proximal_genes:
            if distal_genes.count(ref_id) == 1:
                distal_genes.remove(ref_id)

        # Now find the closest gene
        if not all_enhancer_genes:
            closest_gene = ''
        else:
            # Get the enhancer center
            enhancer_center = (int(line[2]) + int(line[3])) / 2

            # Get absolute distance to enhancer center
            dist_list = [
                abs(enhancer_center - start_dict[gene_id]['start'][0])
                for gene_id in all_enhancer_genes
            ]

            # Get the ID and convert to name
            closest_gene = start_dict[
                all_enhancer_genes[dist_list.index(min(dist_list))]
            ]['name']

        # Now write the row for the enhancer table
        if no_format_table:
            new_enhancer_line = list(line)
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes]))
            )
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes]))
            )
            new_enhancer_line.append(closest_gene)
        else:
            new_enhancer_line = line[0:9]
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes]))
            )
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes]))
            )
            new_enhancer_line.append(closest_gene)
            new_enhancer_line += line[-2:]

        enhancer_to_gene_table.append(new_enhancer_line)

        # Now grab all overlapping and proximal genes for the gene ordered table
        overall_gene_list += overlapping_genes
        for ref_id in overlapping_genes:
            gene_dict['overlapping'][ref_id].append(enhancer_string)
            rank_dict[ref_id].append(int(line[-2]))
            super_dict[ref_id].append(int(line[-1]))

        overall_gene_list += proximal_genes
        for ref_id in proximal_genes:
            gene_dict['proximal'][ref_id].append(enhancer_string)
            rank_dict[ref_id].append(int(line[-2]))
            super_dict[ref_id].append(int(line[-1]))

    # End loop through enhancers
    # Make table by gene
    overall_gene_list = utils.uniquify(overall_gene_list)

    # Use enhancer rank to order
    rank_order = utils.order([min(rank_dict[x]) for x in overall_gene_list])

    used_names = []
    for i in rank_order:
        ref_id = overall_gene_list[i]
        gene_name = start_dict[ref_id]['name']
        if used_names.count(gene_name) and unique_genes:
            continue
        else:
            used_names.append(gene_name)

        prox_enhancers = gene_dict['overlapping'][ref_id] + gene_dict['proximal'][ref_id]

        super_status = max(super_dict[ref_id])
        enhancer_ranks = ','.join([str(x) for x in rank_dict[ref_id]])

        new_line = [gene_name, ref_id, ','.join(prox_enhancers), enhancer_ranks, super_status]
        gene_to_enhancer_table.append(new_line)

    # Resort enhancer_to_gene_table
    if no_format_table:
        return enhancer_to_gene_table, gene_to_enhancer_table
    else:
        enhancer_order = utils.order([int(line[-2]) for line in enhancer_to_gene_table[1:]])
        sorted_table = [enhancer_to_gene_table[0]]
        for i in enhancer_order:
            sorted_table.append(enhancer_to_gene_table[(i + 1)])

        return sorted_table, gene_to_enhancer_table
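

# Illustrative usage sketch (not part of the original module): the annotation-only variant
# needs no BAM files because it skips the TSS-signal step and returns just two tables.
# File paths are hypothetical placeholders.
def _example_map_enhancer_to_gene():
    enhancer_table, gene_table = map_enhancer_to_gene(
        annot_file='annotation/hg19_refseq.ucsc',
        enhancer_file='rose_out/sample_AllEnhancers.table.txt',
        unique_genes=True,
        search_window=50000,
    )
    utils.unparse_table(enhancer_table, 'rose_out/sample_ENHANCER_TO_GENE.txt', '\t')
    utils.unparse_table(gene_table, 'rose_out/sample_GENE_TO_ENHANCER.txt', '\t')
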
def region_stitching(reference_collection, name, out_folder, stitch_window, tss_window,
                     annot_file, remove_tss=True):
    """Region stitching."""
    print('PERFORMING REGION STITCHING')
    # First have to turn the bound region file into a locus collection
    # Need to make sure this names correctly... Each region should have a unique name
    # reference_collection

    debug_output = []
    # Filter out all bound regions that overlap the TSS of an ACTIVE GENE
    if remove_tss:
        print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF {}BP'.format(tss_window))
        # First make a locus collection of TSS
        start_dict = utils.make_start_dict(annot_file)

        # Now make TSS loci for active genes
        remove_ticker = 0

        # This loop makes a locus centered around +/- tss_window of transcribed genes then adds
        # it to the list tss_loci
        tss_loci = []
        for gene_id in start_dict.keys():
            tss_loci.append(utils.make_tss_locus(gene_id, start_dict, tss_window, tss_window))

        # This turns the tss_loci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tss_collection = utils.LocusCollection(tss_loci, 50)

        # Gives all the loci in reference_collection
        bound_loci = reference_collection.get_loci()

        # This loop will check if each bound region is contained by the TSS exclusion zone
        # This will drop out a lot of the promoter-only regions that are tiny
        # Typical exclusion window is around 2kb
        for locus in bound_loci:
            if tss_collection.get_containers(locus, 'both'):
                # If true, the bound locus overlaps an active gene
                reference_collection.remove(locus)
                debug_output.append([str(locus), locus.id, 'CONTAINED'])
                remove_ticker += 1
        print('REMOVED {} LOCI BECAUSE THEY WERE CONTAINED BY A TSS'.format(remove_ticker))

    # Reference_collection is now all enriched region loci that don't overlap an active TSS
    if not stitch_window:
        print('DETERMINING OPTIMUM STITCHING PARAMETER')
        opt_collection = copy.deepcopy(reference_collection)
        stitch_window = optimize_stitching(opt_collection, name, out_folder, step_size=500)
    print('USING A STITCHING PARAMETER OF {}'.format(stitch_window))

    stitched_collection = reference_collection.stitch_collection(stitch_window, 'both')

    if remove_tss:
        # Now replace any stitched region that overlaps more than two distinct genes with the
        # original loci that were there
        fixed_loci = []
        tss_loci = []
        for gene_id in start_dict.keys():
            tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 50, 50))

        # This turns the tss_loci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tss_collection = utils.LocusCollection(tss_loci, 50)
        remove_ticker = 0
        original_ticker = 0
        for stitched_locus in stitched_collection.get_loci():
            overlapping_tss_loci = tss_collection.get_overlap(stitched_locus, 'both')
            tss_names = [start_dict[tss_locus.id]['name'] for tss_locus in overlapping_tss_loci]
            tss_names = utils.uniquify(tss_names)
            if len(tss_names) > 2:
                # Stitched_collection.remove(stitched_locus)
                original_loci = reference_collection.get_overlap(stitched_locus, 'both')
                original_ticker += len(original_loci)
                fixed_loci += original_loci
                debug_output.append([str(stitched_locus), stitched_locus.id, 'MULTIPLE_TSS'])
                remove_ticker += 1
            else:
                fixed_loci.append(stitched_locus)

        print(
            'REMOVED {} STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs'.format(remove_ticker)
        )
        print('ADDED BACK {} ORIGINAL LOCI'.format(original_ticker))
        fixed_collection = utils.LocusCollection(fixed_loci, 50)
        return fixed_collection, debug_output, stitch_window
    else:
        return stitched_collection, debug_output, stitch_window
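

# Illustrative usage sketch (hypothetical paths and parameters): load enriched regions from a
# GFF, stitch them with a 12.5kb window, and exclude regions within +/-2.5kb of an annotated
# TSS. Passing a falsy stitch_window (as rose() does when no --stitch value is given) would
# instead trigger the optimize_stitching step.
def _example_region_stitching():
    peaks_gff = utils.parse_table('gff/sample_peaks.gff', '\t')
    reference_collection = utils.gff_to_locus_collection(peaks_gff)
    stitched_collection, debug_output, stitch_window = region_stitching(
        reference_collection,
        name='sample_peaks',
        out_folder='rose_out/',
        stitch_window=12500,
        tss_window=2500,
        annot_file='annotation/hg19_refseq.ucsc',
        remove_tss=True,
    )
    print('STITCHED INTO {} REGIONS USING A {}BP WINDOW'.format(
        len(stitched_collection.get_loci()), stitch_window))
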
def map_collection(stitched_collection, reference_collection, bam_file_list, mapped_folder,
                   output, ref_name):
    """Makes a table of factor density in a stitched locus.

    Also ranks the table by the number of loci stitched together.
    """
    print('FORMATTING TABLE')
    loci = stitched_collection.get_loci()

    locus_table = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]

    loci_len_list = []

    # Strip out any that are in chrY
    for locus in list(loci):
        if locus.chr == 'chrY':
            loci.remove(locus)

    for locus in loci:
        loci_len_list.append(locus.len())

    len_order = utils.order(loci_len_list, decreasing=True)
    ticker = 0
    for i in len_order:
        ticker += 1
        if not ticker % 1000:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        ref_enrich_size = 0
        ref_overlapping_loci = reference_collection.get_overlap(locus, 'both')
        for ref_locus in ref_overlapping_loci:
            ref_enrich_size += ref_locus.len()

        try:
            stitch_count = int(locus.id.split('_')[0])
        except ValueError:
            stitch_count = 1

        coords = [int(x) for x in locus.coords()]

        locus_table.append(
            [locus.id, locus.chr, min(coords), max(coords), stitch_count, ref_enrich_size]
        )

    print('GETTING MAPPED DATA')
    print("USING A BAM FILE LIST:")
    print(bam_file_list)
    for bam_file in bam_file_list:
        bam_file_name = bam_file.split('/')[-1]

        print('GETTING MAPPING DATA FOR {}'.format(bam_file))
        # Assumes standard convention for naming enriched region GFFs
        # Opening up the mapped GFF
        print('OPENING {}{}_{}_MAPPED/matrix.txt'.format(mapped_folder, ref_name, bam_file_name))

        mapped_gff = utils.parse_table(
            '{}{}_{}_MAPPED/matrix.txt'.format(mapped_folder, ref_name, bam_file_name),
            '\t',
        )

        signal_dict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR {}'.format(bam_file))
        mapped_loci = []
        for line in mapped_gff[1:]:
            chrom = line[1].split('(')[0]
            start = int(line[1].split(':')[-1].split('-')[0])
            end = int(line[1].split(':')[-1].split('-')[1])
            mapped_loci.append(utils.Locus(chrom, start, end, '.', line[0]))
            try:
                signal_dict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)
                continue

        mapped_collection = utils.LocusCollection(mapped_loci, 500)
        locus_table[0].append(bam_file_name)

        for i in range(1, len(locus_table)):
            signal = 0.0
            line = locus_table[i]
            line_locus = utils.Locus(line[1], line[2], line[3], '.')
            overlapping_regions = mapped_collection.get_overlap(line_locus, sense='both')
            for region in overlapping_regions:
                signal += signal_dict[region.id]
            locus_table[i].append(signal)

    utils.unparse_table(locus_table, output, '\t')
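

# Illustrative usage sketch (hypothetical names): once bamliquidator has written
# mappedGFF/<ref_name>_<bam_name>_MAPPED/matrix.txt for each BAM, build the per-region
# density table that feeds ROSE2_callSuper.R. The collections are assumed to come from
# region_stitching and the original reference GFF.
def _example_map_collection(stitched_collection, reference_collection):
    map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list=['data/h3k27ac_treated.bam', 'data/input_control.bam'],
        mapped_folder='rose_out/mappedGFF/',
        output='rose_out/sample_12KB_STITCHED_ENHANCER_REGION_MAP.txt',
        ref_name='sample_12KB_STITCHED',
    )
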
def rose(input_file, rankby, output_folder, genome, bams=None, control='', stitch=None, tss=0,
         mask_file=None):
    """ROSE2 main function."""
    debug = False

    # Making the out folder if it doesn't exist
    out_folder = utils.format_folder(output_folder, True)

    # Figuring out folder schema
    gff_folder = utils.format_folder(out_folder + 'gff/', True)
    mapped_folder = utils.format_folder(out_folder + 'mappedGFF/', True)

    # Getting the input file
    if input_file.split('.')[-1] == 'bed':
        # Converting a BED file
        input_gff_name = input_file.split('/')[-1][0:-4]
        input_gff_file = '{}{}.gff'.format(gff_folder, input_gff_name)
        utils.bed_to_gff(input_file, input_gff_file)
    elif input_file.split('.')[-1] == 'gff':
        # Copy the input GFF to the GFF folder
        input_gff_file = input_file
        os.system('cp {} {}'.format(input_gff_file, gff_folder))
    else:
        print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # Copy the input GFF to the GFF folder
        input_gff_file = input_file
        os.system('cp {} {}'.format(input_gff_file, gff_folder))

    # Getting the list of BAM files to process
    bam_file_list = [rankby]
    if control:
        bam_file_list.append(control)

    if bams:
        bam_file_list += bams.split(',')

    for bam in bam_file_list:
        if not os.path.isfile('{}.bai'.format(bam)):
            print('INDEX FILE FOR {} IS MISSING'.format(bam))
            sys.exit()

    # Optional args
    # Stitch parameter
    stitch_window = '' if not stitch else int(stitch)

    # TSS options: a nonzero exclusion window turns on TSS removal during stitching
    tss_window = int(tss)
    remove_tss = tss_window != 0

    # Getting the bound region file used to define enhancers
    print('USING {} AS THE INPUT GFF'.format(input_gff_file))
    input_name = input_gff_file.split('/')[-1].split('.')[0]

    # Getting the genome
    print('USING {} AS THE GENOME'.format(genome))

    # Getting the correct annotation file
    annotation_path = '{}/annotation'.format(ROOT_DIR)
    genome_dict = {
        'HG18': '{}/hg18_refseq.ucsc'.format(annotation_path),
        'MM9': '{}/mm9_refseq.ucsc'.format(annotation_path),
        'HG19': '{}/hg19_refseq.ucsc'.format(annotation_path),
        'MM8': '{}/mm8_refseq.ucsc'.format(annotation_path),
        'MM10': '{}/mm10_refseq.ucsc'.format(annotation_path),
        'RN4': '{}/rn4_refseq.ucsc'.format(annotation_path),
        'RN6': '{}/rn6_refseq.ucsc'.format(annotation_path),
    }
    annot_file = genome_dict[genome.upper()]

    # Get chroms found in the bams
    print('GETTING CHROMS IN BAMFILES')
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # Loading in the GFF and filtering by chrom
    print('LOADING AND FILTERING THE GFF')
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)

    # Loading in the bound region reference collection
    print('LOADING IN GFF REGIONS')
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print('STARTING WITH {} INPUT REGIONS'.format(len(reference_collection)))
    print('CHECKING REFERENCE COLLECTION:')
    rose2_utils.check_ref_collection(reference_collection)

    # Masking the reference collection
    # See if there's a mask
    if mask_file:
        print('USING MASK FILE {}'.format(mask_file))
        # If it's a bed file
        if mask_file.split('.')[-1].upper() == 'BED':
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split('.')[-1].upper() == 'GFF':
            mask_gff = utils.parse_table(mask_file, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print('LOADING {} MASK REGIONS'.format(len(mask_collection)))

        # Now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci if not mask_collection.get_overlap(locus, 'both')
        ]
        print("FILTERED OUT {} LOCI THAT WERE MASKED IN {}"
              "".format(len(reference_loci) - len(filtered_loci), mask_file))
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # Now stitch regions
    print('STITCHING REGIONS TOGETHER')
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # Now make a stitched collection GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)

    # Making sure start/stop ordering is correct
    for line in stitched_gff:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = '{}{}_{}KB_STITCHED.gff'.format(
            gff_folder, input_name, str(stitch_window / 1000),
        )
        stitched_gff_name = '{}_{}KB_STITCHED'.format(input_name, str(stitch_window / 1000))
        debug_out_file = '{}{}_{}KB_STITCHED.debug'.format(
            gff_folder, input_name, str(stitch_window / 1000),
        )
    else:
        stitched_gff_file = '{}{}_{}KB_STITCHED_TSS_DISTAL.gff'.format(
            gff_folder, input_name, str(stitch_window / 1000),
        )
        stitched_gff_name = '{}_{}KB_STITCHED_TSS_DISTAL'.format(
            input_name, str(stitch_window / 1000),
        )
        debug_out_file = '{}{}_{}KB_STITCHED_TSS_DISTAL.debug'.format(
            gff_folder, input_name, str(stitch_window / 1000),
        )

    # Writing debug output to disk
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS {}'.format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, '\t')

    # Write the GFF to disk
    print('WRITING STITCHED GFF TO DISK AS {}'.format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, '\t')

    # Setting up the overall output file
    output_file_1 = out_folder + stitched_gff_name + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO {}'.format(output_file_1))

    # Try to use the bamliquidator_batch script on the cluster; otherwise, fail over to a local
    # copy (in PATH); otherwise fail
    bamliquidator_path = 'bamliquidator_batch'

    bam_file_list_unique = list(bam_file_list)
    bam_file_list_unique = utils.uniquify(bam_file_list_unique)

    # Prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:
        bam_file_name = bam_file.split('/')[-1]

        # Mapping to the stitched GFF
        mapped_out_1_folder = '{}{}_{}_MAPPED'.format(
            mapped_folder, stitched_gff_name, bam_file_name,
        )
        mapped_out_1_file = '{}{}_{}_MAPPED/matrix.txt'.format(
            mapped_folder, stitched_gff_name, bam_file_name,
        )
        if utils.check_output(mapped_out_1_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                stitched_gff_file, mapped_out_1_file))
        else:
            cmd1 = "{} --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                bamliquidator_path,
                stitched_gff_file,
                mapped_out_1_folder,
                bam_file,
            )
            print(cmd1)
            os.system(cmd1)
            if utils.check_output(mapped_out_1_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}"
                      "".format(stitched_gff_file, bam_file_name))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}"
                      "".format(stitched_gff_file, bam_file_name))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')

    # Calculate density by region
    # TODO: Need to fix this function to account for different outputs of liquidator
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file_1,
        ref_name=stitched_gff_name,
    )

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    control_name = control.split('/')[-1] if control else 'NONE'
    cmd = 'ROSE2_callSuper.R {} {} {} {}'.format(
        out_folder,
        output_file_1,
        input_name,
        control_name,
    )
    print(cmd)
    os.system(cmd)

    # Calling the gene mapper
    time.sleep(5)
    tables = [
        "_SuperEnhancers.table.txt",
        "_StretchEnhancers.table.txt",
        "_SuperStretchEnhancers.table.txt",
    ]
    for table in tables:
        table_file = "{}{}".format(input_name, table)
        genemapper.map(
            os.path.join(out_folder, table_file),
            genome,
            rankby,
            control,
        )
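

# Illustrative end-to-end invocation (hypothetical paths): this mirrors what a thin CLI wrapper
# might pass through to rose(). Leaving stitch=None lets region_stitching optimize the stitching
# parameter, and tss=0 leaves the TSS exclusion window disabled.
def _example_rose():
    rose(
        input_file='peaks/sample_peaks.bed',        # hypothetical peak calls (BED or GFF)
        rankby='data/h3k27ac_treated.bam',          # hypothetical ranking bam
        output_folder='rose_out/',
        genome='HG19',
        control='data/input_control.bam',           # hypothetical control bam
        stitch=None,
        tss=0,
        mask_file=None,
    )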