def find_valleys(gene_to_enhancer_dict, bam_file_list, project_name, project_folder, cutoff=0.2):
    """Find valley loci within each gene's associated enhancer regions.

    Scores 10 bp bins across each region, calls bins whose score exceeds the
    cutoff as valleys, stitches adjacent valleys per gene, and writes all
    stitched valleys to a single bed file. Returns the path to that bed file.
    """
    # First make the bamDict
    all_valley_bed = []
    valley_dict = {}

    # Start w/ a bam_file_list and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bam_file_list]
    max_read_length = max([bam.get_read_lengths()[0] for bam in bam_list])

    gene_list = list(gene_to_enhancer_dict.keys())
    gene_list.sort()

    ticker = 0
    print("number of regions processed:")
    for gene in gene_list:
        valley_dict[gene] = []
        for region in gene_to_enhancer_dict[gene]:
            if ticker % 100 == 0:
                print(ticker)
            ticker += 1

            score_array = score_valley(region, bam_list, max_read_length)
            for index, score in enumerate(score_array):
                if score > cutoff:
                    # Each array index corresponds to a 10 bp bin in the region
                    valley = utils.Locus(
                        region.chr,
                        region.start + index * 10,
                        region.start + (index + 1) * 10,
                        ".",
                    )
                    valley_dict[gene].append(valley)

        stitched_valleys = stitch_valleys(valley_dict[gene])
        for valley in stitched_valleys:
            all_valley_bed.append([valley.chr, valley.start, valley.end])
        valley_dict[gene] = stitched_valleys

    all_bed_path = project_folder + project_name + "_all_valleys.bed"
    utils.unparse_table(all_valley_bed, all_bed_path, "\t")

    return all_bed_path
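

# Usage sketch for find_valleys (hypothetical paths and coordinates; assumes the
# `utils` module from this package). score_valley reads coverage from the bams,
# so the bam path below would need to point at a real, indexed bam file.
def _example_find_valleys():
    gene_to_enhancer_dict = {
        'MYB': [utils.Locus('chr6', 135180000, 135220000, '.')],
    }
    all_bed_path = find_valleys(
        gene_to_enhancer_dict,
        ['/data/h3k27ac_rep1.bam'],  # hypothetical bam path
        'example_project',
        '/output/example/',
        cutoff=0.2,
    )
    print(all_bed_path)  # /output/example/example_project_all_valleys.bed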


def filter_subpeaks(subpeak_file, analysis_name, output_folder):
    """Take the initial subpeaks, stitch them, and write them to a single bed file."""
    # Stitch the subpeaks
    print(subpeak_file)
    subpeak_collection = utils.import_bound_region(
        subpeak_file, '%s_subpeak' % (analysis_name))
    subpeak_collection = subpeak_collection.stitch_collection()
    subpeak_loci = subpeak_collection.get_loci()

    all_sub_bed = []
    for locus in subpeak_loci:
        bed_line = [locus.chr, locus.start, locus.end, '.', locus.id]
        all_sub_bed.append(bed_line)

    all_bed_path = output_folder + analysis_name + '_all_subpeak.bed'
    utils.unparse_table(all_sub_bed, all_bed_path, '\t')

    return all_bed_path
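

# Usage sketch for filter_subpeaks (hypothetical paths): stitch a subpeak bed
# and return the path of the stitched bed consumed by downstream motif finding.
def _example_filter_subpeaks():
    all_bed_path = filter_subpeaks(
        '/data/atac_summits.bed',  # hypothetical subpeak bed
        'example_project',
        '/output/example/',
    )
    print(all_bed_path)  # /output/example/example_project_all_subpeak.bed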


def format_network_output(graph, output_folder, analysis_name):
    """Take the networkx graph and write all output figures, tables, etc."""
    # Output the network as a .ntx dictionary of lists
    network_filename = output_folder + analysis_name + '.ntx'
    with open(network_filename, 'wb') as network_file:
        network_dict_of_lists = nx.to_dict_of_lists(graph)
        pickle.dump(network_dict_of_lists, network_file)

    # Output the adjacency list and node list
    node_file = output_folder + analysis_name + '_NODELIST.txt'
    if nx.__version__[0] == '1':
        node_list = [[n] for n in graph.nodes_iter()]
    elif nx.__version__[0] == '2':
        node_list = [[n] for n in graph.nodes()]
    else:
        print('ERROR: UNSUPPORTED VERSION OF NETWORKX MODULE')
        sys.exit()
    utils.unparse_table(node_list, node_file, '\t')

    adj_file = output_folder + analysis_name + '_ADJ_LIST.txt'
    if nx.__version__[0] == '1':
        adj_list = graph.adjacency_list()
    elif nx.__version__[0] == '2':
        adj_list = [list(n[1].keys()) for n in graph.adjacency()]
    else:
        print('ERROR: UNSUPPORTED VERSION OF NETWORKX MODULE')
        sys.exit()
    utils.unparse_table(adj_list, adj_file, '\t')

    edges_table = [['From', 'To']]
    for i, gene in enumerate(node_list):
        for j in adj_list[i]:
            newline = [gene[0], j]
            edges_table.append(newline)

    edge_file = output_folder + analysis_name + '_EDGE_LIST.txt'
    utils.unparse_table(edges_table, edge_file, '\t')

    # Make the degree table
    deg_table = [['Tf', 'In_Degree', 'Out_Degree', 'Total_Connections']]
    deg_file = output_folder + analysis_name + '_DEGREE_TABLE.txt'

    # Shouldn't we output the table only for the TFs that have motifs?
    # e.g. for candidate_motif in graph.nodes() ...
    for node in graph.nodes():
        newline = [
            node,
            graph.in_degree()[node],
            graph.out_degree()[node],
            graph.degree()[node],
        ]
        deg_table.append(newline)

    utils.unparse_table(deg_table, deg_file, '\t')

    print('DEFINING THE CORE REGULATORY CIRCUIT')
    # NOTE: Graph.selfloop_edges() was removed in networkx 2.4; newer versions
    # provide nx.selfloop_edges(graph) instead
    autoreg = graph.selfloop_edges()
    self_loops = [x for x, y in autoreg]
    self_loop_file = output_folder + analysis_name + '_SELF_LOOPS.txt'
    utils.unparse_table(self_loops, self_loop_file, '')

    un_dir_graph = nx.from_edgelist(pairs(self_loops, graph))
    clique_gen = find_cliques_recursive(un_dir_graph)
    out_degree_dict = graph.out_degree()
    clique_ranking = get_clique_ranking(clique_gen, out_degree_dict)

    factor_enrichment_dict = {}
    for factor in self_loops:
        factor_enrichment_dict[factor] = 0

    clique_len = 0
    top_cliques = []
    min_clique = ()
    for clique, score in clique_ranking:
        clique_len += 1
        for factor in clique:
            factor_enrichment_dict[factor] += 1

        # Keep only the 100 highest-scoring cliques seen so far
        if clique_len <= 100:
            top_cliques.append((clique, score))
            continue

        if not min_clique:
            min_clique = min(top_cliques, key=lambda x: x[1])

        if score > min_clique[1]:
            top_cliques.remove(min_clique)
            top_cliques.append((clique, score))
            min_clique = min(top_cliques, key=lambda x: x[1])

    top_cliques.sort(reverse=True, key=lambda x: x[1])
    clique_file = output_folder + analysis_name + '_CLIQUE_SCORES_DEGREE.txt'
    utils.unparse_table(top_cliques, clique_file, '\t')

    factor_ranking_table = []
    for factor in self_loops:
        newline = [factor, factor_enrichment_dict[factor] / float(clique_len)]
        factor_ranking_table.append(newline)

    factor_ranking_file = output_folder + analysis_name + '_ENRICHED_CLIQUE_FACTORS.txt'
    utils.unparse_table(factor_ranking_table, factor_ranking_file, '\t')
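

# Usage sketch for format_network_output (hypothetical data): a toy directed TF
# graph where both factors autoregulate and cross-regulate. Assumes the `pairs`
# and `get_clique_ranking` helpers from this module and a networkx version that
# still provides Graph.selfloop_edges().
def _example_format_network_output():
    toy = nx.DiGraph(name='toy')
    toy.add_edges_from([
        ('GATA1', 'GATA1'),  # a self-loop marks an autoregulated TF
        ('TAL1', 'TAL1'),
        ('GATA1', 'TAL1'),
        ('TAL1', 'GATA1'),
    ])
    format_network_output(toy, '/output/example/', 'toy')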


def build_graph(edge_dict, gene_to_enhancer_dict, output_folder, analysis_name, cutoff=1):
    """Build a directed target graph from the collapsed edge dictionary.

    Require at least n motifs to constitute an edge, where n is set by cutoff.
    Default is 1.
    """
    node_list = list(edge_dict.keys())
    node_list.sort()

    # This graph holds only edges between TFs
    graph = nx.DiGraph(name=analysis_name)
    graph.add_nodes_from(node_list)

    # This table stores ALL edges identified by motifs
    edge_table = [[
        'SOURCE', 'TARGET', 'CHROM', 'START', 'STOP', 'REGION_ID', 'TF_INTERACTION',
    ]]
    edge_output = '{}{}_EDGE_TABLE.txt'.format(output_folder, analysis_name)

    for source in node_list:
        print(source)
        target_list = list(edge_dict[source].keys())
        target_list.sort()
        for target in target_list:
            # Determine which of the target's regions each edge locus overlaps
            target_regions = gene_to_enhancer_dict[target]
            target_collection = utils.LocusCollection(target_regions, 50)

            # Get the edges hitting that target
            edge_loci = edge_dict[source][target]
            if node_list.count(target) > 0:
                tf_interaction = 1
            else:
                tf_interaction = 0

            # Only add to the graph if this is a TF/TF interaction
            if len(edge_loci) >= cutoff and node_list.count(target) > 0:
                graph.add_edge(source, target)

            # Now for each edge, add to the table
            for edge_locus in edge_loci:
                region_string = ','.join([
                    locus.id for locus in target_collection.get_overlap(edge_locus)
                ])
                edge_line = [
                    source,
                    target,
                    edge_locus.chr,
                    edge_locus.start,
                    edge_locus.end,
                    region_string,
                    tf_interaction,
                ]
                edge_table.append(edge_line)

    utils.unparse_table(edge_table, edge_output, '\t')
    return graph
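

# Usage sketch for build_graph (hypothetical data; assumes utils.Locus accepts
# an id as its fifth argument): two TFs whose motifs land in each other's
# enhancer regions, so both directed edges pass the default cutoff of 1.
def _example_build_graph():
    motif_locus = utils.Locus('chr1', 1000, 1010, '.')
    edge_dict = {
        'GATA1': {'TAL1': [motif_locus]},
        'TAL1': {'GATA1': [motif_locus]},
    }
    gene_to_enhancer_dict = {
        'GATA1': [utils.Locus('chr1', 500, 2000, '.', 'GATA1_enh_1')],
        'TAL1': [utils.Locus('chr1', 500, 2000, '.', 'TAL1_enh_1')],
    }
    graph = build_graph(edge_dict, gene_to_enhancer_dict, '/output/example/', 'toy')
    print(list(graph.edges()))  # [('GATA1', 'TAL1'), ('TAL1', 'GATA1')]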


def collapse_fimo(fimo_output, candidate_tf_list, output_folder, analysis_name, motif_convert_file):
    """Collapse motifs from fimo.

    For each source node (TF) and each target node (gene enhancer regions),
    collapse overlapping motif instances, then write one bed per TF plus a
    single combined bed of all collapsed motifs.
    """
    # First build up the motif name conversion database
    motif_database = utils.parse_table(motif_convert_file, '\t')
    motif_database_dict = defaultdict(list)

    # The reverse of the other dict, from motif name to gene name
    # A motif can map to multiple genes
    for line in motif_database:
        motif_database_dict[line[0]].append(line[1])

    # Make the folder to store motif beds
    utils.format_folder('{}motif_beds/'.format(output_folder), True)

    edge_dict = {}

    # First layer are source nodes
    for tf in candidate_tf_list:
        edge_dict[tf] = defaultdict(list)

    # Next layer are target nodes, which are derived from the fimo output
    fimo_table = utils.parse_table(fimo_output, '\t')
    print(fimo_output)

    # fimo sometimes puts the region in either the first or second column
    fimo_line = fimo_table[1]
    if fimo_line[1].count('|') > 0:
        region_index = 1
    else:
        region_index = 2
    print('USING COLUMN {} OF FIMO OUTPUT FOR REGION'.format(region_index))

    for line in fimo_table[1:]:
        source_tfs = motif_database_dict[line[0]]  # motif id
        for source in source_tfs:
            if candidate_tf_list.count(source) == 0:
                continue
            region = line[region_index].split('|')

            target = region[0]
            if region_index == 2:
                target_locus = utils.Locus(
                    region[1],
                    int(region[2]) + int(line[3]),
                    int(region[2]) + int(line[4]),
                    '.',
                )
            else:
                target_locus = utils.Locus(
                    region[1],
                    int(region[2]) + int(line[2]),
                    int(region[2]) + int(line[3]),
                    '.',
                )

            # What's missing here is the enhancer id of the target locus
            try:
                edge_dict[source][target].append(target_locus)
            except KeyError:
                print('This motif is not in the network')
                print(line)
                sys.exit()

    # Now collapse this down in a meaningful way. Overlapping motifs count as a
    # single binding site, so a TF with many motifs that hit the same site over
    # and over again does not get over-counted.
    all_bed = []
    all_bed_path = '{}{}_all_motifs.bed'.format(output_folder, analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        target_nodes = edge_dict[tf].keys()
        bed_header = [
            'track name = "{}" description="{} motifs in {}"'.format(
                tf, tf, analysis_name)
        ]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '{}motif_beds/{}_motifs.bed'.format(output_folder, tf)
        for target in target_nodes:
            edge_collection = utils.LocusCollection(edge_dict[tf][target], 50)
            edge_collection = edge_collection.stitch_collection()
            edge_loci = edge_collection.get_loci()
            edge_dict[tf][target] = edge_loci
            for locus in edge_loci:
                bed_line = [locus.chr, locus.start, locus.end, target, '', '+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unparse_table(target_bed, target_bed_path, '\t')

    # Now the loci are all stitched up
    utils.unparse_table(all_bed, all_bed_path, '\t')

    return edge_dict
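

# Illustration of the coordinate arithmetic in collapse_fimo (hypothetical
# values): the fimo region field encodes 'target|chrom|regionStart', and fimo
# reports motif start/stop as offsets within that region, so adding the region
# start recovers genomic coordinates.
def _example_fimo_coordinates():
    region = 'GATA1|chr1|1000'.split('|')  # hypothetical fimo region field
    motif_start, motif_stop = 50, 61  # hypothetical fimo offsets
    target_locus = utils.Locus(
        region[1],
        int(region[2]) + motif_start,
        int(region[2]) + motif_stop,
        '.',
    )
    print(target_locus.chr, target_locus.start, target_locus.end)  # chr1 1050 1061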


def crc(enhancers,
        genome_input,
        chrom_path,
        output,
        analysis_name,
        bam=None,
        subpeak_file=None,
        mask_file=None,
        activity_path=None,
        const_extension=100,
        number=1,
        motifs=False,
        tfs='',
        config=''):
    """CRC main function."""
    # =====================================================================================
    # ===============================I. PARSING ARGUMENTS=================================
    # =====================================================================================

    genome = crc_utils.load_genome(
        genome_input,
        chrom_path,
        mask_file=mask_file,
        config_file=config,
    )

    motif_database_file = genome.return_feature('motif_database')
    motif_convert_file = genome.return_feature('motif_convert')

    # User input files
    enhancer_file = enhancers

    if bam is None and subpeak_file is None:
        print('ERROR: Must provide either bams for valley finding or subpeaks as a .bed')
        sys.exit()

    # Will need to fix bams down the line to take in multiple bams
    if bam:
        bam_file_list = [bam_path for bam_path in bam.split(',') if bam_path]
        print(bam_file_list)
    else:
        bam_file_list = []

    # Output folder and analysis name
    print(output)
    output_folder = utils.format_folder(output, True)

    print(
        '\n\n#======================================\n#===========I. DATA SUMMARY============\n#='
        '=====================================\n'
    )

    print('Analyzing TF connectivity for {}'.format(analysis_name))
    print('Writing output to {}'.format(output_folder))
    if subpeak_file:
        print('Using {} to define subpeaks for motif finding'.format(subpeak_file))
    else:
        print('Identifying valleys from .bam files')
    print('Using {} to define active genes'.format(activity_path))

    # =====================================================================================
    # =======================II. IDENTIFYING CANDIDATE TFS AND NODES======================
    # =====================================================================================

    print(
        '\n\n#======================================\n#===II. MAPPING GENES AND ENHANCERS====\n#='
        '=====================================\n'
    )

    (
        gene_region_table,
        gene_tf_region_table,
        enhancer_region_table,
        enhancer_tf_region_table,
        gene_summary_table,
        candidate_tf_list,
        gene_to_enhancer_dict,
    ) = crc_utils.gene_to_enhancer(genome, enhancer_file, activity_path)

    # Write these tables to disk
    gene_out = '{}{}_GENE_TABLE.txt'.format(output_folder, analysis_name)
    gene_tf_out = '{}{}_GENE_TF_TABLE.txt'.format(output_folder, analysis_name)
    enhancer_out = '{}{}_ENHANCER_TABLE.txt'.format(output_folder, analysis_name)
    enhancer_tf_out = '{}{}_ENHANCER_TF_TABLE.txt'.format(output_folder, analysis_name)
    summary_out = '{}{}_GENE_SUMMARY.txt'.format(output_folder, analysis_name)

    utils.unparse_table(enhancer_region_table, enhancer_out, '\t')
    utils.unparse_table(enhancer_tf_region_table, enhancer_tf_out, '\t')
    utils.unparse_table(gene_region_table, gene_out, '\t')
    utils.unparse_table(gene_tf_region_table, gene_tf_out, '\t')
    utils.unparse_table(gene_summary_table, summary_out, '\t')

    print(
        'Identified {} genes w/ proximal cis-regulatory elements'
        ''.format(len(gene_to_enhancer_dict))
    )
    print('Identified {} candidate TFs'.format(len(candidate_tf_list)))
    print(candidate_tf_list)

    # =====================================================================================
    # ==========================III. FINDING VALLEYS/SUBPEAKS=============================
    # =====================================================================================

    print(
        '\n\n#======================================\n#=====III. FINDING VALLEYS/SUBPEAKS====\n#='
        '=====================================\n'
    )

    # So here we would need to find valleys everywhere
    if subpeak_file is None:
        print('finding valleys')
        # Note: the tf_bed_path is for networks, all is for out degree finding
        all_bed_path = crc_utils.find_valleys(
            gene_to_enhancer_dict, bam_file_list, analysis_name, output_folder, cutoff=0.2
        )
    else:
        print('Using subpeaks from {}'.format(subpeak_file))
        all_bed_path = crc_utils.filter_subpeaks(
            subpeak_file, analysis_name, output_folder
        )

    # First make the subpeak bed and subpeak fasta for the tfs
    all_sub_bed, all_fasta = crc_utils.generate_subpeak_fasta(
        gene_to_enhancer_dict, all_bed_path, genome, analysis_name, const_extension
    )
    if subpeak_file is None:
        # This is the case where we did valleys and the only case where the
        # sub bed needs to be written out
        all_sub_out = '{}{}_all_subpeak.bed'.format(output_folder, analysis_name)
        utils.unparse_table(all_sub_bed, all_sub_out, '\t')

    # Write the all subpeak fasta out to disk
    all_fasta_out = '{}{}_all_subpeak.fasta'.format(output_folder, analysis_name)
    utils.unparse_table(all_fasta, all_fasta_out, '')

    # =====================================================================================
    # =================================IV. FINDING MOTIFS=================================
    # =====================================================================================

    print(
        '\n\n#======================================\n#======IV. RUNNING MOTIF FINDING=======\n#='
        '=====================================\n'
    )

    # First make background
    bg_path = crc_utils.make_motif_background(all_fasta_out, output_folder, analysis_name)

    # Find motifs for all regions
    fimo_out = crc_utils.find_motifs(
        all_fasta_out,
        bg_path,
        candidate_tf_list,
        output_folder,
        analysis_name,
        motif_convert_file,
        motif_database_file,
    )

    edge_dict = crc_utils.collapse_fimo(
        fimo_out,
        candidate_tf_list,
        output_folder,
        analysis_name,
        motif_convert_file,
    )

    # =====================================================================================
    # ============================V. RUNNING NETWORK ANALYSIS=============================
    # =====================================================================================

    print(
        '\n\n#======================================\n#========V. BUILDING NETWORK===========\n#='
        '=====================================\n'
    )

    print('building graph and edge table')
    graph = crc_utils.build_graph(
        edge_dict,
        gene_to_enhancer_dict,
        output_folder,
        analysis_name,
        cutoff=1,
    )

    crc_utils.format_network_output(graph, output_folder, analysis_name)

    print('FINISHED RUNNING CRC FOR {}'.format(analysis_name))
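

# Usage sketch for the full pipeline (all paths hypothetical): run CRC from an
# enhancer table plus a precomputed subpeak bed, skipping bam-based valley
# finding entirely.
def _example_crc():
    crc(
        enhancers='/data/example_AllEnhancers.table.txt',
        genome_input='hg19',
        chrom_path='/genomes/hg19/chrom/',
        output='/output/example/',
        analysis_name='example',
        subpeak_file='/data/example_subpeaks.bed',
        activity_path='/data/example_active_genes.txt',
    )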