Example #1
def find_valleys(gene_to_enhancer_dict,
                 bam_file_list,
                 project_name,
                 project_folder,
                 cutoff=0.2):
    """Returns a dictionary of refseqs with all valley loci that are associated.

    Returns 2 kinds of bed files. 1 = all

    """
    # First make the bamDict
    all_valley_bed = []
    valley_dict = {}

    # Start w/ a bam_file_list and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bam_file_list]
    max_read_length = max([bam.get_read_lengths()[0] for bam in bam_list])

    gene_list = list(gene_to_enhancer_dict.keys())
    gene_list.sort()
    ticker = 0
    print("number of regions processed:")
    for gene in gene_list:

        valley_dict[gene] = []

        for region in gene_to_enhancer_dict[gene]:
            if ticker % 100 == 0:
                print(ticker)
            ticker += 1
            score_array = score_valley(
                region,
                bam_list,
                max_read_length,
            )
            for index, score in enumerate(score_array):
                if score > cutoff:
                    valley = utils.Locus(
                        region.chr,
                        region.start + index * 10,
                        region.start + (index + 1) * 10,
                        ".",
                    )
                    valley_dict[gene].append(valley)

        stitched_valleys = stitch_valleys(valley_dict[gene])
        valley_dict[gene] = stitched_valleys
        for valley in stitched_valleys:
            all_valley_bed.append([valley.chr, valley.start, valley.end])

    all_bed_path = project_folder + project_name + "_all_valleys.bed"
    utils.unparse_table(all_valley_bed, all_bed_path, "\t")

    return all_bed_path
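Each region is scored in 10 bp bins; bins whose score exceeds `cutoff` become valley loci, which are then stitched per gene. A minimal usage sketch (the BAM path, gene, and coordinates are hypothetical):

gene_to_enhancer_dict = {
    'MYC': [utils.Locus('chr8', 128740000, 128760000, '.')],
}
bed_path = find_valleys(
    gene_to_enhancer_dict,
    ['h3k27ac_rep1.bam'],  # hypothetical BAM
    'example_project',
    './output/',  # trailing slash matters: paths are concatenated
    cutoff=0.2,
)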
Example #2
def filter_subpeaks(subpeak_file, analysis_name, output_folder):
    """Takes the initial subpeaks in, stitches them."""
    # Stitch the subpeaks
    print(subpeak_file)
    subpeak_collection = utils.import_bound_region(
        subpeak_file, '%s_subpeak' % (analysis_name))

    subpeak_collection = subpeak_collection.stitch_collection()

    subpeak_loci = subpeak_collection.get_loci()

    all_sub_bed = []
    for locus in subpeak_loci:
        bed_line = [locus.chr, locus.start, locus.end, '.', locus.id]
        all_sub_bed.append(bed_line)

    all_bed_path = output_folder + analysis_name + '_all_subpeak.bed'
    utils.unparse_table(all_sub_bed, all_bed_path, '\t')

    return all_bed_path
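Because the result is just a five-column BED (chrom, start, end, placeholder name, locus ID), a call reduces to the sketch below (the input path is hypothetical):

bed_path = filter_subpeaks('example_subpeaks.bed', 'example_analysis', './output/')
# -> ./output/example_analysis_all_subpeak.bed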
Example #3
def format_network_output(graph, output_folder, analysis_name):
    """Takes the networkx graph and returns all figures, tables, etc."""

    # Output the network as a .ntx dictionary of lists
    network_filename = output_folder + analysis_name + '.ntx'
    with open(network_filename, 'wb') as network_file:
        network_dict_of_lists = nx.to_dict_of_lists(graph)
        pickle.dump(network_dict_of_lists, network_file)

    # Output the adjacency list and nodelist
    node_file = output_folder + analysis_name + '_NODELIST.txt'
    if nx.__version__[0] == '1':
        node_list = [[n] for n in graph.nodes_iter()]
    elif nx.__version__[0] == '2':
        node_list = [[n] for n in graph.nodes()]
    else:
        print('ERROR: UNSUPPORTED VERSION OF NETWORKX MODULE')
        sys.exit()
    utils.unparse_table(node_list, node_file, '\t')

    adj_file = output_folder + analysis_name + '_ADJ_LIST.txt'

    if nx.__version__[0] == '1':
        adj_list = graph.adjacency_list()
    elif nx.__version__[0] == '2':
        adj_list = [list(n[1].keys()) for n in graph.adjacency()]
    else:
        print('ERROR: UNSUPPORTED VERSION OF NETWORKX MODULE')
        sys.exit()

    utils.unparse_table(adj_list, adj_file, '\t')

    edges_table = [['From', 'To']]
    for i, gene in enumerate(node_list):
        for j in adj_list[i]:
            newline = [gene[0], j]
            edges_table.append(newline)

    edge_file = output_folder + analysis_name + '_EDGE_LIST.txt'
    utils.unparse_table(edges_table, edge_file, '\t')

    # Make the degree table
    deg_table = [['Tf', 'In_Degree', 'Out_Degree', 'Total_Connections']]
    deg_file = output_folder + analysis_name + '_DEGREE_TABLE.txt'

    # Shouldn't we output the table only for the TFs that have motifs?
    # for candidate_motifs in graph.nodes()...
    for node in graph.nodes():
        newline = [
            node,
            graph.in_degree()[node],
            graph.out_degree()[node],
            graph.degree()[node]
        ]
        deg_table.append(newline)

    utils.unparse_table(deg_table, deg_file, '\t')

    print('DEFINING THE CORE REGULATORY CIRCUIT')

    if nx.__version__[0] == '1':
        autoreg = graph.selfloop_edges()
    else:
        # The method was removed in networkx >= 2.4; use the module-level function
        autoreg = nx.selfloop_edges(graph)
    self_loops = [x for x, y in autoreg]
    self_loop_file = output_folder + analysis_name + '_SELF_LOOPS.txt'
    utils.unparse_table(self_loops, self_loop_file, '')

    un_dir_graph = nx.from_edgelist(pairs(self_loops, graph))
    clique_gen = find_cliques_recursive(un_dir_graph)
    out_degree_dict = graph.out_degree()

    clique_ranking = get_clique_ranking(clique_gen, out_degree_dict)

    factor_enrichment_dict = {}
    for factor in self_loops:
        factor_enrichment_dict[factor] = 0

    clique_len = 0
    top_cliques = []
    min_clique = ()
    for clique, score in clique_ranking:
        clique_len += 1
        for factor in clique:
            factor_enrichment_dict[factor] += 1

        # Get top 100 cliques
        if clique_len <= 100:
            top_cliques.append((clique, score))
            continue

        if not min_clique:
            min_clique = min(top_cliques, key=lambda x: x[1])

        if score > min_clique[1]:
            top_cliques.remove(min_clique)
            top_cliques.append((clique, score))
            min_clique = min(top_cliques, key=lambda x: x[1])

    top_cliques.sort(reverse=True, key=lambda x: x[1])
    clique_file = output_folder + analysis_name + '_CLIQUE_SCORES_DEGREE.txt'
    utils.unparse_table(top_cliques, clique_file, '\t')

    factor_ranking_table = []
    for factor in self_loops:
        newline = [factor, factor_enrichment_dict[factor] / float(clique_len)]
        factor_ranking_table.append(newline)

    factor_ranking_file = output_folder + analysis_name + '_ENRICHED_CLIQUE_FACTORS.txt'
    utils.unparse_table(factor_ranking_table, factor_ranking_file, '\t')
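The top-100 bookkeeping above rescans top_cliques with min() whenever the current minimum is displaced; the same selection can be phrased as a bounded min-heap. A standalone sketch of that idea, not the module's code:

import heapq

def top_n_cliques(clique_ranking, n=100):
    """Keep the n highest-scoring (clique, score) pairs from an iterable."""
    heap = []  # min-heap on score, so heap[0] is always the weakest survivor
    for clique, score in clique_ranking:
        if len(heap) < n:
            heapq.heappush(heap, (score, clique))
        elif score > heap[0][0]:
            heapq.heapreplace(heap, (score, clique))
    # Highest score first, matching the final sort in format_network_output
    return [(clique, score) for score, clique in sorted(heap, reverse=True)]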
Example #4
def build_graph(edge_dict,
                gene_to_enhancer_dict,
                output_folder,
                analysis_name,
                cutoff=1):
    """Build a target graph from the collapsed edge dictionary.

    Require at least n motifs to constitute an edge where n is set by cutoff.
    Default is 1.

    """
    node_list = list(edge_dict.keys())
    node_list.sort()

    # This is only edges between TFs
    graph = nx.DiGraph(name=analysis_name)
    graph.add_nodes_from(node_list)

    # This stores ALL edges identified by motifs
    edge_table = [[
        'SOURCE', 'TARGET', 'CHROM', 'START', 'STOP', 'REGION_ID',
        'TF_INTERACTION'
    ]]
    edge_output = '{}{}_EDGE_TABLE.txt'.format(output_folder, analysis_name)

    for source in node_list:
        print(source)
        target_list = list(edge_dict[source].keys())
        target_list.sort()
        for target in target_list:

            # See which of the target's regions the edges overlap
            target_regions = gene_to_enhancer_dict[target]
            target_collection = utils.LocusCollection(target_regions, 50)

            # Get the edges hitting that target
            edge_loci = edge_dict[source][target]
            tf_interaction = 1 if target in node_list else 0

            # Only add to the graph if this is a TF/TF interaction
            if len(edge_loci) >= cutoff and target in node_list:
                graph.add_edge(source, target)
                graph.add_edge(source, target)

            # Now for each edge, add to the table
            for edge_locus in edge_loci:
                region_string = ','.join([
                    locus.id
                    for locus in target_collection.get_overlap(edge_locus)
                ])
                edge_line = [
                    source,
                    target,
                    edge_locus.chr,
                    edge_locus.start,
                    edge_locus.end,
                    region_string,
                    tf_interaction,
                ]
                edge_table.append(edge_line)

    utils.unparse_table(edge_table, edge_output, '\t')
    return graph
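build_graph expects edge_dict keyed source TF -> target gene -> list of motif loci, which is what collapse_fimo (below) produces. A minimal hand-built input might look like this; the names and coordinates are hypothetical, and the fifth utils.Locus argument is assumed to be the locus ID that the locus.id reads above imply:

edge_dict = {
    'GATA3': {'FOXA1': [utils.Locus('chr14', 38058700, 38058720, '.')]},
    'FOXA1': {},
}
gene_to_enhancer_dict = {
    'FOXA1': [utils.Locus('chr14', 38058000, 38061000, '.', 'enhancer_1')],
}
graph = build_graph(edge_dict, gene_to_enhancer_dict, './output/', 'example', cutoff=1)
# Yields a single TF/TF edge: GATA3 -> FOXA1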
Example #5
def collapse_fimo(fimo_output, candidate_tf_list, output_folder, analysis_name,
                  motif_convert_file):
    """Collapses motifs from fimo.

    For each source node (TF) and each target node (gene enhancer regions), collapse motif
    instances then spit out a ginormous set of beds and a single crazy collapsed bed.

    """
    # First build up the motif name conversion database
    motif_database = utils.parse_table(motif_convert_file, '\t')
    motif_database_dict = defaultdict(list)

    # The reverse of the other dict, from motif name to gene name
    # A motif can go to multiple genes
    for line in motif_database:
        motif_database_dict[line[0]].append(line[1])

    # Make the folder to store motif beds
    utils.format_folder('{}motif_beds/'.format(output_folder), True)

    edge_dict = {}

    # First layer are source nodes
    for tf in candidate_tf_list:
        edge_dict[tf] = defaultdict(list)
    # Next layer are target nodes which are derived from the fimo output

    fimo_table = utils.parse_table(fimo_output, '\t')
    print(fimo_output)

    # Depending on the FIMO version, the region string lands in either
    # column 1 or column 2 (0-based)
    fimo_line = fimo_table[1]
    if fimo_line[1].count('|') > 0:
        region_index = 1
    else:
        region_index = 2
    print('USING COLUMN {} OF FIMO OUTPUT FOR REGION'.format(region_index))

    for line in fimo_table[1:]:
        source_tfs = motif_database_dict[line[0]]  # motifId
        for source in source_tfs:
            if source not in candidate_tf_list:
                continue
            region = line[region_index].split('|')

            target = region[0]
            if region_index == 2:
                target_locus = utils.Locus(region[1],
                                           int(region[2]) + int(line[3]),
                                           int(region[2]) + int(line[4]), '.')
            else:
                target_locus = utils.Locus(region[1],
                                           int(region[2]) + int(line[2]),
                                           int(region[2]) + int(line[3]), '.')

            # What's missing here is the enhancer id of the target locus
            try:
                edge_dict[source][target].append(target_locus)
            except KeyError:
                print('This motif is not in the network')
                print(line)
                sys.exit()

    # Now we actually want to collapse this down in a meaningful way
    # Overlapping motifs count as a single binding site. This way a TF with tons of motifs
    # that finds the same site over and over again doesn't get over counted
    all_bed = []
    all_bed_path = '{}{}_all_motifs.bed'.format(output_folder, analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        target_nodes = edge_dict[tf].keys()
        bed_header = [
            'track name = "{}" description="{} motifs in {}"'.format(
                tf, tf, analysis_name)
        ]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '{}motif_beds/{}_motifs.bed'.format(
            output_folder, tf)
        for target in target_nodes:
            edge_collection = utils.LocusCollection(edge_dict[tf][target], 50)
            edge_collection = edge_collection.stitch_collection()
            edge_loci = edge_collection.get_loci()
            edge_dict[tf][target] = edge_loci
            for locus in edge_loci:
                bed_line = [locus.chr, locus.start, locus.end, target, '', '+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unparse_table(target_bed, target_bed_path, '\t')
    # Now the loci are all stitched up
    utils.unparse_table(all_bed, all_bed_path, '\t')
    return edge_dict
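The collapse itself is plain interval stitching: overlapping motif hits on the same target merge into one binding site, so a TF with many redundant motifs is not over-counted. A self-contained sketch of the operation (the module's utils.LocusCollection version additionally tracks chromosome and strand):

def stitch_intervals(intervals):
    """Merge overlapping (start, end) intervals into single binding sites."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # Overlaps the previous interval: extend it
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

stitch_intervals([(10, 25), (20, 35), (50, 60)])  # -> [(10, 35), (50, 60)]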
Example #6
def crc(enhancers, genome_input, chrom_path, output, analysis_name, bam=None, subpeak_file=None,
        mask_file=None, activity_path=None, const_extension=100, number=1, motifs=False, tfs='',
        config=''):
    """CRC main function."""
    # =====================================================================================
    # ===============================I. PARSING ARGUMENTS==================================
    # =====================================================================================

    genome = crc_utils.load_genome(
        genome_input,
        chrom_path,
        mask_file=mask_file,
        config_file=config,
    )

    motif_database_file = genome.return_feature('motif_database')
    motif_convert_file = genome.return_feature('motif_convert')

    # User input files
    enhancer_file = enhancers

    if bam is None and subpeak_file is None:
        print('ERROR: Must provide either bams for valley finding or subpeaks as a .bed')
        sys.exit()

    # Will need to fix bams down the line to take in multiple bams
    if bam:
        bam_file_list = [bam_path for bam_path in bam.split(',') if bam_path]
        print(bam_file_list)
    else:
        bam_file_list = []

    # Output folder and analysis name
    print(output)
    output_folder = utils.format_folder(output, True)

    print(
        '\n\n#======================================\n#===========I. DATA SUMMARY============\n#='
        '=====================================\n'
    )

    print('Analyzing TF connectivity for {}'.format(analysis_name))
    print('Writing output to {}'.format(output_folder))
    if subpeak_file:
        print('Using {} to define subpeaks for motif finding'.format(subpeak_file))
    else:
        print('Identifying valleys from .bam files')
    print('Using {} to define active genes'.format(activity_path))

    # =====================================================================================
    # =======================II. IDENTIFYING CANDIDATE TFS AND NODES=======================
    # =====================================================================================

    print(
        '\n\n#======================================\n#===II. MAPPING GENES AND ENHANCERS====\n#='
        '=====================================\n'
    )

    (
        gene_region_table,
        gene_tf_region_table,
        enhancer_region_table,
        enhancer_tf_region_table,
        gene_summary_table,
        candidate_tf_list,
        gene_to_enhancer_dict,
    ) = crc_utils.gene_to_enhancer(genome, enhancer_file, activity_path)

    # Write these guys to disk
    gene_out = '{}{}_GENE_TABLE.txt'.format(output_folder, analysis_name)
    gene_tf_out = '{}{}_GENE_TF_TABLE.txt'.format(output_folder, analysis_name)

    enhancer_out = '{}{}_ENHANCER_TABLE.txt'.format(output_folder, analysis_name)
    enhancer_tf_out = '{}{}_ENHANCER_TF_TABLE.txt'.format(output_folder, analysis_name)

    summary_out = '{}{}_GENE_SUMMARY.txt'.format(output_folder, analysis_name)

    utils.unparse_table(enhancer_region_table, enhancer_out, '\t')
    utils.unparse_table(enhancer_tf_region_table, enhancer_tf_out, '\t')

    utils.unparse_table(gene_region_table, gene_out, '\t')
    utils.unparse_table(gene_tf_region_table, gene_tf_out, '\t')

    utils.unparse_table(gene_summary_table, summary_out, '\t')

    print(
        'Identified {} genes w/ proximal cis-regulatory elements'
        ''.format(len(gene_to_enhancer_dict))
    )

    print('Identified {} candidate TFs'.format(len(candidate_tf_list)))
    print(candidate_tf_list)

    # =====================================================================================
    # ==========================III. FINDING VALLEYS/SUBPEAKS==============================
    # =====================================================================================

    print(
        '\n\n#======================================\n#=====III. FINDING VALLEYS/SUBPEAKS====\n#='
        '=====================================\n'
    )

    # So here we would need to find valleys everywhere
    if subpeak_file is None:
        print('finding valleys')
        # Note: the tf_bed_path is for networks, all is for out degree finding
        all_bed_path = crc_utils.find_valleys(
            gene_to_enhancer_dict, bam_file_list, analysis_name, output_folder, cutoff=0.2
        )
    else:
        print('Using subpeaks from {}'.format(subpeak_file))
        all_bed_path = crc_utils.filter_subpeaks(
            subpeak_file, analysis_name, output_folder
        )

    # First make the subpeak bed and subpeak fasta for the tfs
    all_sub_bed, all_fasta = crc_utils.generate_subpeak_fasta(
        gene_to_enhancer_dict, all_bed_path, genome, analysis_name, const_extension
    )
    if subpeak_file is None:
        # This is the valley-finding case, the only one where the sub bed
        # needs to be written out
        all_sub_out = '{}{}_all_subpeak.bed'.format(output_folder, analysis_name)
        utils.unparse_table(all_sub_bed, all_sub_out, '\t')

    # Writing the all subpeak fasta out to disk
    all_fasta_out = '{}{}_all_subpeak.fasta'.format(output_folder, analysis_name)
    utils.unparse_table(all_fasta, all_fasta_out, '')

    # =====================================================================================
    # =================================IV. FINDING MOTIFS==================================
    # =====================================================================================

    print(
        '\n\n#======================================\n#======IV. RUNNING MOTIF FINDING=======\n#='
        '=====================================\n'
    )

    # First make background
    bg_path = crc_utils.make_motif_background(all_fasta_out, output_folder, analysis_name)

    # Find motifs for all regions
    fimo_out = crc_utils.find_motifs(
        all_fasta_out,
        bg_path,
        candidate_tf_list,
        output_folder,
        analysis_name,
        motif_convert_file,
        motif_database_file,
    )

    edge_dict = crc_utils.collapse_fimo(
        fimo_out,
        candidate_tf_list,
        output_folder,
        analysis_name,
        motif_convert_file,
    )

    # =====================================================================================
    # ============================V. RUNNING NETWORK ANALYSIS==============================
    # =====================================================================================

    print(
        '\n\n#======================================\n#========V. BUILDING NETWORK===========\n#='
        '=====================================\n'
    )

    print('building graph and edge table')
    graph = crc_utils.build_graph(
        edge_dict,
        gene_to_enhancer_dict,
        output_folder,
        analysis_name,
        cutoff=1,
    )

    crc_utils.format_network_output(graph, output_folder, analysis_name)

    print('FINISHED RUNNING CRC FOR {}'.format(analysis_name))
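A hypothetical end-to-end invocation driven by a precomputed subpeak BED (every path below is a placeholder, and the genome string is an assumption about what crc_utils.load_genome accepts):

crc(
    enhancers='example_AllEnhancers.table.txt',
    genome_input='hg19',
    chrom_path='./chrom_fasta/',
    output='./crc_output/',
    analysis_name='example_analysis',
    subpeak_file='example_atac_peaks.bed',
    activity_path='example_active_genes.txt',
)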