Example #1

import subprocess

import utils  # project-local helpers (format_folder, parse_table, ...); import path assumed

def find_motifs(subpeak_fasta, bg_path, candidate_tf_list, project_folder,
                analysis_name, motif_convert_file, motif_database_file):
    """Find motifs.

    Takes the refseq to subpeak seq dict and returns the networkx object with all connections.

    """
    fimo_folder = utils.format_folder(project_folder + 'FIMO/', True)
    subpeak_name = subpeak_fasta.split('/')[-1].split('.')[0]
    output = '{}{}_fimo.txt'.format(fimo_folder, subpeak_name)

    # Build a dict mapping each TF gene name to its motif names
    motif_database = utils.parse_table(motif_convert_file, '\t')
    motif_database_dict = {}  # keyed by TF; a TF can have multiple motifs

    for line in motif_database:
        motif_database_dict.setdefault(line[1], []).append(line[0])
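    # e.g. motif_database_dict['MYB'] -> ['M00001', 'M00002'] (illustrative IDs)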

    candidate_tf_list.sort()

    print(candidate_tf_list)

    # Now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motif_database_dict[tf]

    motif_list = utils.uniquify(motif_list)

    fimo_bash_path = '{}{}_fimo.sh'.format(fimo_folder, analysis_name)
    fimo_bash = open(fimo_bash_path, 'w')
    fimo_bash.write('#!/usr/bin/env bash\n\n')

    fimo_cmd = 'fimo'
    for motif in motif_list:
        fimo_cmd += " --motif '{}'".format(motif)

    # fimo_cmd += ' --thresh 1e-5'  # if you want to increase stringency
    fimo_cmd += ' --verbosity 1'
    fimo_cmd += ' --text'  # with --text, FIMO writes to stdout (redirected below)
    fimo_cmd += ' --oc ' + project_folder + 'FIMO'
    fimo_cmd += ' --bgfile {}'.format(bg_path)
    fimo_cmd += ' ' + motif_database_file + ' '
    fimo_cmd += subpeak_fasta
    fimo_cmd += ' > ' + output
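    # The assembled command looks roughly like this (illustrative):
    #   fimo --motif 'M00001' --motif 'M00002' --verbosity 1 --text \
    #       --oc <project_folder>FIMO --bgfile <bg_path> <motif_db> <fasta> > <output>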
    print(fimo_cmd)
    fimo_bash.write(fimo_cmd)
    fimo_bash.close()

    subprocess.call(fimo_cmd,
                    shell=True)  # blocks until FIMO finishes

    return output
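
# A minimal call sketch (hypothetical paths and TF names; FIMO must be on the
# PATH and `utils` importable):
fimo_out = find_motifs(
    subpeak_fasta='output/NB_all_subpeak.fasta',
    bg_path='output/NB_bg.meme',
    candidate_tf_list=['GATA2', 'HAND2', 'MYCN'],
    project_folder='output/',
    analysis_name='NB',
    motif_convert_file='annotation/MotifDictionary.txt',
    motif_database_file='annotation/VertebratePWMs.txt',
)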
Example #2

import sys
from collections import defaultdict

import utils  # project-local helpers; import path assumed

def collapse_fimo(fimo_output, candidate_tf_list, output_folder, analysis_name,
                  motif_convert_file):
    """Collapses motifs from fimo.

    For each source node (TF) and each target node (gene enhancer regions), collapse motif
    instances then spit out a ginormous set of beds and a single crazy collapsed bed.

    """
    # First build up the motif name conversion database
    motif_database = utils.parse_table(motif_convert_file, '\t')
    motif_database_dict = defaultdict(list)

    # The reverse of the other dict, from motif name to gene name
    # A motif can go to multiple genes
    for line in motif_database:
        motif_database_dict[line[0]].append(line[1])
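    # e.g. motif_database_dict['M00001'] -> ['MYB', 'MYBL2'] (illustrative)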

    # Make the folder to store motif beds
    utils.format_folder('{}motif_beds/'.format(output_folder), True)

    edge_dict = {}

    # First layer are source nodes
    for tf in candidate_tf_list:
        edge_dict[tf] = defaultdict(list)
    # Next layer are target nodes which are derived from the fimo output

    fimo_table = utils.parse_table(fimo_output, '\t')
    print(fimo_output)

    # Depending on the FIMO version, the region name lands in either the second
    # or third column (newer versions insert a motif_alt_id column)
    fimo_line = fimo_table[1]
    if '|' in fimo_line[1]:
        region_index = 1
    else:
        region_index = 2
    print('USING COLUMN {} OF FIMO OUTPUT FOR REGION'.format(region_index))
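    # Illustrative rows (tab-separated):
    #   M00001  GENE|chr1|1000|2000  5  15  ...       -> region_index == 1
    #   M00001  MYB  GENE|chr1|1000|2000  5  15  ...  -> region_index == 2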

    for line in fimo_table[1:]:
        source_tfs = motif_database_dict[line[0]]  # motifId
        for source in source_tfs:
            if source not in candidate_tf_list:
                continue
            region = line[region_index].split('|')

            target = region[0]
            if region_index == 2:
                target_locus = utils.Locus(region[1],
                                           int(region[2]) + int(line[3]),
                                           int(region[2]) + int(line[4]), '.')
            else:
                target_locus = utils.Locus(region[1],
                                           int(region[2]) + int(line[2]),
                                           int(region[2]) + int(line[3]), '.')
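            # In both branches, FIMO start/stop are relative to the subpeak
            # sequence, so adding the region's genomic start (region[2])
            # converts them to genomic coordinates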

            # Note: the enhancer id of the target locus is not tracked here
            try:
                edge_dict[source][target].append(target_locus)
            except KeyError:
                print('This TF is not in the network')
                print(line)
                sys.exit()

    # Now collapse this down in a meaningful way: overlapping motifs count as a
    # single binding site, so a TF with many motifs that find the same site over
    # and over is not overcounted
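    # e.g. hits at chr1:100-110 and chr1:105-115 stitch into one chr1:100-115 site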
    all_bed = []
    all_bed_path = '{}{}_all_motifs.bed'.format(output_folder, analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        target_nodes = edge_dict[tf].keys()
        bed_header = [
            'track name="{}" description="{} motifs in {}"'.format(
                tf, tf, analysis_name)
        ]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '{}motif_beds/{}_motifs.bed'.format(
            output_folder, tf)
        for target in target_nodes:
            edge_collection = utils.LocusCollection(edge_dict[tf][target], 50)
            edge_collection = edge_collection.stitch_collection()
            edge_loci = edge_collection.get_loci()
            edge_dict[tf][target] = edge_loci
            for locus in edge_loci:
                bed_line = [locus.chr, locus.start, locus.end, target, '', '+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unparse_table(target_bed, target_bed_path, '\t')
    # Now the loci are all stitched up
    utils.unparse_table(all_bed, all_bed_path, '\t')
    return edge_dict
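
# Illustrative follow-on to Example #1 (same hypothetical placeholders):
edge_dict = collapse_fimo(
    fimo_output=fimo_out,
    candidate_tf_list=['GATA2', 'HAND2', 'MYCN'],
    output_folder='output/',
    analysis_name='NB',
    motif_convert_file='annotation/MotifDictionary.txt',
)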
Example #3

import sys

import utils  # project-local helpers; import path assumed
import crc_utils  # project-local CRC pipeline helpers; import path assumed

def crc(enhancers, genome_input, chrom_path, output, analysis_name, bam=None, subpeak_file=None,
        mask_file=None, activity_path=None, const_extension=100, number=1, motifs=False, tfs='',
        config=''):
    """CRC main function."""
    # =====================================================================================
    # ===============================I. PARSING ARGUMENTS==================================
    # =====================================================================================

    genome = crc_utils.load_genome(
        genome_input,
        chrom_path,
        mask_file=mask_file,
        config_file=config,
    )

    motif_database_file = genome.return_feature('motif_database')
    motif_convert_file = genome.return_feature('motif_convert')

    # User input files
    enhancer_file = enhancers

    if bam is None and subpeak_file is None:
        print('ERROR: Must provide either bams for valley finding or subpeaks as a .bed')
        sys.exit()

    # Accepts a comma-separated list of bams; proper multi-bam support still needs work downstream
    if bam:
        bam_file_list = [bam_path for bam_path in bam.split(',') if bam_path]
        print(bam_file_list)
    else:
        bam_file_list = []

    # Output folder and analysis name
    print(output)
    output_folder = utils.format_folder(output, True)

    print(
        '\n\n#======================================\n#===========I. DATA SUMMARY============\n#='
        '=====================================\n'
    )

    print('Analyzing TF connectivity for {}'.format(analysis_name))
    print('Writing output to {}'.format(output_folder))
    if subpeak_file:
        print('Using {} to define subpeaks for motif finding'.format(subpeak_file))
    else:
        print('Identifying valleys from .bam files')
    print('Using {} to define active genes'.format(activity_path))

    # =====================================================================================
    # =======================II. IDENTIFYING CANDIDATE TFS AND NODES=======================
    # =====================================================================================

    print(
        '\n\n#======================================\n#===II. MAPPING GENES AND ENHANCERS====\n#='
        '=====================================\n'
    )

    (
        gene_region_table,
        gene_tf_region_table,
        enhancer_region_table,
        enhancer_tf_region_table,
        gene_summary_table,
        candidate_tf_list,
        gene_to_enhancer_dict,
    ) = crc_utils.gene_to_enhancer(genome, enhancer_file, activity_path)

    # Write these tables to disk
    gene_out = '{}{}_GENE_TABLE.txt'.format(output_folder, analysis_name)
    gene_tf_out = '{}{}_GENE_TF_TABLE.txt'.format(output_folder, analysis_name)

    enhancer_out = '{}{}_ENHANCER_TABLE.txt'.format(output_folder, analysis_name)
    enhancer_tf_out = '{}{}_ENHANCER_TF_TABLE.txt'.format(output_folder, analysis_name)

    summary_out = '{}{}_GENE_SUMMARY.txt'.format(output_folder, analysis_name)

    utils.unparse_table(enhancer_region_table, enhancer_out, '\t')
    utils.unparse_table(enhancer_tf_region_table, enhancer_tf_out, '\t')

    utils.unparse_table(gene_region_table, gene_out, '\t')
    utils.unparse_table(gene_tf_region_table, gene_tf_out, '\t')

    utils.unparse_table(gene_summary_table, summary_out, '\t')

    print(
        'Identified {} genes w/ proximal cis-regulatory elements'
        ''.format(len(gene_to_enhancer_dict))
    )

    print('Identified {} candidate TFs'.format(len(candidate_tf_list)))
    print(candidate_tf_list)

    # =====================================================================================
    # ==========================III. FINDING VALLEYS/SUBPEAKS==============================
    # =====================================================================================

    print(
        '\n\n#======================================\n#=====III. FINDING VALLEYS/SUBPEAKS====\n#='
        '=====================================\n'
    )

    # If no subpeak file was provided, find valleys de novo from the bams
    if subpeak_file is None:
        print('finding valleys')
        # find_valleys returns a bed of candidate valley regions across all enhancers
        all_bed_path = crc_utils.find_valleys(
            gene_to_enhancer_dict, bam_file_list, analysis_name, output_folder, cutoff=0.2
        )
    else:
        print('Using subpeaks from {}'.format(subpeak_file))
        all_bed_path = crc_utils.filter_subpeaks(
            subpeak_file, analysis_name, output_folder
        )

    # First make the subpeak bed and subpeak fasta for the tfs
    all_sub_bed, all_fasta = crc_utils.generate_subpeak_fasta(
        gene_to_enhancer_dict, all_bed_path, genome, analysis_name, const_extension
    )
    if subpeak_file is None:
        # Valleys were computed de novo; this is the only case where the subpeak
        # bed needs to be written out
        all_sub_out = '{}{}_all_subpeak.bed'.format(output_folder, analysis_name)
        utils.unparse_table(all_sub_bed, all_sub_out, '\t')

    # Writing the all subpeak fasta out to disk
    all_fasta_out = '{}{}_all_subpeak.fasta'.format(output_folder, analysis_name)
    utils.unparse_table(all_fasta, all_fasta_out, '')

    # =====================================================================================
    # =================================IV. FINDING MOTIFS==================================
    # =====================================================================================

    print(
        '\n\n#======================================\n#======IV. RUNNING MOTIF FINDING=======\n#='
        '=====================================\n'
    )

    # First make background
    bg_path = crc_utils.make_motif_background(all_fasta_out, output_folder, analysis_name)

    # Find motifs for all regions
    fimo_out = crc_utils.find_motifs(
        all_fasta_out,
        bg_path,
        candidate_tf_list,
        output_folder,
        analysis_name,
        motif_convert_file,
        motif_database_file,
    )

    edge_dict = crc_utils.collapse_fimo(
        fimo_out,
        candidate_tf_list,
        output_folder,
        analysis_name,
        motif_convert_file,
    )

    # =====================================================================================
    # ============================V. RUNNING NETWORK ANALYSIS==============================
    # =====================================================================================

    print(
        '\n\n#======================================\n#========V. BUILDING NETWORK===========\n#='
        '=====================================\n'
    )

    print('building graph and edge table')
    graph = crc_utils.build_graph(
        edge_dict,
        gene_to_enhancer_dict,
        output_folder,
        analysis_name,
        cutoff=1,
    )

    crc_utils.format_network_output(graph, output_folder, analysis_name)

    print('FINISHED RUNNING CRC FOR {}'.format(analysis_name))
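
# Hypothetical top-level invocation; every path below is illustrative.
crc(
    enhancers='rose/NB_AllEnhancers.table.txt',
    genome_input='hg19',
    chrom_path='/genomes/hg19/chrom/',
    output='./crc_output/',
    analysis_name='NB',
    subpeak_file='NB_atac_subpeaks.bed',
    activity_path='NB_active_genes.txt',
)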