Example #1
def map_enhancer_to_gene_top(rank_by_bam_file, control_bam_file, genome, annot_file,
                             enhancer_file, transcribed_file=None, unique_genes=True,
                             search_window=50000, no_format_table=False):
    """Maps genes to enhancers.

    If unique_genes, reduces to gene name only. Otherwise, gives for each refseq.

    """
    start_dict = utils.make_start_dict(annot_file)
    enhancer_name = enhancer_file.split('/')[-1].split('.')[0]
    enhancer_table = utils.parse_table(enhancer_file, '\t')

    if transcribed_file:
        transcribed_table = utils.parse_table(transcribed_file, '\t')
        transcribed_genes = [line[1] for line in transcribed_table]
    else:
        transcribed_genes = start_dict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribed_collection = utils.make_transcript_collection(
        annot_file, 0, 0, 500, transcribed_genes)

    print('MAKING TSS COLLECTION')
    tss_loci = []
    for gene_id in transcribed_genes:
        tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 0, 0))

    # This turns the tss_loci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really matter
    tss_collection = utils.LocusCollection(tss_loci, 50)

    gene_dict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # Dictionaries to hold the rank and super status of enhancers near each gene
    rank_dict = defaultdict(list)
    super_dict = defaultdict(list)

    # List of all genes that appear in this analysis
    overall_gene_list = []

    # Find the header
    for line in enhancer_table:
        if line[0][0] != '#':
            header = line
            print('this is the header')
            print(header)
            break

    if no_format_table:
        # Set up the output tables
        # First by enhancer
        enhancer_to_gene_table = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # Set up the output tables
        # First by enhancer
        enhancer_to_gene_table = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

    # Next make the gene to enhancer table
    gene_to_enhancer_table = [
        [
            'GENE_NAME',
            'REFSEQ_ID',
            'PROXIMAL_ENHANCERS',
            'ENHANCER_RANKS',
            'IS_SUPER',
            'ENHANCER_SIGNAL',
        ]
    ]

    for line in enhancer_table:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancer_string = '{}:{}-{}'.format(line[1], line[2], line[3])

        enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # Overlapping genes are transcribed genes whose transcript is directly in the
        # stitched_locus
        overlapping_loci = transcribed_collection.get_overlap(enhancer_locus, 'both')
        overlapping_genes = []
        for overlap_locus in overlapping_loci:
            overlapping_genes.append(overlap_locus.id)

        # Proximal_genes are transcribed genes where the tss is within 50kb of the boundary of the
        # stitched loci
        proximal_loci = tss_collection.get_overlap(
            utils.make_search_locus(enhancer_locus, search_window, search_window),
            'both',
        )
        proximal_genes = []
        for prox_locus in proximal_loci:
            proximal_genes.append(prox_locus.id)

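        # Distal genes are transcribed genes where the tss is within 1Mb of the boundary of the
        # stitched loci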
        distal_loci = tss_collection.get_overlap(
            utils.make_search_locus(enhancer_locus, 1000000, 1000000),
            'both',
        )
        distal_genes = []
        for prox_locus in distal_loci:
            distal_genes.append(prox_locus.id)

        overlapping_genes = utils.uniquify(overlapping_genes)
        proximal_genes = utils.uniquify(proximal_genes)
        distal_genes = utils.uniquify(distal_genes)
        all_enhancer_genes = overlapping_genes + proximal_genes + distal_genes
        # These checks make the overlapping, proximal, and distal gene lists mutually exclusive
        # Technically it is possible for a gene to be overlapping, but not proximal, since the gene
        # could be longer than the 50kb window, but we'll let that slide here
        for ref_id in overlapping_genes:
            if proximal_genes.count(ref_id) == 1:
                proximal_genes.remove(ref_id)

        for ref_id in proximal_genes:
            if distal_genes.count(ref_id) == 1:
                distal_genes.remove(ref_id)

        # Now find the closest gene
        if not all_enhancer_genes:
            closest_gene = ''
        else:
            # Get enhancer_center
            enhancer_center = (int(line[2]) + int(line[3])) / 2

            # Get absolute distance to enhancer center
            dist_list = [abs(enhancer_center - start_dict[gene_id]['start'][0])
                         for gene_id in all_enhancer_genes]
            # Get the ID and convert to name
            closest_gene = start_dict[all_enhancer_genes[dist_list.index(min(dist_list))]]['name']

        # Now write the row for the enhancer table
        if no_format_table:
            new_enhancer_line = list(line)
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes]))
            )
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes]))
            )
            new_enhancer_line.append(closest_gene)

        else:
            new_enhancer_line = line[0:9]
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes]))
            )
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes]))
            )
            new_enhancer_line.append(closest_gene)
            new_enhancer_line += line[-2:]

        enhancer_to_gene_table.append(new_enhancer_line)

        # Now grab all overlapping and proximal genes for the gene ordered table
        overall_gene_list += overlapping_genes
        for ref_id in overlapping_genes:
            gene_dict['overlapping'][ref_id].append(enhancer_string)
            rank_dict[ref_id].append(int(line[-2]))
            super_dict[ref_id].append(int(line[-1]))

        overall_gene_list += proximal_genes
        for ref_id in proximal_genes:
            gene_dict['proximal'][ref_id].append(enhancer_string)
            rank_dict[ref_id].append(int(line[-2]))
            super_dict[ref_id].append(int(line[-1]))

    # End of loop through the enhancer table
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overall_gene_list = utils.uniquify(overall_gene_list)

    # Get the chrom_lists from the various bams here
    cmd = 'samtools idxstats {}'.format(rank_by_bam_file)
    idx_stats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    idx_stats = idx_stats.communicate()
    bam_chrom_list = [
        line.split('\t')[0] for line in idx_stats[0].decode('utf-8').split('\n')[0:-2]
    ]

    if control_bam_file:
        cmd = 'samtools idxstats {}'.format(control_bam_file)
        idx_stats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        idx_stats = idx_stats.communicate()
        bam_chrom_list_control = [
            line.split('\t')[0] for line in idx_stats[0].decode('utf-8').split('\n')[0:-2]
        ]
        bam_chrom_list = [
            chrom for chrom in bam_chrom_list if bam_chrom_list_control.count(chrom) != 0
        ]

    # Now make sure no genes have a bad chrom
    overall_gene_list = [
        gene for gene in overall_gene_list if bam_chrom_list.count(start_dict[gene]['chr']) != 0
    ]

    # Now make a transcript collection for all enhancer-associated genes
    enhancer_gene_collection = utils.make_transcript_collection(
        annot_file,
        5000,
        5000,
        500,
        overall_gene_list,
    )

    enhancer_gene_gff = utils.locus_collection_to_gff(enhancer_gene_collection)

    # Dump the gff to file
    enhancer_folder = utils.get_parent_folder(enhancer_file)
    gff_root_name = "{}_TSS_ENHANCER_GENES_-5000_+5000".format(genome)
    enhancer_gene_gff_file = "{}{}_{}.gff".format(enhancer_folder, enhancer_name, gff_root_name)
    utils.unparse_table(enhancer_gene_gff, enhancer_gene_gff_file, '\t')

    # Now we need to run bam_to_gff

    # Use the bamliquidator_batch executable; it is expected to be on the PATH
    bamliquidator_path = 'bamliquidator_batch'

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # Map density at genes in the +/- 5kb tss region
    # First on the rank_by bam
    bam_name = rank_by_bam_file.split('/')[-1]
    mapped_rank_by_folder = "{}{}_{}_{}/".format(
        enhancer_folder,
        enhancer_name,
        gff_root_name,
        bam_name,
    )
    mapped_rank_by_file = "{}{}_{}_{}/matrix.txt".format(
        enhancer_folder,
        enhancer_name,
        gff_root_name,
        bam_name,
    )
    cmd = '{} --sense . -e 200 --match_bamToGFF -r {} -o {} {}'.format(
        bamliquidator_path,
        enhancer_gene_gff_file,
        mapped_rank_by_folder,
        rank_by_bam_file,
    )
    print("Mapping rankby bam {}".format(rank_by_bam_file))
    print(cmd)
    os.system(cmd)

    # Check for completion
    if utils.check_output(mapped_rank_by_file, 0.2, 5):
        print(
            "SUCCESSFULLY MAPPED TO {} FROM BAM: {}"
            "".format(enhancer_gene_gff_file, rank_by_bam_file)
        )
    else:
        print(
            "ERROR: FAILED TO MAP {} FROM BAM: {}"
            "".format(enhancer_gene_gff_file, rank_by_bam_file)
        )
        sys.exit()

    # Next on the control bam if it exists
    if control_bam_file:
        control_name = control_bam_file.split('/')[-1]
        mapped_control_folder = "{}{}_{}_{}/".format(
            enhancer_folder,
            enhancer_name,
            gff_root_name,
            control_name,
        )
        mapped_control_file = "{}{}_{}_{}/matrix.txt".format(
            enhancer_folder,
            enhancer_name,
            gff_root_name,
            control_name,
        )
        cmd = '{} --sense . -e 200 --match_bamToGFF -r {} -o {} {}'.format(
            bamliquidator_path,
            enhancer_gene_gff_file,
            mapped_control_folder,
            control_bam_file,
        )
        print("Mapping control bam {}".format(control_bam_file))
        print(cmd)
        os.system(cmd)

        # Check for completion
        if utils.check_output(mapped_control_file, 0.2, 5):
            print(
                "SUCCESSFULLY MAPPED TO {} FROM BAM: {}"
                "".format(enhancer_gene_gff_file, control_bam_file)
            )
        else:
            print(
                "ERROR: FAILED TO MAP {} FROM BAM: {}"
                .format(enhancer_gene_gff_file, control_bam_file)
            )
            sys.exit()

    # Now get the appropriate output files
    if control_bam_file:
        print(
            "CHECKING FOR MAPPED OUTPUT AT {} AND {}"
            "".format(mapped_rank_by_file, mapped_control_file)
        )
        if (utils.check_output(mapped_rank_by_file, 1, 1) and
                utils.check_output(mapped_control_file, 1, 1)):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signal_dict = make_signal_dict(mapped_rank_by_file, mapped_control_file)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT {}".format(mapped_rank_by_file))
        if utils.check_output(mapped_rank_by_file, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signal_dict = make_signal_dict(mapped_rank_by_file)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # Use enhancer rank to order
    rank_order = utils.order([min(rank_dict[x]) for x in overall_gene_list])

    used_names = []

    # Make a new dict to hold TSS signal by max per gene_name
    gene_name_sig_dict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rank_order:
        ref_id = overall_gene_list[i]
        gene_name = start_dict[ref_id]['name']
        if used_names.count(gene_name) and unique_genes:
            continue
        else:
            used_names.append(gene_name)

        prox_enhancers = gene_dict['overlapping'][ref_id] + gene_dict['proximal'][ref_id]

        super_status = max(super_dict[ref_id])
        enhancer_ranks = ','.join([str(x) for x in rank_dict[ref_id]])

        enhancer_signal = signal_dict[ref_id]
        gene_name_sig_dict[gene_name].append(enhancer_signal)

        new_line = [
            gene_name,
            ref_id,
            ','.join(prox_enhancers),
            enhancer_ranks,
            super_status,
            enhancer_signal
        ]
        gene_to_enhancer_table.append(new_line)

    print('MAKING ENHANCER TO TOP GENE TABLE')

    if no_format_table:
        enhancer_to_top_gene_table = [
            enhancer_to_gene_table[0] + ['TOP_GENE', 'TSS_SIGNAL']
        ]
    else:
        enhancer_to_top_gene_table = [
            enhancer_to_gene_table[0][0:12] +
            ['TOP_GENE', 'TSS_SIGNAL'] +
            enhancer_to_gene_table[0][-2:]
        ]

    for line in enhancer_to_gene_table[1:]:
        gene_list = []
        if no_format_table:
            gene_list += line[-3].split(',')
            gene_list += line[-2].split(',')

        else:
            gene_list += line[10].split(',')
            gene_list += line[11].split(',')

        gene_list = utils.uniquify([x for x in gene_list if x])
        if gene_list:
            try:
                sig_vector = [max(gene_name_sig_dict[x]) for x in gene_list]
                max_index = sig_vector.index(max(sig_vector))
                max_gene = gene_list[max_index]
                max_sig = sig_vector[max_index]
                if max_sig == 0.0:
                    max_gene = 'NONE'
                    max_sig = 'NONE'
            except ValueError:
                if len(gene_list) == 1:
                    max_gene = gene_list[0]
                    max_sig = 'NONE'
                else:
                    max_gene = 'NONE'
                    max_sig = 'NONE'
        else:
            max_gene = 'NONE'
            max_sig = 'NONE'
        if no_format_table:
            new_line = line + [max_gene, max_sig]
        else:
            new_line = line[0:12] + [max_gene, max_sig] + line[-2:]
        enhancer_to_top_gene_table.append(new_line)

    # Resort enhancer_to_gene_table
    if no_format_table:
        return enhancer_to_gene_table, enhancer_to_top_gene_table, gene_to_enhancer_table
    else:
        enhancer_order = utils.order(
            [int(line[-2]) for line in enhancer_to_gene_table[1:]]
        )
        sorted_table = [enhancer_to_gene_table[0]]
        sorted_top_gene_table = [enhancer_to_top_gene_table[0]]
        for i in enhancer_order:
            sorted_table.append(enhancer_to_gene_table[(i + 1)])
            sorted_top_gene_table.append(enhancer_to_top_gene_table[(i + 1)])

        return sorted_table, sorted_top_gene_table, gene_to_enhancer_table
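

# A standalone sketch of the closest-gene step above, on plain dicts. The start_dict
# layout ({ref_id: {'name': ..., 'start': [tss_position]}}) mirrors what
# utils.make_start_dict is assumed to return; the helper name below is hypothetical.
def _closest_gene_sketch(enhancer_start, enhancer_stop, candidate_ids, start_dict):
    """Return the name of the candidate gene whose TSS is closest to the enhancer center."""
    if not candidate_ids:
        return ''
    enhancer_center = (int(enhancer_start) + int(enhancer_stop)) / 2
    # Absolute distance from the enhancer center to each candidate TSS
    dist_list = [abs(enhancer_center - start_dict[gene_id]['start'][0])
                 for gene_id in candidate_ids]
    return start_dict[candidate_ids[dist_list.index(min(dist_list))]]['name']


# For example:
# start_dict = {'NM_1': {'name': 'GENEA', 'start': [1000]},
#               'NM_2': {'name': 'GENEB', 'start': [9000]}}
# _closest_gene_sketch(4000, 5000, ['NM_1', 'NM_2'], start_dict)  # -> 'GENEA'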
Example #2
def map_enhancer_to_gene(annot_file, enhancer_file, transcribed_file=None, unique_genes=True,
                         search_window=50000, no_format_table=False):
    """Maps genes to enhancers.

    If unique_genes, reduces to gene name only. Otherwise, gives for each refseq.

    """
    start_dict = utils.make_start_dict(annot_file)
    enhancer_table = utils.parse_table(enhancer_file, '\t')

    if transcribed_file:
        transcribed_table = utils.parse_table(transcribed_file, '\t')
        transcribed_genes = [line[1] for line in transcribed_table]
    else:
        transcribed_genes = start_dict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribed_collection = utils.make_transcript_collection(
        annot_file,
        0,
        0,
        500,
        transcribed_genes,
    )

    print('MAKING TSS COLLECTION')
    tss_loci = []
    for gene_id in transcribed_genes:
        tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 0, 0))

    # This turns the tss_loci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really matter
    tss_collection = utils.LocusCollection(tss_loci, 50)

    gene_dict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # Dictionaries to hold the rank and super status of enhancers near each gene
    rank_dict = defaultdict(list)
    super_dict = defaultdict(list)

    # List of all genes that appear in this analysis
    overall_gene_list = []

    # Find the header
    for line in enhancer_table:
        if line[0][0] != '#':
            header = line
            print('This is the header')
            print(header)
            break

    if no_format_table:
        # Set up the output tables
        # First by enhancer
        enhancer_to_gene_table = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']
        ]

    else:
        # Set up the output tables
        # First by enhancer
        enhancer_to_gene_table = [
            header[0:9] +
            ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] +
            header[-2:]
        ]

    # Next make the gene to enhancer table
    gene_to_enhancer_table = [
        [
            'GENE_NAME',
            'REFSEQ_ID',
            'PROXIMAL_ENHANCERS',
            'ENHANCER_RANKS',
            'IS_SUPER',
        ]
    ]

    for line in enhancer_table:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancer_string = '{}:{}-{}'.format(line[1], line[2], line[3])

        enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # Overlapping genes are transcribed genes whose transcript is directly in the
        # stitched locus
        overlapping_loci = transcribed_collection.get_overlap(enhancer_locus, 'both')
        overlapping_genes = []
        for overlap_locus in overlapping_loci:
            overlapping_genes.append(overlap_locus.id)

        # Proximal_genes are transcribed genes where the tss is within 50kb of the boundary of the
        # stitched loci
        proximal_loci = tss_collection.get_overlap(
            utils.make_search_locus(enhancer_locus, search_window, search_window),
            'both',
        )
        proximal_genes = []
        for prox_locus in proximal_loci:
            proximal_genes.append(prox_locus.id)

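        # Distal genes are transcribed genes where the tss is within 1Mb of the boundary of the
        # stitched loci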
        distal_loci = tss_collection.get_overlap(
            utils.make_search_locus(enhancer_locus, 1000000, 1000000),
            'both',
        )
        distal_genes = []
        for prox_locus in distal_loci:
            distal_genes.append(prox_locus.id)

        overlapping_genes = utils.uniquify(overlapping_genes)
        proximal_genes = utils.uniquify(proximal_genes)
        distal_genes = utils.uniquify(distal_genes)
        all_enhancer_genes = overlapping_genes + proximal_genes + distal_genes

        # These checks make the overlapping, proximal, and distal gene lists mutually exclusive
        # Technically it is possible for a gene to be overlapping, but not proximal, since the gene
        # could be longer than the 50kb window, but we'll let that slide here
        for ref_id in overlapping_genes:
            if proximal_genes.count(ref_id) == 1:
                proximal_genes.remove(ref_id)

        for ref_id in proximal_genes:
            if distal_genes.count(ref_id) == 1:
                distal_genes.remove(ref_id)

        # Now find the closest gene
        if not all_enhancer_genes:
            closest_gene = ''
        else:
            # Get enhancer_center
            enhancer_center = (int(line[2]) + int(line[3])) / 2

            # Get absolute distance to enhancer center
            dist_list = [
                abs(
                    enhancer_center - start_dict[gene_id]['start'][0]
                ) for gene_id in all_enhancer_genes
            ]
            # Get the ID and convert to name
            closest_gene = start_dict[
                all_enhancer_genes[dist_list.index(min(dist_list))]
            ]['name']

        # Now write the row for the enhancer table
        if no_format_table:
            new_enhancer_line = list(line)
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes]))
            )
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes]))
            )
            new_enhancer_line.append(closest_gene)

        else:
            new_enhancer_line = line[0:9]
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes]))
            )
            new_enhancer_line.append(
                ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes]))
            )
            new_enhancer_line.append(closest_gene)
            new_enhancer_line += line[-2:]

        enhancer_to_gene_table.append(new_enhancer_line)

        # Now grab all overlapping and proximal genes for the gene ordered table
        overall_gene_list += overlapping_genes
        for ref_id in overlapping_genes:
            gene_dict['overlapping'][ref_id].append(enhancer_string)
            rank_dict[ref_id].append(int(line[-2]))
            super_dict[ref_id].append(int(line[-1]))

        overall_gene_list += proximal_genes
        for ref_id in proximal_genes:
            gene_dict['proximal'][ref_id].append(enhancer_string)
            rank_dict[ref_id].append(int(line[-2]))
            super_dict[ref_id].append(int(line[-1]))

    # End of loop through the enhancer table

    # Make table by gene
    overall_gene_list = utils.uniquify(overall_gene_list)

    # Use enhancer rank to order
    rank_order = utils.order([min(rank_dict[x]) for x in overall_gene_list])

    used_names = []
    for i in rank_order:
        ref_id = overall_gene_list[i]
        gene_name = start_dict[ref_id]['name']
        if used_names.count(gene_name) and unique_genes:
            continue
        else:
            used_names.append(gene_name)

        prox_enhancers = gene_dict['overlapping'][ref_id] + gene_dict['proximal'][ref_id]

        super_status = max(super_dict[ref_id])
        enhancer_ranks = ','.join([str(x) for x in rank_dict[ref_id]])

        new_line = [gene_name, ref_id, ','.join(prox_enhancers), enhancer_ranks, super_status]
        gene_to_enhancer_table.append(new_line)

    # Resort enhancer_to_gene_table
    if no_format_table:
        return enhancer_to_gene_table, gene_to_enhancer_table
    else:
        enhancer_order = utils.order([int(line[-2]) for line in enhancer_to_gene_table[1:]])
        sorted_table = [enhancer_to_gene_table[0]]
        for i in enhancer_order:
            sorted_table.append(enhancer_to_gene_table[(i+1)])

        return sorted_table, gene_to_enhancer_table
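

# Both gene-mapping functions above lean on two helpers from the (unshown) utils module.
# A minimal sketch of what they are assumed to do, using hypothetical local names:
# an order-preserving de-duplication and an argsort-style helper that returns the
# indices that would sort a list.
def _uniquify_sketch(seq):
    """Drop duplicates from seq while preserving first-seen order."""
    return list(dict.fromkeys(seq))


def _order_sketch(values, decreasing=False):
    """Return the indices that would sort values, mirroring the assumed utils.order."""
    return sorted(range(len(values)), key=lambda i: values[i], reverse=decreasing)


# For example:
# _uniquify_sketch(['NM_1', 'NM_2', 'NM_1'])  # -> ['NM_1', 'NM_2']
# _order_sketch([30, 10, 20])                 # -> [1, 2, 0]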
Example #3
def region_stitching(reference_collection, name, out_folder, stitch_window, tss_window,
                     annot_file, remove_tss=True):
    """Region stitching."""
    print('PERFORMING REGION STITCHING')
    # First have to turn bound region file into a locus collection

    # Need to make sure the naming is correct: each region in reference_collection should
    # have a unique name

    debug_output = []
    # Filter out all bound regions that overlap the TSS of an ACTIVE GENE
    if remove_tss:
        print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF {}BP'.format(tss_window))

        # First make a locus collection of TSS
        start_dict = utils.make_start_dict(annot_file)

        # Now make TSS loci for active genes
        remove_ticker = 0
        # This loop makes a locus centered around +/- tss_window of transcribed genes then adds it
        # to the list tss_loci
        tss_loci = []
        for gene_id in start_dict.keys():
            tss_loci.append(utils.make_tss_locus(gene_id, start_dict, tss_window, tss_window))

        # This turns the tss_loci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tss_collection = utils.LocusCollection(tss_loci, 50)

        # Gives all the loci in reference_collection
        bound_loci = reference_collection.get_loci()

        # This loop will check if each bound region is contained by the TSS exclusion zone
        # This will drop out a lot of the promoter only regions that are tiny
        # Typical exclusion window is around 2kb
        for locus in bound_loci:
            if tss_collection.get_containers(locus, 'both'):
                # If true, the bound locus overlaps an active gene
                reference_collection.remove(locus)
                debug_output.append([str(locus), locus.id, 'CONTAINED'])
                remove_ticker += 1
        print('REMOVED {} LOCI BECAUSE THEY WERE CONTAINED BY A TSS'.format(remove_ticker))

    # Reference_collection is now all enriched region loci that don't overlap an active TSS

    if not stitch_window:
        print('DETERMINING OPTIMUM STITCHING PARAMETER')
        opt_collection = copy.deepcopy(reference_collection)
        stitch_window = optimize_stitching(opt_collection, name, out_folder, step_size=500)
    print('USING A STITCHING PARAMETER OF {}'.format(stitch_window))
    stitched_collection = reference_collection.stitch_collection(stitch_window, 'both')

    if remove_tss:
        # Now replace any stitched region that overlaps more than two distinct genes with the
        # original loci that were there
        fixed_loci = []
        tss_loci = []
        for gene_id in start_dict.keys():
            tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 50, 50))

        # This turns the tss_loci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tss_collection = utils.LocusCollection(tss_loci, 50)
        remove_ticker = 0
        original_ticker = 0
        for stitched_locus in stitched_collection.get_loci():
            overlapping_tss_loci = tss_collection.get_overlap(stitched_locus, 'both')
            tss_names = [start_dict[tss_locus.id]['name'] for tss_locus in overlapping_tss_loci]
            tss_names = utils.uniquify(tss_names)
            if len(tss_names) > 2:
                # Stitched_collection.remove(stitched_locus)
                original_loci = reference_collection.get_overlap(stitched_locus, 'both')
                original_ticker += len(original_loci)
                fixed_loci += original_loci
                debug_output.append([str(stitched_locus), stitched_locus.id, 'MULTIPLE_TSS'])
                remove_ticker += 1
            else:
                fixed_loci.append(stitched_locus)

        print(
            'REMOVED {} STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs'.format(remove_ticker)
        )
        print('ADDED BACK {} ORIGINAL LOCI'.format(original_ticker))
        fixed_collection = utils.LocusCollection(fixed_loci, 50)
        return fixed_collection, debug_output, stitch_window

    else:
        return stitched_collection, debug_output, stitch_window
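

# region_stitching delegates the actual merging to LocusCollection.stitch_collection,
# which is not shown here. A toy, standalone illustration of the assumed idea on plain
# (chrom, start, stop) tuples: regions on the same chromosome are merged whenever the
# gap between them is at most stitch_window. This is only a sketch, not the
# LocusCollection implementation.
def _stitch_regions_sketch(regions, stitch_window):
    """Merge (chrom, start, stop) tuples whose gap is <= stitch_window."""
    stitched = []
    for chrom, start, stop in sorted(regions):
        if stitched and stitched[-1][0] == chrom and start - stitched[-1][2] <= stitch_window:
            # Extend the previous stitched region instead of starting a new one
            stitched[-1] = (chrom, stitched[-1][1], max(stitched[-1][2], stop))
        else:
            stitched.append((chrom, start, stop))
    return stitched


# Example with a 12500bp stitching window:
# _stitch_regions_sketch([('chr1', 100, 600), ('chr1', 5000, 5600), ('chr1', 50000, 50500)], 12500)
# -> [('chr1', 100, 5600), ('chr1', 50000, 50500)]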
Example #4
def map_collection(stitched_collection, reference_collection, bam_file_list, mapped_folder,
                   output, ref_name):
    """Makes a table of factor density in a stitched locus.

    Also ranks table by number of loci stitched together.

    """
    print('FORMATTING TABLE')
    loci = stitched_collection.get_loci()

    locus_table = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]

    loci_len_list = []

    # Strip out any that are in chrY
    for locus in list(loci):
        if locus.chr == 'chrY':
            loci.remove(locus)

    for locus in loci:
        loci_len_list.append(locus.len())

    len_order = utils.order(loci_len_list, decreasing=True)
    ticker = 0
    for i in len_order:
        ticker += 1
        if not ticker % 1000:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        ref_enrich_size = 0
        ref_overlapping_loci = reference_collection.get_overlap(locus, 'both')
        for ref_locus in ref_overlapping_loci:
            ref_enrich_size += ref_locus.len()

        try:
            stitch_count = int(locus.id.split('_')[0])
        except ValueError:
            stitch_count = 1
        coords = [int(x) for x in locus.coords()]

        locus_table.append(
            [locus.id, locus.chr, min(coords), max(coords), stitch_count, ref_enrich_size]
        )

    print('GETTING MAPPED DATA')
    print("USING A BAM FILE LIST:")
    print(bam_file_list)
    for bam_file in bam_file_list:

        bam_file_name = bam_file.split('/')[-1]

        print('GETTING MAPPING DATA FOR  {}'.format(bam_file))
        # Assumes standard convention for naming enriched region gffs

        # Opening up the mapped GFF
        print('OPENING {}{}_{}_MAPPED/matrix.txt'.format(mapped_folder, ref_name, bam_file_name))

        mapped_gff = utils.parse_table(
            '{}{}_{}_MAPPED/matrix.txt'.format(mapped_folder, ref_name, bam_file_name),
            '\t',
        )

        signal_dict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR {}'.format(bam_file))
        mapped_loci = []
        for line in mapped_gff[1:]:

            chrom = line[1].split('(')[0]
            start = int(line[1].split(':')[-1].split('-')[0])
            end = int(line[1].split(':')[-1].split('-')[1])
            mapped_loci.append(utils.Locus(chrom, start, end, '.', line[0]))
            try:
                signal_dict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)
                continue

        mapped_collection = utils.LocusCollection(mapped_loci, 500)
        locus_table[0].append(bam_file_name)

        for i in range(1, len(locus_table)):
            signal = 0.0
            line = locus_table[i]
            line_locus = utils.Locus(line[1], line[2], line[3], '.')
            overlapping_regions = mapped_collection.get_overlap(line_locus, sense='both')
            for region in overlapping_regions:
                signal += signal_dict[region.id]
            locus_table[i].append(signal)

    utils.unparse_table(locus_table, output, '\t')
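

# A small standalone sketch of the label parsing map_collection performs on each
# matrix.txt row, where the second column is assumed to look like 'chr1(.):1000-2000'
# (the format is inferred from the string splitting above; the helper name is hypothetical).
def _parse_region_label_sketch(label):
    """Return (chrom, start, end) parsed from a 'chrom(sense):start-end' label."""
    chrom = label.split('(')[0]
    coords = label.split(':')[-1]
    start, end = (int(x) for x in coords.split('-'))
    return chrom, start, end


# For example:
# _parse_region_label_sketch('chr1(.):1000-2000')  # -> ('chr1', 1000, 2000)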
Example #5
def rose(input_file,
         rankby,
         output_folder,
         genome,
         bams=None,
         control='',
         stitch=None,
         tss=0,
         mask_file=None):
    """ROSE2 main function."""
    debug = False

    # Making the out folder if it doesn't exist
    out_folder = utils.format_folder(output_folder, True)

    # Figuring out folder schema
    gff_folder = utils.format_folder(out_folder + 'gff/', True)
    mapped_folder = utils.format_folder(out_folder + 'mappedGFF/', True)

    # Getting input file
    if input_file.split('.')[-1] == 'bed':
        # Converting a BED file
        input_gff_name = input_file.split('/')[-1][0:-4]
        input_gff_file = '{}{}.gff'.format(gff_folder, input_gff_name)
        utils.bed_to_gff(input_file, input_gff_file)
    elif input_file.split('.')[-1] == 'gff':
        # Copy the input GFF to the GFF folder
        input_gff_file = input_file
        os.system('cp {} {}'.format(input_gff_file, gff_folder))
    else:
        print(
            'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
        )
        # Copy the input GFF to the GFF folder
        input_gff_file = input_file
        os.system('cp {} {}'.format(input_gff_file, gff_folder))

    # Getting the list of BAM files to process
    bam_file_list = [rankby]
    if control:
        bam_file_list.append(control)

    if bams:
        bam_file_list += bams.split(',')

    for bam in bam_file_list:
        if not os.path.isfile('{}.bai'.format(bam)):
            print('INDEX FILE FOR {} IS MISSING'.format(bam))
            sys.exit()

    # Optional args

    # Stitch parameter
    stitch_window = '' if not stitch else int(stitch)

    # TSS options
    tss_window = int(tss)
    remove_tss = True if not tss_window else False

    # Getting the Bound region file used to define enhancers
    print('USING {} AS THE INPUT GFF'.format(input_gff_file))
    input_name = input_gff_file.split('/')[-1].split('.')[0]

    # Getting the genome
    print('USING {} AS THE GENOME'.format(genome))

    # Getting the correct annot file
    annotation_path = '{}/annotation'.format(ROOT_DIR)
    genome_dict = {
        'HG18': '{}/hg18_refseq.ucsc'.format(annotation_path),
        'MM9': '{}/mm9_refseq.ucsc'.format(annotation_path),
        'HG19': '{}/hg19_refseq.ucsc'.format(annotation_path),
        'MM8': '{}/mm8_refseq.ucsc'.format(annotation_path),
        'MM10': '{}/mm10_refseq.ucsc'.format(annotation_path),
        'RN4': '{}/rn4_refseq.ucsc'.format(annotation_path),
        'RN6': '{}/rn6_refseq.ucsc'.format(annotation_path),
    }

    annot_file = genome_dict[genome.upper()]

    # Get chroms found in the bams
    print('GETTING CHROMS IN BAMFILES')
    bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list)
    print("USING THE FOLLOWING CHROMS")
    print(bam_chrom_list)

    # Loading in the GFF and filtering by chrom
    print('LOADING AND FILTERING THE GFF')
    input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list)

    # Loading in the bound region reference collection
    print('LOADING IN GFF REGIONS')
    reference_collection = utils.gff_to_locus_collection(input_gff)

    print('STARTING WITH {} INPUT REGIONS'.format(len(reference_collection)))
    print('CHECKING REFERENCE COLLECTION:')
    rose2_utils.check_ref_collection(reference_collection)

    # Masking reference collection
    # See if there's a mask
    if mask_file:
        print('USING MASK FILE {}'.format(mask_file))
        # if it's a bed file
        if mask_file.split('.')[-1].upper() == 'BED':
            mask_gff = utils.bed_to_gff(mask_file)
        elif mask_file.split('.')[-1].upper() == 'GFF':
            mask_gff = utils.parse_table(mask_file, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()

        mask_collection = utils.gff_to_locus_collection(mask_gff)
        print('LOADING {} MASK REGIONS'.format(len(mask_collection)))

        # Now mask the reference loci
        reference_loci = reference_collection.get_loci()
        filtered_loci = [
            locus for locus in reference_loci
            if not mask_collection.get_overlap(locus, 'both')
        ]
        print("FILTERED OUT {} LOCI THAT WERE MASKED IN {}"
              "".format(len(reference_loci) - len(filtered_loci), mask_file))
        reference_collection = utils.LocusCollection(filtered_loci, 50)

    # Now stitch regions
    print('STITCHING REGIONS TOGETHER')
    stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching(
        reference_collection,
        input_name,
        out_folder,
        stitch_window,
        tss_window,
        annot_file,
        remove_tss,
    )

    # Now make a stitched collection GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitched_gff = utils.locus_collection_to_gff(stitched_collection)
    # Making sure start/stop ordering are correct
    for line in stitched_gff:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitch_window)
    print(type(stitch_window))
    if not remove_tss:
        stitched_gff_file = '{}{}_{}KB_STITCHED.gff'.format(
            gff_folder,
            input_name,
            str(stitch_window / 1000),
        )
        stitched_gff_name = '{}_{}KB_STITCHED'.format(
            input_name, str(stitch_window / 1000))
        debug_out_file = '{}{}_{}KB_STITCHED.debug'.format(
            gff_folder,
            input_name,
            str(stitch_window / 1000),
        )
    else:
        stitched_gff_file = '{}{}_{}KB_STITCHED_TSS_DISTAL.gff'.format(
            gff_folder,
            input_name,
            str(stitch_window / 1000),
        )
        stitched_gff_name = '{}_{}KB_STITCHED_TSS_DISTAL'.format(
            input_name,
            str(stitch_window / 1000),
        )
        debug_out_file = '{}{}_{}KB_STITCHED_TSS_DISTAL.debug'.format(
            gff_folder,
            input_name,
            str(stitch_window / 1000),
        )

    # Writing debug output to disk
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS {}'.format(debug_out_file))
        utils.unparse_table(debug_output, debug_out_file, '\t')

    # Write the GFF to disk
    print('WRITING STITCHED GFF TO DISK AS {}'.format(stitched_gff_file))
    utils.unparse_table(stitched_gff, stitched_gff_file, '\t')

    # Setting up the overall output file
    output_file_1 = out_folder + stitched_gff_name + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  {}'.format(output_file_1))

    # Use the bamliquidator_batch executable; it is expected to be on the PATH
    bamliquidator_path = 'bamliquidator_batch'

    # Uniquify to prevent redundant mapping
    bam_file_list_unique = utils.uniquify(bam_file_list)
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bam_file_list_unique)
    for bam_file in bam_file_list_unique:

        bam_file_name = bam_file.split('/')[-1]

        # Mapping to the stitched GFF
        mapped_out_1_folder = '{}{}_{}_MAPPED'.format(
            mapped_folder,
            stitched_gff_name,
            bam_file_name,
        )
        mapped_out_1_file = '{}{}_{}_MAPPED/matrix.txt'.format(
            mapped_folder,
            stitched_gff_name,
            bam_file_name,
        )
        if utils.check_output(mapped_out_1_file, 0.2, 0.2):
            print("FOUND {} MAPPING DATA FOR BAM: {}".format(
                stitched_gff_file, mapped_out_1_file))
        else:
            cmd1 = "{} --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format(
                bamliquidator_path,
                stitched_gff_file,
                mapped_out_1_folder,
                bam_file,
            )
            print(cmd1)

            os.system(cmd1)
            if utils.check_output(mapped_out_1_file, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO {} FROM BAM: {}"
                      "".format(stitched_gff_file, bam_file_name))
            else:
                print("ERROR: FAILED TO MAP {} FROM BAM: {}"
                      "".format(stitched_gff_file, bam_file_name))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # Calculate density by region
    # TODO: Need to fix this function to account for different outputs of liquidator
    rose2_utils.map_collection(
        stitched_collection,
        reference_collection,
        bam_file_list,
        mapped_folder,
        output_file_1,
        ref_name=stitched_gff_name,
    )

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    control_name = control.split('/')[-1] if control else 'NONE'

    cmd = 'ROSE2_callSuper.R {} {} {} {}'.format(
        out_folder,
        output_file_1,
        input_name,
        control_name,
    )

    print(cmd)

    os.system(cmd)

    # Calling the gene mapper
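    # Brief pause, presumably to give the R script time to finish writing its tables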
    time.sleep(5)
    tables = [
        "_SuperEnhancers.table.txt",
        "_StretchEnhancers.table.txt",
        "_SuperStretchEnhancers.table.txt",
    ]
    for table in tables:
        table_file = "{}{}".format(input_name, table)
        genemapper.map(
            os.path.join(out_folder, table_file),
            genome,
            rankby,
            control,
        )
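

# rose2_utils.get_bam_chrom_list is not shown here. A plausible sketch of its behaviour,
# modelled on the inline `samtools idxstats` parsing in Example #1: return the chromosome
# names shared by every indexed BAM. Assumes samtools is on the PATH and each BAM has a
# .bai index; the function name below is a stand-in, not the module's API.
import subprocess


def _get_bam_chrom_list_sketch(bam_file_list):
    """Return the chromosomes reported by samtools idxstats for all BAMs in the list."""
    shared_chroms = None
    for bam_file in bam_file_list:
        out = subprocess.run(
            ['samtools', 'idxstats', bam_file],
            check=True, capture_output=True, text=True,
        ).stdout
        # idxstats rows are: chrom, length, mapped, unmapped; drop the trailing '*' row
        chroms = [line.split('\t')[0] for line in out.strip().split('\n')
                  if line and not line.startswith('*')]
        shared_chroms = chroms if shared_chroms is None else [
            chrom for chrom in shared_chroms if chrom in chroms
        ]
    return shared_chroms or []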