Example 1
def merge_target_site(merge_bed_path, mirna_bed_dir, mirna_list_path):
    """
    Merge all target sites of miRNAs in our miRNA list and save the merged target sites as a BED file
    :param merge_bed_path: a path of the file for the merged peaks
    :param mirna_bed_dir: a directory where BED files for miRNA target sites are saved
    :param mirna_list_path: a path of the file containing our miRNAs' names
    """
    eprint('[LOG] Merge all bed files for miRNA target sites')
    with open(mirna_list_path, 'r') as mirna_list_file:
        mirna_names = mirna_list_file.read().splitlines()

    eprint('[LOG] --- Concatenate all bed files')
    concat_cmd = 'cat'
    sort_cmd = 'sort -k1,1 -k2,2n -k3,3n -V'

    for mirna_name in mirna_names:
        concat_cmd += ' %s/%s.bed' % (mirna_bed_dir, mirna_name)

    concat_bed_path = '%s/temp.bed' % mirna_bed_dir  # it is a temporary file
    cmd = '%s | %s > %s;' % (concat_cmd, sort_cmd, concat_bed_path)
    os.system(cmd)

    eprint('[LOG] --- Merge all target sites using bedtools')
    merge_cmd = 'bedtools merge -s -c 4,5,6,7,8,9,10 -o distinct -i %s > %s;' % (
        concat_bed_path, merge_bed_path)
    os.system(merge_cmd)
    os.system('rm %s' % concat_bed_path)
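For reference, here is a minimal sketch of the same cat | sort | bedtools merge pipeline using subprocess instead of os.system, so file paths are passed as argument lists rather than interpolated into a shell string. The helper name and the availability of bedtools and GNU sort on PATH are assumptions, not part of the original script.

import os
import subprocess

def merge_target_site_subprocess(merge_bed_path, mirna_bed_dir, mirna_names):
    """Hypothetical variant of merge_target_site built on subprocess."""
    bed_paths = ['%s/%s.bed' % (mirna_bed_dir, name) for name in mirna_names]
    concat_bed_path = '%s/temp.bed' % mirna_bed_dir  # temporary file

    # cat <all BED files> | sort -k1,1 -k2,2n -k3,3n -V > temp.bed
    with open(concat_bed_path, 'w') as concat_bed:
        cat_proc = subprocess.Popen(['cat'] + bed_paths, stdout=subprocess.PIPE)
        subprocess.run(['sort', '-k1,1', '-k2,2n', '-k3,3n', '-V'],
                       stdin=cat_proc.stdout, stdout=concat_bed, check=True)
        cat_proc.stdout.close()
        cat_proc.wait()

    # bedtools merge -s -c 4,5,6,7,8,9,10 -o distinct -i temp.bed > merged.bed
    with open(merge_bed_path, 'w') as merge_bed:
        subprocess.run(['bedtools', 'merge', '-s', '-c', '4,5,6,7,8,9,10',
                        '-o', 'distinct', '-i', concat_bed_path],
                       stdout=merge_bed, check=True)

    os.remove(concat_bed_path)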
Example 2
def combine_all_chr_stats(result_dir, peak_size_dir, mirna):
    """
    Combine the peak size stats of all chromosomes and save the result
    :param result_dir: a result directory
    :param peak_size_dir: a directory where files for peak size stats are stored
    :param mirna: a name of a miRNA
    """
    eprint('[LOG] Combine all peak size statistics of %s' % mirna)
    genic_regions = genic_region_list()
    genic_regions += ['UTR', 'all']
    chroms = chrom_list()

    peak_size_dict = {genic_region: 0 for genic_region in genic_regions}

    for chrom in chroms:
        peak_size_path = '%s/%s/%s.txt' % (peak_size_dir, chrom, mirna)

        if not os.path.isfile(peak_size_path):
            eprint('[LOG] --- \'%s\' does not exist.' % peak_size_path)
            continue

        with open(peak_size_path, 'r') as peak_size_file:
            for line in peak_size_file:
                fields = line.strip().split('\t')
                genic_region = fields[0]
                peak_size = int(fields[1])
                peak_size_dict[genic_region] += peak_size

    eprint('[LOG] Save the result')
    with open('%s/%s.txt' % (result_dir, mirna), 'w') as peak_size_file:
        for genic_region in genic_regions:
            print(genic_region,
                  peak_size_dict[genic_region],
                  sep='\t',
                  file=peak_size_file)
Example 3
def concat_peak_size_stats(result_dir, peak_size_dir, mirna_list_path):
    """
    Concatenate all peak sizes from all RBPs and save them as one file per genic region
    :param result_dir: a result directory
    :param peak_size_dir: a directory where peak size files for all RBPs are saved
    :param mirna_list_path: a path of a file containing a miRNA list
    """
    eprint('[LOG] Concatenate all peak sizes for each genic region')
    with open(mirna_list_path, 'r') as mirna_list_file:
        mirnas = mirna_list_file.read().splitlines()

    mirnas.append('all_merge')

    genic_regions = genic_region_list()
    genic_regions += ['UTR', 'all']

    result_file_dict = {
        genic_region: open('%s/%s.txt' % (result_dir, genic_region), 'w')
        for genic_region in genic_regions
    }

    for mirna in mirnas:
        peak_size_file_path = '%s/%s.txt' % (peak_size_dir, mirna)

        with open(peak_size_file_path, 'r') as peak_size_file:
            for line in peak_size_file:
                fields = line.strip().split('\t')
                genic_region = fields[0]
                peak_size = int(fields[1])

                print(mirna,
                      peak_size,
                      sep='\t',
                      file=result_file_dict[genic_region])

    for genic_region in genic_regions:
        result_file_dict[genic_region].close()
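Because this function keeps one output file open per genic region, a variant using contextlib.ExitStack would close them all even if an exception interrupts the loop. This is a minimal sketch, not the project's code; write_peak_sizes is a hypothetical stand-in for the inner read-and-print loop above.

from contextlib import ExitStack

def concat_peak_size_stats_safe(result_dir, peak_size_dir, mirnas, genic_regions):
    """Hypothetical variant of concat_peak_size_stats using ExitStack."""
    with ExitStack() as stack:
        result_file_dict = {
            genic_region: stack.enter_context(
                open('%s/%s.txt' % (result_dir, genic_region), 'w'))
            for genic_region in genic_regions
        }

        for mirna in mirnas:
            peak_size_file_path = '%s/%s.txt' % (peak_size_dir, mirna)
            write_peak_sizes(mirna, peak_size_file_path, result_file_dict)  # hypothetical helper
    # every per-region file is closed here, even on error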
Example 4
def parse_peak_file(result_dir, bed_file_path, anno_dir, chrom):
    """
    Make MutPeak objects by parsing the BED file and save the objects
    :param result_dir: a directory where the parsed peaks will be saved
    :param bed_file_path: a path of a BED file in narrowPeak format
    :param anno_dir: a directory where genome annotation data is saved
    :param chrom: a chromosome ID
    """
    eprint('[LOG] Parse the peaks from \'%s\'' % bed_file_path)
    eprint('[LOG] ---- Chromosome ID: %s' % chrom)
    peaks = []

    # Parse the NarrowPeak bed file and make 'MutPeak' objects
    with open(bed_file_path, 'r') as bed_file:
        for line in bed_file:
            if line.startswith('%s\t' % chrom):
                peak = MutPeak()
                peak.parse_peak_entry(line.strip())
                peaks.append(peak)

    # parse the genic regions on the peaks
    chr_size = genome.get_chr_size(chrom)

    # TODO: change the algorithm of 'anno_peak' to reduce I/O.
    for peak in peaks:
        anno_peak(anno_dir, peak, chr_size)

    # save the result
    chr_result_dir = '%s/%s' % (result_dir, chrom)
    os.makedirs(chr_result_dir, exist_ok=True)

    result_filename = os.path.basename(bed_file_path).replace('.bed', '.dat')
    result_file_path = '%s/%s' % (chr_result_dir, result_filename)

    with open(result_file_path, 'wb') as result_file:
        pickle.dump(peaks, result_file)
Example 5
def main():
    # qsub settings
    script = os.path.abspath(__file__)
    queue = 'workq'
    is_test = False

    job_name = 'Minu.Get.miRNA.TS.Size'
    log_dir = '%s/log/%s/%s' % (PROJECT_DIR, job_name, time_stamp())

    if not is_test:
        os.makedirs(log_dir, exist_ok=True)

    # param settings
    chroms = chrom_list()
    cons_score_cutoff = 2.0
    step = 1

    # path settings
    # %s in 'phylop_path_format': a chromosome ID
    phylop_path_format = '/extdata6/Minwoo/data/phyloP/{0}/100way-data/%s.phyloP100way.dat'.format(
        GENOME_VER)
    mirna_list_path = '%s/data/cons_mature_human_list.txt' % PROJECT_DIR
    target_site_dir = '%s/results/target-sites/%s' % (PROJECT_DIR, GENOME_VER)
    peak_data_dir = '%s/peak-data' % target_site_dir
    peak_size_dir = '%s/peak-size/phyloP-%.1f' % (
        target_site_dir, cons_score_cutoff)  # a result directory
    all_chr_peak_size_dir = '%s/all' % peak_size_dir

    if not os.path.isdir(peak_data_dir):
        eprint('[ERROR] in %s' % caller_file_and_line())
        sys.exit('\t\'%s\' does not exist. Run 03_parser.py.' % peak_data_dir)

    with open(mirna_list_path, 'r') as mirna_list_file:
        mirnas = mirna_list_file.read().splitlines()

    mirnas.append('high_cons_mir')  # merged miRNA target sites

    if step == 1:  # get peak sizes for binding sites of each RBP on each chromosome
        for mirna in mirnas:
            cmd = ''
            cmd_cnt = 0
            cmd_idx = 1

            for chrom in chroms:
                # miRNA and chromosome-specific path settings
                chr_peak_data_dir = '%s/%s' % (peak_data_dir, chrom)
                chr_peak_size_dir = '%s/%s' % (peak_size_dir, chrom)
                os.makedirs(chr_peak_size_dir, exist_ok=True)

                peak_data_path = '%s/%s.dat' % (chr_peak_data_dir, mirna)
                peak_size_path = '%s/%s.txt' % (chr_peak_size_dir, mirna)
                cons_score_path = phylop_path_format % chrom

                cmd += '%s make_peak_size_stats %s %s %s %s %s %.1f;' % \
                       (script, peak_size_path, peak_data_path, chrom, 'True', cons_score_path, cons_score_cutoff)
                cmd_cnt += 1

                if cmd_cnt == 4:
                    if is_test:
                        print(cmd)
                    else:
                        one_job_name = '%s.%s.%s' % (job_name, mirna, cmd_idx)
                        one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
                        os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s' %
                                  (cmd, one_log_path, queue, one_job_name))

                    # reset
                    cmd = ''
                    cmd_cnt = 0
                    cmd_idx += 1

    if step == 2:  # combine results from all chromosomes
        job_name = 'Minu.Combine.All.Chr.Peak.Size'
        log_dir = '%s/log/%s/%s' % (PROJECT_DIR, job_name, time_stamp())

        if not is_test:
            os.makedirs(log_dir, exist_ok=True)

        os.makedirs(all_chr_peak_size_dir,
                    exist_ok=True)  # make a result directory for this step

        for mirna in mirnas:
            cmd = '%s combine_all_chr_stats %s %s %s' % (
                script, all_chr_peak_size_dir, peak_size_dir, mirna)

            if is_test:
                print(cmd)
            else:
                one_job_name = '%s.%s' % (job_name, mirna)
                one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
                os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s' %
                          (cmd, one_log_path, queue, one_job_name))

    if step == 3:  # concat all peak sizes of all miRNA target sites for each gene-based annotation
        job_name = 'Minu.Concat.Peak.Size.by.Anno'
        log_dir = '%s/log/%s/%s' % (PROJECT_DIR, job_name, time_stamp())

        if not is_test:
            os.makedirs(log_dir, exist_ok=True)

        concat_peak_size_dir = '%s/by-anno' % peak_size_dir
        os.makedirs(concat_peak_size_dir, exist_ok=True)

        cmd = '%s concat_peak_size_stats %s %s %s' % \
              (script, concat_peak_size_dir, all_chr_peak_size_dir, mirna_list_path)

        if is_test:
            print(cmd)
        else:
            one_job_name = job_name
            one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
            os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s' %
                      (cmd, one_log_path, queue, one_job_name))
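The per-miRNA loop in step 1 batches four chromosome commands into one qsub job. The same grouping can be expressed with a generic chunking helper; this is a sketch only, where submit_job and make_one_cmd are hypothetical stand-ins for the echo | qsub one-liner and the per-chromosome command builder above, and the chunk size of 4 mirrors the loop.

def chunked(items, size):
    """Yield consecutive groups of at most 'size' items (the last group may be shorter)."""
    for i in range(0, len(items), size):
        yield items[i:i + size]

# usage sketch: one qsub job per group of 4 chromosomes
# for cmd_idx, chrom_group in enumerate(chunked(chroms, 4), start=1):
#     cmd = ''.join(make_one_cmd(chrom) for chrom in chrom_group)   # hypothetical command builder
#     submit_job(cmd, '%s.%s.%s' % (job_name, mirna, cmd_idx))      # hypothetical qsub wrapper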
Example 6
def make_peak_size_stats(peak_size_path,
                         peak_data_path,
                         chrom,
                         only_repr,
                         cons_score_path=None,
                         cons_score_cutoff=-14.0):
    """
    Get the peak sizes from the peak data and save the result as a file
    :param peak_size_path: a path of the result
    :param peak_data_path: a path of the peak data
    :param chrom: a chromosome ID
    :param only_repr: if True, only consider representative genic region
    :param cons_score_path: a path of a file containing an array of conservation scores in the same chromosome
    :param cons_score_cutoff: a float (-14.0 is the minimum)
    """
    eprint('[LOG] Get the peak size from the peak data')
    only_repr = eval(only_repr)  # passed in as a string ('True'/'False') from the command line
    cons_score_cutoff = float(cons_score_cutoff)
    genic_regions = genic_region_list()
    chr_size = genome.get_chr_size(chrom)

    if cons_score_path is None:
        chr_cons_scores = array.array('f', [cons_score_cutoff] * chr_size)
    else:
        with open(cons_score_path, 'rb') as infile:
            chr_cons_scores = array.array('f', [])
            chr_cons_scores.fromfile(infile, chr_size)

    if not os.path.isfile(peak_data_path):
        eprint('[LOG] --- \'%s\' does not exist.' % peak_data_path)
        return

    with open(peak_data_path, 'rb') as peak_data_file:
        peaks = pickle.load(peak_data_file)

    peak_size_dict = {genic_region: 0 for genic_region in genic_regions}
    peak_size_dict['UTR'] = 0  # UTR: all UTR (5'UTR or 3'UTR)
    peak_size_dict['all'] = 0  # all genic regions

    for peak in peaks:
        peak_cons_scores = chr_cons_scores[peak.start:peak.end]
        anno_vals = peak.get_anno_vals()

        for i, anno_val in enumerate(anno_vals):
            if peak_cons_scores[i] >= cons_score_cutoff:
                peak_size_dict['all'] += 1

                if only_repr:
                    both_utr, repr_genic_region = get_repr_anno(anno_val)

                    if both_utr:
                        peak_size_dict['5UTR'] += 1
                        peak_size_dict['3UTR'] += 1
                    else:
                        peak_size_dict[repr_genic_region] += 1

                    if repr_genic_region.endswith('UTR'):
                        peak_size_dict['UTR'] += 1
                else:
                    anno_dict = parse_anno_val(anno_val)

                    for genic_region in genic_regions:
                        if anno_dict[genic_region]:
                            peak_size_dict[genic_region] += 1

                            if genic_region.endswith('UTR'):
                                peak_size_dict['UTR'] += 1

    eprint('[LOG] Save the result')
    genic_regions += ['UTR', 'all']

    with open(peak_size_path, 'w') as peak_size_file:
        for genic_region in genic_regions:
            print(genic_region,
                  peak_size_dict[genic_region],
                  sep='\t',
                  file=peak_size_file)
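The function reads the phyloP scores with array.fromfile, so the score file must be a raw binary dump of one float per base of the chromosome. A minimal sketch of how such a file could be produced is below; the wigFix parsing is omitted, and 'scores' is a hypothetical per-base list already padded to the chromosome length.

import array

def write_cons_score_file(cons_score_path, scores, chr_size):
    """Hypothetical writer for a per-chromosome phyloP score file (one 4-byte float per base)."""
    chr_cons_scores = array.array('f', scores)
    assert len(chr_cons_scores) == chr_size

    with open(cons_score_path, 'wb') as outfile:
        chr_cons_scores.tofile(outfile)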
Example 7
def make_mirna_target_bed(result_dir, mirna_fa_path, mirna_list_path,
                          refflat_data_path):
    """
    Make BED files documenting miRNA target sites for representative isoforms
    :param result_dir: a result directory
    :param mirna_fa_path: a path of miRNA fasta file from miRBase
    :param mirna_list_path: a path of a list of miRNAs currently focused on
    :param refflat_data_path: a path of the data file (.dat) containing a list of genes (representative isoforms)
    """
    eprint('[LOG] Make bed files for miRNA target sites')
    mirnas = get_mirnas(mirna_fa_path, mirna_list_path)

    with open(refflat_data_path, 'rb') as gene_file:
        genes = pickle.load(gene_file)

    for mirna in mirnas:
        eprint('[LOG] --- miRNA: %s' % mirna.name)
        target_site_peaks = []  # element: a 'NarrowPeak' object

        for gene in genes:
            coords_3utr = get_3utr_coord(gene)
            rna_3utr_seq = Seq(gene.seq_3utr.replace('T', 'U'))
            target_sites = mirna.find_targetsites(rna_3utr_seq, 'cst')

            # convert target site coordinates to peaks
            for target_site in target_sites:
                if target_site.type == '6mer':  # skip the 6mer
                    continue

                target_coords = find_target_coord(target_site, coords_3utr)

                if len(target_coords) == 1:
                    target_name = '%s;%s;%s;%d' % (gene.symbol, gene.id,
                                                   target_site.type, 0)
                    target_peak = NarrowPeak(gene.chrom,
                                             target_coords[0][0],
                                             target_coords[0][1],
                                             gene.strand,
                                             name=target_name)
                    target_site_peaks.append(target_peak)
                else:
                    for i, coord in enumerate(target_coords):
                        target_name = '%s;%s;%s;%d' % (gene.symbol, gene.id,
                                                       target_site.type, i + 1)
                        target_peak = NarrowPeak(gene.chrom,
                                                 coord[0],
                                                 coord[1],
                                                 gene.strand,
                                                 name=target_name)
                        target_site_peaks.append(target_peak)

        target_site_peaks.sort(
            key=lambda peak: (peak.chrom[3:], peak.start, peak.end))

        # make bed files for the target sites
        mirna_bed_path = '%s/%s.bed' % (result_dir, mirna.name)

        with open(mirna_bed_path, 'w') as mirna_bed_file:
            for target_site_peak in target_site_peaks:
                print(target_site_peak, file=mirna_bed_file)
    eprint()
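find_target_coord itself is not shown in these examples. As an illustration only, mapping a transcript-relative interval onto genomic exon blocks can look like the following simplified sketch (plus strand, half-open coordinates; the project's actual implementation may differ).

def map_to_genome(rel_start, rel_end, utr_blocks):
    """Map a 3'UTR-relative interval onto genomic (start, end) blocks (hypothetical sketch)."""
    genomic_coords = []
    offset = 0  # transcript-relative start of the current block

    for block_start, block_end in utr_blocks:
        block_len = block_end - block_start
        ovl_start = max(rel_start, offset)
        ovl_end = min(rel_end, offset + block_len)

        if ovl_start < ovl_end:  # the interval overlaps this exon block
            genomic_coords.append((block_start + (ovl_start - offset),
                                   block_start + (ovl_end - offset)))

        offset += block_len

    return genomic_coords

# map_to_genome(5, 15, [(100, 110), (200, 220)]) == [(105, 110), (200, 205)]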
Example 8
def main():
    """
    Bootstrap
    """
    # settings for a job scheduler
    script = os.path.abspath(__file__)
    queue = 'workq'
    is_test = False

    job_name = 'Minu.Parse.miRNA.Target'
    log_dir = '%s/log/%s/%s' % (PROJECT_DIR, job_name, time_stamp())

    if not is_test:
        os.makedirs(log_dir, exist_ok=True)

    # path settings
    anno_dir = '/extdata6/Minwoo/projects/repr-gene/results/genome-anno/%s' % GENOME_VER
    mirna_list_path = '%s/data/cons_mature_human_list.txt' % PROJECT_DIR

    target_site_dir = '%s/results/target-sites/%s' % (PROJECT_DIR, GENOME_VER)
    mirna_ts_bed_dir = '%s/bed' % target_site_dir
    mirna_ts_data_dir = '%s/peak-data' % target_site_dir  # a result directory

    if not os.path.isdir(anno_dir):
        eprint('[ERROR] in %s' % caller_file_and_line())
        sys.exit(
            '\t\'%s\' does not exist. Run repr-gene/03_anno_genome.py first.' %
            anno_dir)

    if not os.path.isdir(mirna_ts_bed_dir):
        eprint('[ERROR] in %s' % caller_file_and_line())
        sys.exit('\t\'%s\' does not exist. Run 02_target_site.py first.' %
                 mirna_ts_bed_dir)

    os.makedirs(mirna_ts_data_dir, exist_ok=True)

    with open(mirna_list_path, 'r') as mirna_list_file:
        mirnas = mirna_list_file.read().splitlines()

    chroms = chrom_list()

    # Target sites of an individual miRNA
    for mirna in mirnas:
        mirna_ts_bed_path = '%s/%s.bed' % (mirna_ts_bed_dir, mirna)
        assert os.path.isfile(mirna_ts_bed_path)

        cmd = ''
        cmd_cnt = 0
        cmd_idx = 1

        for chrom in chroms:
            cmd += '%s parse_peak_file %s %s %s %s;' % \
                   (script, mirna_ts_data_dir, mirna_ts_bed_path, anno_dir, chrom)
            cmd_cnt += 1

            if cmd_cnt == 4:  # one job for 4 chromosomes
                if is_test:
                    print(cmd)
                else:
                    one_job_name = '%s.%s.%s' % (job_name, mirna, cmd_idx)
                    one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
                    os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s' %
                              (cmd, one_log_path, queue, one_job_name))

                # reset
                cmd = ''
                cmd_cnt = 0
                cmd_idx += 1

    # Parsing merged miRNA target sites
    merged_mirna_bed_path = '%s/high_cons_mir.bed' % mirna_ts_bed_dir
    assert os.path.isfile(merged_mirna_bed_path)

    for chrom in chroms:
        cmd = '%s parse_peak_file %s %s %s %s' % \
              (script, mirna_ts_data_dir, merged_mirna_bed_path, anno_dir, chrom)

        if is_test:
            print(cmd)
        else:
            one_job_name = '%s.%s.%s' % (job_name, 'All', chrom)
            one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
            os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s' %
                      (cmd, one_log_path, queue, one_job_name))
Example 9
def anno_peak(anno_dir, peak, chr_size):
    """
    Read data about the genic regions of the peak and save the data on the object for the peak

    :param anno_dir: a directory where genome annotation data is saved
    :param peak: a 'MutPeak' object
    :param chr_size: the size of the chromosome matching the 'chrom' of the peak;
                     it is needed to know the end position of the last chromosomal bin.
    """
    assert peak.__class__.__name__ == 'MutPeak'

    peak_start, peak_end = peak.get_position()
    assert peak_end <= chr_size

    if peak.strand == '+':
        peak_strand = 'top'
    else:  # '-'
        peak_strand = 'btm'

    bin_size = 1000000
    start_digit = int(peak_start / bin_size)  # Most significant digit
    end_digit = int(peak_end / bin_size)
    digit_diff = end_digit - start_digit

    # Split the peak with consideration of the bin sizes of the genome annotation files
    peak_frags = []  # A list of tuple (peak_frag_start, peak_frag_end)
    peak_frag_start = peak_start
    peak_frag_end = start_digit * bin_size

    for i in range(digit_diff):
        peak_frag_end += bin_size
        peak_frags.append((peak_frag_start, peak_frag_end))
        peak_frag_start = peak_frag_end

    peak_frag_end = peak_end
    peak_frags.append((peak_frag_start, peak_frag_end))

    # Read the arrays of genic region values and concatenate them
    chr_anno_dir = '%s/%s' % (anno_dir, peak.chrom)

    if not os.path.isdir(chr_anno_dir):
        eprint('[ERROR] in %s: %s does not exist.' %
               (caller_file_and_line(), chr_anno_dir))
        sys.exit()

    total_frag_len = 0
    anno_val_arr = []

    for peak_frag_start, peak_frag_end in peak_frags:
        # Chromosomal bins: 0-1000000, 1000000-2000000, ...
        chr_bin_start = int(peak_frag_start / bin_size) * bin_size
        chr_bin_end = chr_bin_start + bin_size

        if chr_bin_end > chr_size:
            chr_bin_end = chr_size

        if peak_frag_end < chr_bin_end:
            frag_len = peak_frag_end - peak_frag_start
        else:
            frag_len = chr_bin_end - peak_frag_start

        total_frag_len += frag_len

        frag_anno_val_arr = array.array('i', [])
        arr_item_size = frag_anno_val_arr.itemsize

        anno_file_path = '%s/%d_%d_%s.dat' % (chr_anno_dir, chr_bin_start,
                                              chr_bin_end, peak_strand)

        with open(anno_file_path, 'rb') as anno_file:
            # seek to the fragment start relative to this chromosomal bin
            anno_file.seek((peak_frag_start - chr_bin_start) * arr_item_size)
            frag_anno_val_arr.fromfile(anno_file, frag_len)

        anno_val_arr += frag_anno_val_arr

    peak.gene_based_anno(list(anno_val_arr))
    assert total_frag_len == (peak_end - peak_start)
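The fragment-splitting loop above cuts a peak at every 1,000,000-bp bin boundary so that each fragment falls into exactly one genome-annotation file. The same idea as a standalone sketch (not part of the original module):

def split_by_bin(start, end, bin_size=1000000):
    """Split the half-open interval [start, end) at every bin boundary."""
    frags = []
    frag_start = start
    boundary = (start // bin_size + 1) * bin_size

    while boundary < end:
        frags.append((frag_start, boundary))
        frag_start = boundary
        boundary += bin_size

    frags.append((frag_start, end))
    return frags

# split_by_bin(1999990, 3000010) == [(1999990, 2000000), (2000000, 3000000), (3000000, 3000010)]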
Example 10
import re

# input path settings
mirbase_file_path = '/extdata6/Minwoo/data/miRNA/mature.fa'
targetscan_fam_info_path = '/extdata6/Minwoo/data/miRNA/miR_Family_Info.txt'

# result path settings
pre_proc_dir = '%s/data' % PROJECT_DIR
os.makedirs(pre_proc_dir, exist_ok=True)

human_mir_file_path = '%s/mature_human.fa' % pre_proc_dir
human_mir_list_path = '%s/mature_human_list.txt' % pre_proc_dir
cons_human_mir_list_path = '%s/cons_mature_human_list.txt' % pre_proc_dir  # conserved miRNAs

# Parse the mature.fa
eprint('[LOG] Extract human miRNAs from miRBase')
mirna_info = []  # element: (header, seq)

with open(mirbase_file_path, 'r') as mirbase_file:
    while True:
        header = mirbase_file.readline()

        if not header:  # EOF
            break

        seq = mirbase_file.readline()
        mirna_info.append((header.strip(), seq.strip()))

eprint('[LOG] The number of miRNAs from miRBase: %d' % len(mirna_info))

# Collect only human miRNAs with an annotation number less than 1000
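The filtering step itself is not included in this excerpt. A hedged sketch of what it might look like, assuming miRBase mature.fa headers such as '>hsa-miR-21-5p MIMAT0000076 Homo sapiens miR-21-5p' and taking the number after 'miR-'/'let-' as the annotation number (both assumptions, not the original code):

human_mirna_info = []

for header, seq in mirna_info:
    if not header.startswith('>hsa-'):  # keep human miRNAs only
        continue

    match = re.search(r'(?:miR|let)-(\d+)', header)  # hypothetical annotation-number parse

    if match is not None and int(match.group(1)) < 1000:
        human_mirna_info.append((header, seq))

eprint('[LOG] The number of human miRNAs kept: %d' % len(human_mirna_info))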