def merge_target_site(merge_bed_path, mirna_bed_dir, mirna_list_path):
    """
    Merge all target sites of miRNAs in our miRNA list
    and save the merged target sites as a BED file
    :param merge_bed_path: a path of the file for the merged peaks
    :param mirna_bed_dir: a directory BED files for miRNA target sites are saved
    :param mirna_list_path: a path of the file containing our miRNAs' names
    """
    eprint('[LOG] Merge all bed files for miRNA target sites')
    with open(mirna_list_path, 'r') as mirna_list_file:
        mirna_names = mirna_list_file.read().splitlines()

    eprint('[LOG] --- Concatenate all bed files')
    # Build one 'cat' command over every per-miRNA BED file.
    concat_cmd = 'cat' + ''.join(
        ' %s/%s.bed' % (mirna_bed_dir, mirna_name) for mirna_name in mirna_names)
    sort_cmd = 'sort -k1,1 -k2,2n -k3,3n -V'

    concat_bed_path = '%s/temp.bed' % mirna_bed_dir  # it is a temporary file
    cmd = '%s | %s > %s;' % (concat_cmd, sort_cmd, concat_bed_path)
    os.system(cmd)

    eprint('[LOG] --- Merge all target sites using bedtools')
    merge_cmd = 'bedtools merge -s -c 4,5,6,7,8,9,10 -o distinct -i %s > %s;' % (
        concat_bed_path, merge_bed_path)
    os.system(merge_cmd)

    # remove the temporary file directly instead of shelling out to 'rm'
    os.remove(concat_bed_path)
def combine_all_chr_stats(result_dir, peak_size_dir, mirna):
    """
    Combine the peak size stats of all chromosomes and save the result
    :param result_dir: a result directory
    :param peak_size_dir: a directory files for peak size stats are stored
    :param mirna: a name of RBP
    """
    eprint('[LOG] Combine all peak size statistics of %s' % mirna)
    genic_regions = genic_region_list()
    genic_regions += ['UTR', 'all']
    chroms = chrom_list()

    peak_size_dict = {genic_region: 0 for genic_region in genic_regions}

    for chrom in chroms:
        peak_size_path = '%s/%s/%s.txt' % (peak_size_dir, chrom, mirna)

        if not os.path.isfile(peak_size_path):
            eprint('[LOG] --- \'%s\' does not exist.' % peak_size_path)
            continue

        with open(peak_size_path, 'r') as peak_size_file:
            # each line: <genic_region>\t<peak_size>
            # iterate the file lazily instead of materializing readlines()
            for line in peak_size_file:
                fields = line.strip().split('\t')
                genic_region = fields[0]
                peak_size = int(fields[1])
                peak_size_dict[genic_region] += peak_size

    eprint('[LOG] Save the result')
    with open('%s/%s.txt' % (result_dir, mirna), 'w') as peak_size_file:
        for genic_region in genic_regions:
            print(genic_region, peak_size_dict[genic_region],
                  sep='\t', file=peak_size_file)
def concat_peak_size_stats(result_dir, peak_size_dir, mirna_list_path):
    """
    Concat all peak sizes from all RBPs and save them as a file for each genic region
    :param result_dir: a result directory
    :param peak_size_dir: a directory peak size files for all RBPs are saved
    :param mirna_list_path: a path of a file containing a mirna list
    """
    eprint('[LOG] Concatenate all peak sizes for each genic region')
    with open(mirna_list_path, 'r') as mirna_list_file:
        mirnas = mirna_list_file.read().splitlines()
    mirnas.append('all_merge')

    genic_regions = genic_region_list()
    genic_regions += ['UTR', 'all']

    # one output file per genic region, kept open across all miRNAs
    result_file_dict = {
        genic_region: open('%s/%s.txt' % (result_dir, genic_region), 'w')
        for genic_region in genic_regions
    }

    try:
        for mirna in mirnas:
            peak_size_file_path = '%s/%s.txt' % (peak_size_dir, mirna)

            with open(peak_size_file_path, 'r') as peak_size_file:
                # each line: <genic_region>\t<peak_size>
                for line in peak_size_file:
                    fields = line.strip().split('\t')
                    genic_region = fields[0]
                    peak_size = int(fields[1])
                    print(mirna, peak_size, sep='\t',
                          file=result_file_dict[genic_region])
    finally:
        # close the output files even if an input file is missing or malformed
        for genic_region in genic_regions:
            result_file_dict[genic_region].close()
def parse_peak_file(result_dir, bed_file_path, anno_dir, chrom):
    """
    Make MutPeak objects by parsing the BED file and save the objects
    :param result_dir: a directory parsed peak will be saved.
    :param bed_file_path: a file that has a narrow peak bed file format
    :param anno_dir: a directory genome annotation data is saved.
    :param chrom: a chromosome ID
    """
    eprint('[LOG] Parse the peaks from \'%s\'' % bed_file_path)
    eprint('[LOG] ---- Chromosome ID: %s' % chrom)

    # Collect only the entries of this chromosome and turn each into a 'MutPeak'
    chrom_prefix = '%s\t' % chrom
    peaks = []

    with open(bed_file_path, 'r') as bed_file:
        for bed_entry in bed_file:
            if not bed_entry.startswith(chrom_prefix):
                continue

            mut_peak = MutPeak()
            mut_peak.parse_peak_entry(bed_entry.strip())
            peaks.append(mut_peak)

    # Annotate the genic regions on every collected peak
    chr_size = genome.get_chr_size(chrom)

    # TODO: change the algorithm of 'anno_peak' to reduce I/O.
    for mut_peak in peaks:
        anno_peak(anno_dir, mut_peak, chr_size)

    # Store the annotated peaks as a pickle under a per-chromosome directory
    chr_result_dir = '%s/%s' % (result_dir, chrom)
    os.makedirs(chr_result_dir, exist_ok=True)
    result_filename = os.path.basename(bed_file_path).replace('.bed', '.dat')
    result_file_path = '%s/%s' % (chr_result_dir, result_filename)

    with open(result_file_path, 'wb') as result_file:
        pickle.dump(peaks, result_file)
def main():
    """ Bootstrap: submit peak-size jobs to the qsub scheduler, one step at a time """
    # qsub settings
    script = os.path.abspath(__file__)
    queue = 'workq'
    is_test = False  # if True, only print commands instead of submitting them
    job_name = 'Minu.Get.miRNA.TS.Size'
    log_dir = '%s/log/%s/%s' % (PROJECT_DIR, job_name, time_stamp())

    if not is_test:
        os.makedirs(log_dir, exist_ok=True)

    # param settings
    chroms = chrom_list()
    cons_score_cutoff = 2.0  # phyloP conservation score cutoff
    step = 1  # which pipeline step to run (1, 2, or 3)

    # path settings
    # %s in 'phylop_path_format': a chromosome ID
    phylop_path_format = '/extdata6/Minwoo/data/phyloP/{0}/100way-data/%s.phyloP100way.dat'.format(
        GENOME_VER)
    mirna_list_path = '%s/data/cons_mature_human_list.txt' % PROJECT_DIR
    target_site_dir = '%s/results/target-sites/%s' % (PROJECT_DIR, GENOME_VER)
    peak_data_dir = '%s/peak-data' % target_site_dir
    peak_size_dir = '%s/peak-size/phyloP-%.1f' % (
        target_site_dir, cons_score_cutoff)  # a result directory
    all_chr_peak_size_dir = '%s/all' % peak_size_dir

    if not os.path.isdir(peak_data_dir):
        eprint('[ERROR] in %s' % caller_file_and_line())
        sys.exit('\t\'%s\' does not exist. Run 03_parser.py.' % peak_data_dir)

    with open(mirna_list_path, 'r') as mirna_list_file:
        mirnas = mirna_list_file.read().splitlines()

    mirnas.append('high_cons_mir')  # merged miRNA target sites

    if step == 1:  # get peak sizes for binding sites of each RBP on each chromosome
        for mirna in mirnas:
            # accumulate chromosome commands and flush them 4 at a time
            cmd = ''
            cmd_cnt = 0
            cmd_idx = 1

            for chrom in chroms:
                # miRNA and chromosome-specific path settings
                chr_peak_data_dir = '%s/%s' % (peak_data_dir, chrom)
                chr_peak_size_dir = '%s/%s' % (peak_size_dir, chrom)
                os.makedirs(chr_peak_size_dir, exist_ok=True)

                peak_data_path = '%s/%s.dat' % (chr_peak_data_dir, mirna)
                peak_size_path = '%s/%s.txt' % (chr_peak_size_dir, mirna)
                cons_score_path = phylop_path_format % chrom

                cmd += '%s make_peak_size_stats %s %s %s %s %s %.1f;' % \
                       (script, peak_size_path, peak_data_path, chrom, 'True',
                        cons_score_path, cons_score_cutoff)
                cmd_cnt += 1

                if cmd_cnt == 4:  # one qsub job per 4 chromosomes
                    if is_test:
                        print(cmd)
                    else:
                        one_job_name = '%s.%s.%s' % (job_name, mirna, cmd_idx)
                        one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
                        os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s'
                                  % (cmd, one_log_path, queue, one_job_name))

                    # reset
                    cmd = ''
                    cmd_cnt = 0
                    cmd_idx += 1
            # NOTE(review): if len(chroms) is not a multiple of 4, leftover
            # commands in 'cmd' are never submitted — TODO confirm chrom_list()
            # always returns a multiple-of-4 number of chromosomes.

    if step == 2:  # combine results from all chromosomes
        job_name = 'Minu.Combine.All.Chr.Peak.Size'
        log_dir = '%s/log/%s/%s' % (PROJECT_DIR, job_name, time_stamp())

        if not is_test:
            os.makedirs(log_dir, exist_ok=True)

        os.makedirs(all_chr_peak_size_dir, exist_ok=True)  # make a result directory for this step

        for mirna in mirnas:
            cmd = '%s combine_all_chr_stats %s %s %s' % (
                script, all_chr_peak_size_dir, peak_size_dir, mirna)

            if is_test:
                print(cmd)
            else:
                one_job_name = '%s.%s' % (job_name, mirna)
                one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
                os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s'
                          % (cmd, one_log_path, queue, one_job_name))

    if step == 3:  # concat all peak sizes of all miRNA target sites for each gene-based annotation
        job_name = 'Minu.Concat.Peak.Size.by.Anno'
        log_dir = '%s/log/%s/%s' % (PROJECT_DIR, job_name, time_stamp())

        if not is_test:
            os.makedirs(log_dir, exist_ok=True)

        concat_peak_size_dir = '%s/by-anno' % peak_size_dir
        os.makedirs(concat_peak_size_dir, exist_ok=True)

        cmd = '%s concat_peak_size_stats %s %s %s' % \
              (script, concat_peak_size_dir, all_chr_peak_size_dir, mirna_list_path)

        if is_test:
            print(cmd)
        else:
            one_job_name = job_name
            one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
            os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s'
                      % (cmd, one_log_path, queue, one_job_name))
def make_peak_size_stats(peak_size_path, peak_data_path, chrom, only_repr,
                         cons_score_path=None, cons_score_cutoff=-14.0):
    """
    Get the peak sizes from the peak data and save the result as a file
    :param peak_size_path: a path of the result
    :param peak_data_path: a path of the peak data
    :param chrom: a chromosome ID
    :param only_repr: 'True'/'False' string (CLI argument);
                      if 'True', only consider representative genic region
    :param cons_score_path: a path of a file containing an array of
                            conservation scores in the same chromosome
    :param cons_score_cutoff: a float (-14.0 is the minimum)
    """
    eprint('[LOG] Get the peak size from the peak data')
    # CLI arguments arrive as strings; parse them explicitly.
    # (was: eval(only_repr) — eval on an externally supplied string is unsafe)
    only_repr = (only_repr == 'True')
    cons_score_cutoff = float(cons_score_cutoff)

    genic_regions = genic_region_list()
    chr_size = genome.get_chr_size(chrom)

    if cons_score_path is None:
        # no score file: fill with the cutoff so every position passes the filter
        chr_cons_scores = array.array('f', [cons_score_cutoff] * chr_size)
    else:
        with open(cons_score_path, 'rb') as infile:
            chr_cons_scores = array.array('f', [])
            chr_cons_scores.fromfile(infile, chr_size)

    if not os.path.isfile(peak_data_path):
        eprint('[LOG] --- \'%s\' does not exist.' % peak_data_path)
        return

    with open(peak_data_path, 'rb') as peak_data_file:
        peaks = pickle.load(peak_data_file)

    peak_size_dict = {genic_region: 0 for genic_region in genic_regions}
    peak_size_dict['UTR'] = 0  # UTR: all UTR (5'UTR or 3'UTR)
    peak_size_dict['all'] = 0  # all genic regions

    for peak in peaks:
        peak_cons_scores = chr_cons_scores[peak.start:peak.end]
        anno_vals = peak.get_anno_vals()

        # count one per base whose conservation score passes the cutoff
        for i, anno_val in enumerate(anno_vals):
            if peak_cons_scores[i] >= cons_score_cutoff:
                peak_size_dict['all'] += 1

                if only_repr:
                    both_utr, repr_genic_region = get_repr_anno(anno_val)

                    if both_utr:
                        # NOTE(review): the both-UTR branch does not bump 'UTR'
                        # while the single-UTR branch does — TODO confirm intended.
                        peak_size_dict['5UTR'] += 1
                        peak_size_dict['3UTR'] += 1
                    else:
                        peak_size_dict[repr_genic_region] += 1

                        if repr_genic_region.endswith('UTR'):
                            peak_size_dict['UTR'] += 1
                else:
                    anno_dict = parse_anno_val(anno_val)

                    for genic_region in genic_regions:
                        if anno_dict[genic_region]:
                            peak_size_dict[genic_region] += 1

                            if genic_region.endswith('UTR'):
                                peak_size_dict['UTR'] += 1

    eprint('[LOG] Save the result')
    genic_regions += ['UTR', 'all']

    with open(peak_size_path, 'w') as peak_size_file:
        for genic_region in genic_regions:
            print(genic_region, peak_size_dict[genic_region],
                  sep='\t', file=peak_size_file)
def make_mirna_target_bed(result_dir, mirna_fa_path, mirna_list_path, refflat_data_path):
    """
    Make BED files documented miRNA target sites for representative isoforms
    :param result_dir: a result directory
    :param mirna_fa_path: a path of miRNA fasta file from miRBase
    :param mirna_list_path: a path of a list of miRNAs currently focused on
    :param refflat_data_path: a path of the data file (.dat) containing
                              a list of genes (representative isoforms)
    """
    eprint('[LOG] Make bed files for miRNA target sites')
    mirnas = get_mirnas(mirna_fa_path, mirna_list_path)

    with open(refflat_data_path, 'rb') as gene_file:
        genes = pickle.load(gene_file)

    for mirna in mirnas:
        eprint('[LOG] --- miRNA: %s' % mirna.name)
        target_site_peaks = []  # element: a 'NarrowPeak' object

        for gene in genes:
            coords_3utr = get_3utr_coord(gene)
            rna_3utr_seq = Seq(gene.seq_3utr.replace('T', 'U'))
            target_sites = mirna.find_targetsites(rna_3utr_seq, 'cst')

            # convert target site coordinates to peaks
            for target_site in target_sites:
                if target_site.type == '6mer':  # skip the 6mer
                    continue

                target_coords = find_target_coord(target_site, coords_3utr)

                if len(target_coords) == 1:
                    target_name = '%s;%s;%s;%d' % (gene.symbol, gene.id,
                                                   target_site.type, 0)
                    target_peak = NarrowPeak(gene.chrom, target_coords[0][0],
                                             target_coords[0][1], gene.strand,
                                             name=target_name)
                    target_site_peaks.append(target_peak)
                else:
                    # the target site spans multiple coordinate fragments:
                    # emit one peak per fragment, numbered from 1
                    for i, coord in enumerate(target_coords):
                        target_name = '%s;%s;%s;%d' % (gene.symbol, gene.id,
                                                       target_site.type, i + 1)
                        # BUGFIX: use each fragment's own coordinates;
                        # previously target_coords[0] was reused for every
                        # fragment and the loop variable 'coord' went unused.
                        target_peak = NarrowPeak(gene.chrom, coord[0], coord[1],
                                                 gene.strand, name=target_name)
                        target_site_peaks.append(target_peak)

        target_site_peaks.sort(
            key=lambda peak: (peak.chrom[3:], peak.start, peak.end))

        # make bed files for the target sites
        mirna_bed_path = '%s/%s.bed' % (result_dir, mirna.name)

        with open(mirna_bed_path, 'w') as mirna_bed_file:
            for target_site_peak in target_site_peaks:
                print(target_site_peak, file=mirna_bed_file)

    eprint()
def main():
    """ Bootstrap: submit miRNA target-site parsing jobs to the qsub scheduler """
    # settings for a job scheduler
    script = os.path.abspath(__file__)
    queue = 'workq'
    is_test = False  # if True, only print commands instead of submitting them
    job_name = 'Minu.Parse.miRNA.Target'
    log_dir = '%s/log/%s/%s' % (PROJECT_DIR, job_name, time_stamp())

    if not is_test:
        os.makedirs(log_dir, exist_ok=True)

    # path settings
    anno_dir = '/extdata6/Minwoo/projects/repr-gene/results/genome-anno/%s' % GENOME_VER
    mirna_list_path = '%s/data/cons_mature_human_list.txt' % PROJECT_DIR
    target_site_dir = '%s/results/target-sites/%s' % (PROJECT_DIR, GENOME_VER)
    mirna_ts_bed_dir = '%s/bed' % target_site_dir
    mirna_ts_data_dir = '%s/peak-data' % target_site_dir  # a result directory

    if not os.path.isdir(anno_dir):
        eprint('[ERROR] in %s' % caller_file_and_line())
        sys.exit(
            '\t\'%s\' does not exist. Run repr-gene/03_anno_genome.py first.' % anno_dir)

    if not os.path.isdir(mirna_ts_bed_dir):
        eprint('[ERROR] in %s' % caller_file_and_line())
        sys.exit('\t\'%s\' do not exist. Run 02_target_site.py first.' % mirna_ts_bed_dir)

    os.makedirs(mirna_ts_data_dir, exist_ok=True)

    with open(mirna_list_path, 'r') as mirna_list_file:
        mirnas = mirna_list_file.read().splitlines()

    chroms = chrom_list()

    # Target sites of an individual miRNA
    for mirna in mirnas:
        mirna_ts_bed_path = '%s/%s.bed' % (mirna_ts_bed_dir, mirna)
        assert os.path.isfile(mirna_ts_bed_path)

        cmd = ''
        cmd_cnt = 0
        cmd_idx = 1

        for chrom in chroms:
            cmd += '%s parse_peak_file %s %s %s %s;' % \
                   (script, mirna_ts_data_dir, mirna_ts_bed_path, anno_dir, chrom)
            cmd_cnt += 1

            if cmd_cnt == 4:  # one job for 4 chromosomes
                if is_test:
                    print(cmd)
                else:
                    one_job_name = '%s.%s.%s' % (job_name, mirna, cmd_idx)
                    one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
                    os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s'
                              % (cmd, one_log_path, queue, one_job_name))

                # reset
                cmd = ''
                cmd_cnt = 0
                cmd_idx += 1
        # NOTE(review): leftover commands in 'cmd' (when len(chroms) is not a
        # multiple of 4) are never submitted — TODO confirm this is intended.

    # Parsing merged miRNA target sites
    merged_mirna_bed_path = '%s/high_cons_mir.bed' % mirna_ts_bed_dir
    assert os.path.isfile(merged_mirna_bed_path)

    for chrom in chroms:
        cmd = '%s parse_peak_file %s %s %s %s' % \
              (script, mirna_ts_data_dir, merged_mirna_bed_path, anno_dir, chrom)

        if is_test:
            print(cmd)
        else:
            one_job_name = '%s.%s.%s' % (job_name, 'All', chrom)
            one_log_path = '%s/%s.txt' % (log_dir, one_job_name)
            # BUGFIX: submit with 'one_job_name'; previously the generic
            # 'job_name' was passed to qsub -N so every merged-site job shared
            # one name and the computed 'one_job_name' went unused.
            os.system('echo "%s" | qsub -j oe -o %s -q %s -N %s'
                      % (cmd, one_log_path, queue, one_job_name))
def anno_peak(anno_dir, peak, chr_size):
    """
    Read data about the genic regions of the peak
    and save the data on the object for the peak
    :param anno_dir: a directory genome annotation data is saved.
    :param peak: a 'MutPeak' object
    :param chr_size: a size of chromosome ID same with the 'chrom' of the peak
                     this parameter is necessary to know the end position of
                     the last chromosomal fragment.
    """
    assert peak.__class__.__name__ == 'MutPeak'
    peak_start, peak_end = peak.get_position()
    assert peak_end <= chr_size

    # annotation files are split by strand: 'top' (+) / 'btm' (-)
    if peak.strand == '+':
        peak_strand = 'top'
    else:  # '-'
        peak_strand = 'btm'

    bin_size = 1000000
    start_digit = int(peak_start / bin_size)  # Most significant digit
    end_digit = int(peak_end / bin_size)
    digit_diff = end_digit - start_digit

    # Split the peak with consideration of the bin sizes of the genome annotation files
    peak_frags = []  # A list of tuple (peak_frag_start, peak_frag_end)
    peak_frag_start = peak_start
    peak_frag_end = start_digit * bin_size

    for i in range(digit_diff):
        peak_frag_end += bin_size
        peak_frags.append((peak_frag_start, peak_frag_end))
        peak_frag_start = peak_frag_end

    peak_frag_end = peak_end
    peak_frags.append((peak_frag_start, peak_frag_end))

    # Read the array of genic regions and concatenate them
    chr_anno_dir = '%s/%s' % (anno_dir, peak.chrom)

    if not os.path.isdir(anno_dir):
        # NOTE(review): this validates 'anno_dir' although reads go through
        # 'chr_anno_dir' — TODO confirm which directory is meant to be checked.
        eprint('[ERROR] in %s: %s does not exist.' % (caller_file_and_line(), anno_dir))
        sys.exit()

    total_frag_len = 0
    anno_val_arr = []

    for peak_frag_start, peak_frag_end in peak_frags:
        # Chromosomal bins: 0-1000000, 1000000-2000000, ...
        chr_bin_start = int(peak_frag_start / bin_size) * bin_size
        chr_bin_end = chr_bin_start + bin_size

        if chr_bin_end > chr_size:
            chr_bin_end = chr_size

        if peak_frag_end < chr_bin_end:
            frag_len = peak_frag_end - peak_frag_start
        else:
            frag_len = chr_bin_end - peak_frag_start

        total_frag_len += frag_len

        frag_anno_val_arr = array.array('i', [])
        arr_item_size = frag_anno_val_arr.itemsize

        # FIX: open the annotation file via a context manager so it is closed
        # even if the seek/read raises (previously a bare open/close pair)
        with open('%s/%d_%d_%s.dat' % (chr_anno_dir, chr_bin_start,
                                       chr_bin_end, peak_strand), 'rb') as anno_file:
            # seek to the fragment's offset relative to the bin start
            anno_file.seek((peak_frag_start - chr_bin_start) * arr_item_size)
            frag_anno_val_arr.fromfile(anno_file, frag_len)

        anno_val_arr += frag_anno_val_arr

    peak.gene_based_anno(list(anno_val_arr))
    assert total_frag_len == (peak_end - peak_start)
import re # input path settings mirbase_file_path = '/extdata6/Minwoo/data/miRNA/mature.fa' targetscan_fam_info_path = '/extdata6/Minwoo/data/miRNA/miR_Family_Info.txt' # result path settings pre_proc_dir = '%s/data' % PROJECT_DIR os.makedirs(pre_proc_dir, exist_ok=True) human_mir_file_path = '%s/mature_human.fa' % pre_proc_dir human_mir_list_path = '%s/mature_human_list.txt' % pre_proc_dir cons_human_mir_list_path = '%s/cons_mature_human_list.txt' % pre_proc_dir # conserved miRNAs # Parse the mature.fa eprint('[LOG] Extract human miRNAs from mirBase') mirna_info = [] # element: (header, seq) with open(mirbase_file_path, 'r') as mirbase_file: while True: header = mirbase_file.readline() if not header: # EOF break seq = mirbase_file.readline() mirna_info.append((header.strip(), seq.strip())) eprint('[LOG] The number of miRNAs from mirBase: %d' % len(mirna_info)) # Collect only human miRNAs with an annotation number less than 1000