def annotate_fusion(ref_f, input_f, output_f):
    """
    Align fusion junctions to gene annotations

    Gene annotations are loaded from ref_f with parse_ref1 and fusion
    junctions from input_f with parse_bed. For every chromosome present in
    the annotation, each junction overlapping annotated genes is matched
    against every overlapping isoform with map_fusion_to_iso, and each
    successful match is written to output_f as one tab-separated line:

        chrom, start, end, 'FUSIONJUNC/<reads>', '0', strand,
        start, start, '0,0,0', fusion_info, gene, isoform, index

    (columns 7 and 8 both carry the junction start). Nothing is returned;
    progress and the number of annotated junctions are printed.
    """
    print('Start to annotate fusion junctions...')
    genes, gene_info = parse_ref1(ref_f)
    fusions, fusion_index = parse_bed(input_f)
    annotated = set()  # junctions that got at least one output line
    with open(output_f, 'w') as outf:
        for chrom in genes:
            overlaps = Interval.overlapwith(genes[chrom].interval,
                                            fusions[chrom])
            for hit in overlaps:
                # From index 2 on, entries starting with 'iso' are isoform
                # ids; the slice below treats everything after them as
                # fusion junction ids.
                isoforms = [tag for tag in hit[2:] if tag.startswith('iso')]
                for junction in hit[(2 + len(isoforms)):]:
                    read_count = junction.split()[1]
                    junc_start, junc_end = fusion_index[junction]
                    # Matches flagged as "edge" (first or last exon) are
                    # held back and written after all other matches of
                    # this junction.
                    edge_beds = []
                    for iso_id in isoforms:
                        gene, isoform, _, strand = iso_id.split()[1:]
                        exons = gene_info[iso_id]
                        iso_start = exons[0][0]
                        iso_end = exons[-1][-1]
                        # Skip isoforms the junction overshoots by more
                        # than 10 positions on either side.
                        inside = (junc_start >= iso_start - 10 and
                                  junc_end <= iso_end + 10)
                        if not inside:
                            continue
                        fusion_info, index, edge = map_fusion_to_iso(
                            junc_start, junc_end, strand, exons)
                        if not fusion_info:
                            continue
                        start_text = str(junc_start)
                        bed = '\t'.join([chrom, start_text, str(junc_end),
                                         'FUSIONJUNC/%s' % read_count,
                                         '0', strand,
                                         start_text, start_text, '0,0,0',
                                         fusion_info, gene, isoform, index])
                        if edge:
                            edge_beds.append(bed)
                        else:
                            outf.write(bed + '\n')
                            annotated.add(junction)
                    if edge_beds:
                        for bed in edge_beds:
                            outf.write(bed + '\n')
                        annotated.add(junction)
    print('Annotated %d fusion junctions!' % len(annotated))
def annotate_fusion(ref_f, input_f, output_f):
    """
    Align fusion junctions to gene annotations

    ref_f is parsed with parse_ref1 (gene annotations) and input_f with
    parse_bed (fusion junctions). Junctions overlapping annotated genes
    are tested against each overlapping isoform through map_fusion_to_iso;
    every isoform that yields fusion information produces one
    tab-separated line in output_f. Lines whose match is flagged as an
    edge (first or last exon) are written after the other lines of the
    same junction. The function returns nothing and prints the number of
    distinct junctions that were annotated.
    """
    print('Start to annotate fusion junctions...')
    genes, gene_info = parse_ref1(ref_f)  # gene annotations
    fusions, fusion_index = parse_bed(input_f)  # fusion junctions
    seen = set()
    with open(output_f, 'w') as outf:
        for chrom in genes:
            # overlap gene annotations with fusion junctions
            for itl in Interval.overlapwith(genes[chrom].interval,
                                            fusions[chrom]):
                # isoform ids are the entries (from index 2) that start
                # with 'iso'; the entries after them are junction ids
                iso_ids = [entry for entry in itl[2:]
                           if entry.startswith('iso')]
                first_junction = 2 + len(iso_ids)
                for fus in itl[first_junction:]:
                    reads = fus.split()[1]
                    fus_start, fus_end = fusion_index[fus]
                    left = str(fus_start)
                    right = str(fus_end)
                    deferred = []  # first or last exon matches
                    for iso_id in iso_ids:
                        gene_name, iso_name, _, strand = iso_id.split()[1:]
                        structure = gene_info[iso_id]
                        lower = structure[0][0] - 10
                        upper = structure[-1][-1] + 10
                        # junction exceeds the isoform boundaries
                        if fus_start < lower or fus_end > upper:
                            continue
                        fusion_info, index, edge = map_fusion_to_iso(
                            fus_start, fus_end, strand, structure)
                        if fusion_info:
                            # columns 7 and 8 both hold the junction start
                            location = '\t'.join([chrom, left, right,
                                                  'FUSIONJUNC/%s' % reads,
                                                  '0', strand, left, left,
                                                  '0,0,0'])
                            record = '\t'.join([location, fusion_info,
                                                gene_name, iso_name, index])
                            if edge:
                                deferred.append(record)
                                continue
                            outf.write(record + '\n')
                            seen.add(fus)
                    if deferred:
                        outf.writelines(record + '\n' for record in deferred)
                        seen.add(fus)
    print('Annotated %d fusion junctions!' % len(seen))
def annotate_fusion(ref_f, junc_bed, secondary_flag=0, denovo_flag=0):
    """
    Align fusion junctions to gene annotations

    Annotations come from parse_ref(ref_f, 1) and junctions from
    parse_bed(junc_bed). Results are written to the fixed file
    'annotated_fusion.txt.tmp' in the current working directory.

    secondary_flag: when true, isoforms whose boundaries the junction
        exceeds by more than 10 positions are still examined, and for
        junctions without any regular annotation one line per gene is
        written from the left/right "secondary" exon indices reported by
        map_fusion_to_iso.
    denovo_flag: when true, junctions are also overlapped with the novel
        genes returned by parse_ref.

    Nothing is returned; the count of junctions that received a regular
    annotation is printed (secondary-only lines are not counted).
    """
    print('Start to annotate fusion junctions...')
    # gene annotations
    genes, novel_genes, gene_info, chrom_info = parse_ref(ref_f, 1)
    fusions, fusion_index = parse_bed(junc_bed)  # fusion junctions
    annotated = set()
    with open('annotated_fusion.txt.tmp', 'w') as outf:
        for chrom in chrom_info:
            hits = []
            # annotated genes
            if chrom in genes:
                hits += Interval.overlapwith(genes[chrom].interval,
                                             fusions[chrom])
            # novel genes, only in denovo mode
            if denovo_flag and chrom in novel_genes:
                hits += Interval.overlapwith(novel_genes[chrom].interval,
                                             fusions[chrom])
            for hit in hits:
                # isoform ids are the entries (from index 2) starting with
                # 'iso'; the entries after them are junction ids
                isoforms = [tag for tag in hit[2:] if tag.startswith('iso')]
                for junction in hit[(2 + len(isoforms)):]:
                    read_count = junction.split()[1]
                    junc_start, junc_end = fusion_index[junction]
                    junc_loc = '%s\t%d\t%d\tFUSIONJUNC/%s' % (
                        chrom, junc_start, junc_end, read_count)
                    edge_beds = []  # first or last exon matches
                    # secondary exons, keyed by 'gene:strand'
                    left_exons = {}
                    right_exons = {}
                    annotated_count = 0
                    for iso_id in isoforms:
                        gene, isoform, _, strand = iso_id.split()[1:]
                        exons = gene_info[iso_id]
                        out_of_bounds = (junc_start < exons[0][0] - 10 or
                                         junc_end > exons[-1][-1] + 10)
                        # out-of-bounds isoforms are only kept when
                        # secondary annotation is requested
                        if out_of_bounds and not secondary_flag:
                            continue
                        fusion_info, index, edge, secondary = \
                            map_fusion_to_iso(junc_start, junc_end, strand,
                                              exons)
                        if fusion_info:
                            annotated_count += 1
                            # columns 7 and 8 both hold the junction start
                            bed = '\t'.join([junc_loc, '0', strand,
                                             str(junc_start),
                                             str(junc_start), '0,0,0',
                                             fusion_info, gene, isoform,
                                             index])
                            if edge:
                                edge_beds.append(bed)
                            else:
                                outf.write(bed + '\n')
                                annotated.add(junction)
                        elif secondary_flag and secondary is not None:
                            left_idx, right_idx = secondary
                            gene_key = ':'.join([gene, strand])
                            if left_idx is not None:
                                left_exons[gene_key] = ':'.join(
                                    [isoform, str(left_idx)])
                            if right_idx is not None:
                                right_exons[gene_key] = ':'.join(
                                    [isoform, str(right_idx)])
                    if edge_beds:
                        for bed in edge_beds:
                            outf.write(bed + '\n')
                        annotated.add(junction)
                    if secondary_flag and not annotated_count:
                        # only genes with both a left and a right
                        # secondary exon are reported
                        for gene_key, left in left_exons.items():
                            if gene_key not in right_exons:
                                continue
                            right = right_exons[gene_key]
                            gene, strand = gene_key.split(':')
                            # strand is appended per gene to avoid
                            # duplicated locations
                            loc_with_strand = junc_loc + '\t0\t%s' % strand
                            outf.write('%s\t%s:%s\t%s:%s\n' %
                                       (loc_with_strand, gene, left,
                                        gene, right))
    print('Annotated %d fusion junctions!' % len(annotated))
def extract_retained_intron(denovo_dir, tophat_dir, pAplus_dir, output_dir):
    """
    Check each intron and fetch PIR

    Modified from Braunschweig et al., Genome Research, 2014, gr-177790.

    Introns are the gaps between consecutive exon blocks of the records in
    <denovo_dir>/circularRNA_full.txt whose gene name does not start with
    'CUFF'. 'ciRNA' records are not checked. An intron is dropped when it
    overlaps an excluded region, i.e. an exon block of a non-'CUFF' record
    or a whole 'ciRNA' record. When the same intron occurs in several
    records, the gene/isoform/reads of the record with the most reads is
    kept.

    For every remaining intron, read support is collected from
    junctions.bed and accepted_hits.bam of both tophat_dir (pAminus) and
    pAplus_dir (pAplus), and one tab-separated line is written to
    <output_dir>/all_intron_info.txt:

        chrom, start, end, 'Intron', '0', strand, gene, isoform, reads,
        PIR(pAminus), PIR(pAplus), p(pAminus), p(pAplus),
        then boundary / junction / intron read counts for pAminus and
        the same three counts for pAplus

    All numeric statistics are converted to float and rounded to 3
    decimals. Both BAM files are closed before the function returns, also
    when an error occurs. Nothing is returned.
    """

    def _support(junc, bam, chrom, sta, end, junc_key):
        # Return (PIR, p-value, boundary reads, junction reads, intron
        # reads) for one intron in one sample.
        junc_read = junc[junc_key]
        # reads fetched from the 16-position windows around both ends
        left_read = fetch_read(bam, chrom, sta - 8, sta + 8)
        right_read = fetch_read(bam, chrom, end - 8, end + 8)
        ri_read = left_read + right_read
        intron_read = fetch_read(bam, chrom, sta, end, flag=0)
        if ri_read == 0 and junc_read == 0:
            pir = 0  # no evidence at all; also avoids a zero division
        else:
            pir = 100.0 * ri_read / (ri_read + 2 * junc_read)
        # one-sided exact binomial test on the least vs. most supported
        # of the three read counts
        m = min(left_read, right_read, intron_read)
        n = m + max(left_read, right_read, intron_read)
        pvalue = binom.cdf(m, n, 1 / 3.5)
        return pir, pvalue, ri_read, junc_read, intron_read

    print('Start to parse circular RNA introns...')
    # set path
    fusion_f = '%s/circularRNA_full.txt' % denovo_dir
    pAminus_junc = parse_junc(tophat_dir + '/junctions.bed')
    pAminus_bam = pysam.AlignmentFile(tophat_dir + '/accepted_hits.bam',
                                      'rb')
    pAplus_bam = None
    try:
        pAplus_junc = parse_junc('%s/junctions.bed' % pAplus_dir)
        pAplus_bam = pysam.AlignmentFile(
            '%s/accepted_hits.bam' % pAplus_dir, 'rb')

        excluded_region = defaultdict(list)
        introns = defaultdict(list)
        intron_info_list = {}  # intron -> [gene, isoform, reads]
        with open(fusion_f, 'r') as f:
            for line in f:
                fields = line.split()  # split once per line
                chrom, start, end = fields[:3]
                start = int(start)
                end = int(end)
                strand = fields[5]
                circ_type = fields[13]
                if circ_type == 'ciRNA':  # not check ciRNAs
                    excluded_region[chrom].append([start, end])
                    continue
                sizes = [int(x) for x in fields[10].split(',')]
                offsets = [int(x) for x in fields[11].split(',')]
                reads = fields[12]
                gene, iso = fields[14:16]
                if gene.startswith('CUFF'):  # only check annotated introns
                    continue
                for size, offset in zip(sizes, offsets):
                    excluded_region[chrom].append(
                        [start + offset, start + offset + size])
                num = int(fields[9])
                for i in range(num - 1):
                    intron_sta = start + offsets[i] + sizes[i]
                    intron_end = start + offsets[i + 1]
                    if intron_end - intron_sta == 0:
                        continue
                    intron_info = '%s\t%d\t%d\t%s' % (chrom, intron_sta,
                                                      intron_end, strand)
                    if intron_info in intron_info_list:
                        # keep the record with the highest read count
                        known_reads = intron_info_list[intron_info][2]
                        if int(reads) > int(known_reads):
                            intron_info_list[intron_info] = [gene, iso,
                                                             reads]
                        continue
                    introns[chrom].append([intron_sta, intron_end,
                                           intron_info])
                    intron_info_list[intron_info] = [gene, iso, reads]

        intron_set = set()
        for chrom in excluded_region:
            # retain all intron regions in this step; fresh lists are
            # built so the stored intron records are not handed to
            # Interval.overlapwith
            intron_region = []
            for region in introns[chrom]:
                for intron_info in region[2:]:
                    _, sta, end = intron_info.split()[:3]
                    intron_region.append([int(sta), int(end), intron_info])
                    intron_set.add(intron_info)
            # remove introns overlapped with annotated exons
            combined_region = Interval(excluded_region[chrom]).interval
            for region in Interval.overlapwith(combined_region,
                                               intron_region):
                for intron_info in region[2:]:
                    intron_set.discard(intron_info)

        output_f = '%s/all_intron_info.txt' % output_dir
        with open(output_f, 'w') as output:
            total_i_n = len(intron_set)
            finished_n = 0
            for intron_key in intron_set:
                chrom, sta, end, strand = intron_key.split()
                junc_key = '\t'.join([chrom, sta, end])
                sta = int(sta)
                end = int(end)
                # circular RNAs (pAminus sample)
                (pir_circ, p1, circ_ri_read, circ_junc_read,
                 circ_intron_read) = _support(pAminus_junc, pAminus_bam,
                                              chrom, sta, end, junc_key)
                # linear RNAs (pAplus sample)
                (pir_linear, p2, linear_ri_read, linear_junc_read,
                 linear_intron_read) = _support(pAplus_junc, pAplus_bam,
                                                chrom, sta, end, junc_key)
                info = '\t'.join(str(round(float(x), 3))
                                 for x in (pir_circ, pir_linear, p1, p2,
                                           circ_ri_read, circ_junc_read,
                                           circ_intron_read,
                                           linear_ri_read,
                                           linear_junc_read,
                                           linear_intron_read))
                other_info = '\t'.join(intron_info_list[intron_key])
                output.write('\t'.join([chrom, str(sta), str(end),
                                        'Intron', '0', strand,
                                        other_info, info]))
                output.write('\n')
                finished_n += 1
                sys.stdout.write("Progress: %d/%d \r" % (finished_n,
                                                         total_i_n))
                sys.stdout.flush()
    finally:
        # release the BAM handles even if parsing or writing failed
        pAminus_bam.close()
        if pAplus_bam is not None:
            pAplus_bam.close()
    print('Complete parsing circular RNA introns!')
def extract_retained_intron(denovo_dir, tophat_dir, pAplus_dir):
    """
    Check each intron and fetch PIR

    Modified from Braunschweig et al., Genome Research, 2014, gr-177790.

    Introns are the gaps between consecutive exon blocks of the records in
    <denovo_dir>/circ_fusion.txt whose gene name does not start with
    'CUFF'. 'ciRNA' records are not checked. Only introns overlapping the
    exon blocks of 'CUFF' (novel) records are retained, and of those, the
    ones overlapping an excluded region (an exon block of a non-'CUFF'
    record or a whole 'ciRNA' record) are dropped again. When the same
    intron occurs in several records, the gene/isoform/reads of the record
    with the most reads is kept.

    For every remaining intron, read support is collected from
    junctions.bed and accepted_hits.bam of both tophat_dir (pAminus) and
    pAplus_dir (pAplus), and one tab-separated line is written to
    <denovo_dir>/all_intron_info.txt:

        chrom, start, end, 'Intron', '0', strand, gene, isoform, reads,
        PIR(pAminus), PIR(pAplus), p(pAminus), p(pAplus),
        then boundary / junction / intron read counts for pAminus and
        the same three counts for pAplus

    Statistics are rounded to 3 decimals. Both BAM files are closed before
    the function returns, also when an error occurs. Nothing is returned.
    """

    def _support(junc, bam, chrom, sta, end, junc_key):
        # Return (PIR, p-value, boundary reads, junction reads, intron
        # reads) for one intron in one sample.
        junc_read = junc[junc_key]
        # reads fetched from the 16-position windows around both ends
        left_read = fetch_read(bam, chrom, sta - 8, sta + 8)
        right_read = fetch_read(bam, chrom, end - 8, end + 8)
        ri_read = left_read + right_read
        intron_read = fetch_read(bam, chrom, sta, end, flag=0)
        if ri_read == 0 and junc_read == 0:
            pir = 0  # no evidence at all; also avoids a zero division
        else:
            pir = 100.0 * ri_read / (ri_read + 2 * junc_read)
        # one-sided exact binomial test on the least vs. most supported
        # of the three read counts
        m = min(left_read, right_read, intron_read)
        n = m + max(left_read, right_read, intron_read)
        pvalue = binom.cdf(m, n, 1 / 3.5)
        return pir, pvalue, ri_read, junc_read, intron_read

    print('Start to parse circular RNA introns...')
    # set path
    fusion_f = '%s/circ_fusion.txt' % denovo_dir
    pAminus_junc = parse_junc(tophat_dir + '/junctions.bed')
    pAminus_bam = pysam.AlignmentFile(tophat_dir + '/accepted_hits.bam',
                                      'rb')
    pAplus_bam = None
    try:
        pAplus_junc = parse_junc('%s/junctions.bed' % pAplus_dir)
        pAplus_bam = pysam.AlignmentFile(
            '%s/accepted_hits.bam' % pAplus_dir, 'rb')

        excluded_region = defaultdict(list)
        novel_region = defaultdict(list)
        introns = defaultdict(list)
        intron_info_list = {}  # intron -> [gene, isoform, reads]
        with open(fusion_f, 'r') as f:
            for line in f:
                fields = line.split()  # split once per line
                chrom, start, end = fields[:3]
                start = int(start)
                end = int(end)
                strand = fields[5]
                circ_type = fields[13]
                if circ_type == 'ciRNA':  # not check ciRNAs
                    excluded_region[chrom].append([start, end])
                    continue
                sizes = [int(x) for x in fields[10].split(',')]
                offsets = [int(x) for x in fields[11].split(',')]
                reads = fields[12]
                gene, iso = fields[14:16]
                is_novel = gene.startswith('CUFF')
                exon_store = novel_region if is_novel else excluded_region
                for size, offset in zip(sizes, offsets):
                    exon_store[chrom].append(
                        [start + offset, start + offset + size])
                if is_novel:  # only check annotated introns
                    continue
                num = int(fields[9])
                for i in range(num - 1):
                    intron_sta = start + offsets[i] + sizes[i]
                    intron_end = start + offsets[i + 1]
                    if intron_end - intron_sta == 0:
                        continue
                    intron_info = '%s\t%d\t%d\t%s' % (chrom, intron_sta,
                                                      intron_end, strand)
                    if intron_info in intron_info_list:
                        # keep the record with the highest read count
                        known_reads = intron_info_list[intron_info][2]
                        if int(reads) > int(known_reads):
                            intron_info_list[intron_info] = [gene, iso,
                                                             reads]
                        continue
                    introns[chrom].append([intron_sta, intron_end,
                                           intron_info])
                    intron_info_list[intron_info] = [gene, iso, reads]

        intron_set = set()
        for chrom in excluded_region:
            intron_region = []
            # retain introns covered by novel assembled transcripts
            combined_region = Interval(novel_region[chrom]).interval
            for region in Interval.overlapwith(combined_region,
                                               introns[chrom]):
                # entries from index 2 on are the overlapping introns
                for intron_info in region[2:]:
                    _, sta, end = intron_info.split()[:3]
                    intron_region.append([int(sta), int(end), intron_info])
                    intron_set.add(intron_info)
            # remove introns overlapped with annotated exons
            combined_region = Interval(excluded_region[chrom]).interval
            for region in Interval.overlapwith(combined_region,
                                               intron_region):
                for intron_info in region[2:]:
                    intron_set.discard(intron_info)

        output_f = '%s/all_intron_info.txt' % denovo_dir
        with open(output_f, 'w') as output:
            for intron_key in intron_set:
                chrom, sta, end, strand = intron_key.split()
                junc_key = '\t'.join([chrom, sta, end])
                sta = int(sta)
                end = int(end)
                # circular RNAs (pAminus sample)
                (pir_circ, p1, circ_ri_read, circ_junc_read,
                 circ_intron_read) = _support(pAminus_junc, pAminus_bam,
                                              chrom, sta, end, junc_key)
                # linear RNAs (pAplus sample)
                (pir_linear, p2, linear_ri_read, linear_junc_read,
                 linear_intron_read) = _support(pAplus_junc, pAplus_bam,
                                                chrom, sta, end, junc_key)
                info = '\t'.join(str(round(x, 3))
                                 for x in (pir_circ, pir_linear, p1, p2,
                                           circ_ri_read, circ_junc_read,
                                           circ_intron_read,
                                           linear_ri_read,
                                           linear_junc_read,
                                           linear_intron_read))
                other_info = '\t'.join(intron_info_list[intron_key])
                output.write('\t'.join([chrom, str(sta), str(end),
                                        'Intron', '0', strand,
                                        other_info, info]))
                output.write('\n')
    finally:
        # release the BAM handles even if parsing or writing failed
        pAminus_bam.close()
        if pAplus_bam is not None:
            pAplus_bam.close()
    print('Complete parsing circular RNA introns!')