def get_transcript_from_gene_pred(in_gene_pred, bam_fn=""):
    # collect per-gene exon coordinate strings from a genePred annotation file
    trans = dd(lambda: [])
    ut.err_format_time('get_transcript_from_gene_pred', 'Loading transcript from {} ... '.format(in_gene_pred))
    header_ele = ['transID', 'chrom', 'strand', 'transStart', 'transEnd', 'cdsStart', 'cdsEnd',
                  'blockCount', 'exonStarts', 'exonEnds', 'score', 'geneID',
                  'cdsStartStatus', 'cdsEndStatus', 'exonFrame']
    pred_header = {header_ele[i]: i for i in range(len(header_ele))}
    # bam = ps.AlignmentFile(bam_fn) if bam_fn else None
    with open(in_gene_pred, 'r') as pred:
        for line in pred:
            line = line.rstrip()
            if len(line) < 1:
                continue
            if line.startswith('#'):
                continue
            ele = line.split('\t')
            # chrom = ele[pred_header['chrom']]
            # strand = ele[pred_header['strand']]
            start_array = ele[pred_header['exonStarts']].split(',')
            end_array = ele[pred_header['exonEnds']].split(',')
            gene_id = ele[pred_header['geneID']]
            if '' in start_array:
                start_array.remove('')
            if '' in end_array:
                end_array.remove('')
            exon_start = [int(i) + 1 for i in start_array]  # 0-based starts -> 1-based
            exon_end = [int(i) for i in end_array]
            # tid = bam.get_tid(chrom) if bam else chrom
            # is_rev = strand == '-'
            coor = []
            for s, e in zip(exon_start, exon_end):
                coor.extend([s, e])
            trans[gene_id].append('_'.join(map(str, coor)))
    ut.err_format_time('get_transcript_from_gene_pred', 'Loading transcript from {} done!'.format(in_gene_pred))
    # if bam: bam.close()
    return trans
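
# The sketch below is illustrative only (not part of the pipeline): it walks one
# hypothetical genePred row through the same exonStarts/exonEnds parsing used above,
# showing how the comma-separated 0-based exon starts become 1-based in the joined
# coordinate string. The row content and function name are made up for illustration.
def _example_gene_pred_coordinates():
    row = 'ENST0001\tchr1\t+\t1000\t2000\t1000\t2000\t2\t1000,1500,\t1200,2000,\t0\tENSG0001\tcmpl\tcmpl\t0,0,'
    ele = row.split('\t')
    starts = [s for s in ele[8].split(',') if s]  # ['1000', '1500']
    ends = [e for e in ele[9].split(',') if e]    # ['1200', '2000']
    exon_start = [int(s) + 1 for s in starts]     # [1001, 1501]
    exon_end = [int(e) for e in ends]             # [1200, 2000]
    coor = []
    for s, e in zip(exon_start, exon_end):
        coor.extend([s, e])
    return '_'.join(map(str, coor))               # '1001_1200_1501_2000'
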
def bam_classify(in_bam_fn, high_bam_fn, low_bam_fn,
                 high_max_ratio=high_max_ratio, high_min_ratio=high_min_ratio,
                 high_iden_ratio=high_iden_ratio, high_repeat_ratio=high_repeat_ratio,
                 low_repeat_ratio=low_repeat_ratio):
    ut.err_format_time('classify_bam_core', 'Processing {} ... '.format(in_bam_fn))
    with ps.AlignmentFile(in_bam_fn) as in_bam, \
            ps.AlignmentFile(high_bam_fn, 'wb', template=in_bam) as high_bam, \
            ps.AlignmentFile(low_bam_fn, 'wb', template=in_bam) as low_bam:
        cnt = 0
        r_array = []
        r_name = ''
        for r in in_bam:
            if pb.is_unmapped(r):
                continue
            if r.query_name != r_name:
                cla_record(r_array, high_bam, low_bam, high_max_ratio, high_min_ratio,
                           high_iden_ratio, high_repeat_ratio, low_repeat_ratio)
                r_name = r.query_name
                r_array = [r]
            else:
                r_array.append(r)
            cnt += 1
            if cnt % 100000 == 0:
                ut.err_format_time('classify_bam_core', '{} BAM records done ... '.format(cnt))
        if r_array:
            cla_record(r_array, high_bam, low_bam, high_max_ratio, high_min_ratio,
                       high_iden_ratio, high_repeat_ratio, low_repeat_ratio)
    ut.err_format_time('classify_bam_core', 'Processing {} done.'.format(in_bam_fn))
    return high_bam_fn, low_bam_fn
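
# Illustrative sketch (toy data, not part of the pipeline): the same group-by-name
# pattern bam_classify() uses, shown on a plain list of (query_name, value) pairs.
# It assumes records with the same name are adjacent, as in a name-grouped BAM, and
# flushes the last group after the loop just like the final cla_record() call above.
def _example_group_by_query_name():
    records = [('read1', 0.9), ('read1', 0.4), ('read2', 0.8), ('read3', 0.7), ('read3', 0.2)]
    batch, cur_name = [], None
    for name, val in records:
        if name != cur_name:
            if batch:
                yield cur_name, batch
            cur_name, batch = name, [(name, val)]
        else:
            batch.append((name, val))
    if batch:  # flush the last group
        yield cur_name, batch
# e.g. list(_example_group_by_query_name()) yields one batch per read name.
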
def isocirc_comp_core(args):
    a_fn = args.a_input
    a_type = args.a_type
    a_dict = get_input(a_fn, a_type, args.detailed)
    b_fn = args.b_input
    b_type = args.b_type
    b_dict = get_input(b_fn, b_type, args.detailed)
    if args.overlap:
        ut.err_format_time('get_overlap', '{} {} ... '.format(a_fn, b_fn))
        if args.detailed:
            get_detailed_overlap(a_dict, b_dict, args.back_dis, args.inter_dis, args.overlap_entry, args.overlap)
        else:
            get_overlap(a_dict, b_dict, args.back_dis, args.overlap_entry, args.overlap)
        ut.err_format_time('get_overlap', '{} {} done!'.format(a_fn, b_fn))
    if args.a_only:
        ut.err_format_time('get_A_only', '{} {} ... '.format(a_fn, b_fn))
        get_only(a_dict, b_dict, args.back_dis, args.a_only)
        ut.err_format_time('get_A_only', '{} {} done!'.format(a_fn, b_fn))
    if args.b_only:
        ut.err_format_time('get_B_only', '{} {} ... '.format(a_fn, b_fn))
        get_only(b_dict, a_dict, args.back_dis, args.b_only)
        ut.err_format_time('get_B_only', '{} {} done!'.format(a_fn, b_fn))
    return
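
# Illustrative usage sketch: build the attributes isocirc_comp_core() reads and call it
# directly, bypassing the command-line parser. The file names, input types, distances,
# and overlap entry shown here are hypothetical placeholders, not documented values.
def _example_isocirc_comp_call():
    from argparse import Namespace
    args = Namespace(
        a_input='a.circRNA.out', a_type='isocirc',   # placeholder path/type
        b_input='b.circRNA.bed', b_type='bed',       # placeholder path/type
        detailed=False,
        back_dis=10, inter_dis=10,                   # placeholder matching tolerances
        overlap_entry='BSJ',                         # placeholder
        overlap='overlap.out', a_only='a_only.out', b_only='b_only.out')
    isocirc_comp_core(args)
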
def stats_core(long_read_len, cons_info, cons_bam, isoform_out, all_bsj_stats_dict, stats_out):
    # basic stats of read/cons
    tot_read_n, tot_base, tot_read_cons_n, tot_cons_n, tot_cons_base, tot_map_cons_n, tot_map_cons_base = 0, 0, 0, 0, 0, 0, 0
    with open(long_read_len, 'r') as in_fp:
        for line in in_fp:
            [name, read_len] = line.rsplit()
            read_len = int(read_len)
            tot_read_n += 1
            tot_base += read_len
    tot_map_read_n, tot_map_cons_n, tot_map_cons_base = get_map_stats(cons_bam)
    with open(cons_info) as in_fp:
        cons_names = dict()
        for line in in_fp:
            ele = line.rsplit()
            cons_name = ele[0].rsplit('_cons')[0]
            cons_names[cons_name] = 1
            tot_cons_n += 1
            tot_cons_base += int(ele[2])
        tot_read_cons_n = len(cons_names)

    # detailed stats of circRNA
    # tot_known_circRNA
    tot_isoform, tot_bsj, tot_known_bsj, tot_circRNA_read_n, tot_iso_with_known_bsj, tot_known_bsj_read_n, tot_iso_with_cano_bsj, tot_cano_bsj_read_n = 0, 0, dd(lambda: 0), 0, 0, 0, 0, 0
    tot_iso_with_cano_sj, tot_read_with_cano_sj = 0, 0
    tot_iso_with_high_sj, tot_iso_with_known_ss, tot_iso_with_high_sj_known_ss = 0, 0, 0
    tot_read_with_high_sj, tot_read_with_known_ss, tot_read_with_high_sj_known_ss = 0, 0, 0
    tot_full_iso, tot_full_read = 0, 0
    tot_full_iso_bsj_fsm_iso, tot_full_iso_bsj_fsm_read, tot_full_iso_bsj_nic_iso, tot_full_iso_bsj_nic_read, tot_full_iso_bsj_nnc_iso, tot_full_iso_bsj_nnc_read = 0, 0, 0, 0, 0, 0
    tot_full_iso_int_fsm_iso, tot_full_iso_int_fsm_read, tot_full_iso_int_nic_iso, tot_full_iso_int_nic_read, tot_full_iso_int_nnc_iso, tot_full_iso_int_nnc_read = 0, 0, 0, 0, 0, 0
    bsj_dict = dict()
    iso_dict = dict()
    full_iso = dict()
    # non_full_iso = dict()
    with open(isoform_out) as in_fp:
        for line in in_fp:
            if line.startswith('#'):
                continue
            ele = line.rsplit()
            read_cnt = int(ele[idx['readCount']])
            bsj = (ele[idx['chrom']], ele[idx['startCoor0based']], ele[idx['endCoor']])
            if bsj not in bsj_dict:
                bsj_dict[bsj] = 1
                tot_bsj += 1
                for i, known_bsj in enumerate(ele[idx['isKnownBSJ']].rsplit(',')):
                    tot_known_bsj[i] += (known_bsj == 'True')
                tot_known_bsj[i + 1] += ('False' not in ele[idx['isKnownBSJ']])
            tot_circRNA_read_n += read_cnt
            iso = (ele[idx['chrom']], ele[idx['startCoor0based']], ele[idx['endCoor']],
                   ele[idx['blockCount']], ele[idx['blockSize']], ele[idx['blockStarts']])
            # is_full = ele[idx['isFullLength']] == 'True'
            # if is_full and iso in non_full_iso:
            #     print('Full\t{}'.format(ele[0]))
            # if not is_full and iso in full_iso:
            #     print('Non-full\t{}'.format(ele[0]))
            # if is_full: full_iso[iso] = 1
            # else: non_full_iso[iso] = 1
            if iso not in iso_dict:
                isoform_inc_cnt = 1
                iso_dict[iso] = 1
            else:
                isoform_inc_cnt = 0
            tot_isoform += isoform_inc_cnt
            if 'False' not in ele[idx['isKnownSS']]:
                tot_read_with_known_ss += read_cnt
                tot_iso_with_known_ss += isoform_inc_cnt
            # if 'False' not in ele[idx['isCanoSJ']]:
            #     tot_iso_with_cano_sj += isoform_inc_cnt
            #     tot_read_with_cano_sj += read_cnt
            if 'False' not in ele[idx['isHighFSJ']]:
                tot_iso_with_high_sj += isoform_inc_cnt
                tot_read_with_high_sj += read_cnt
                if 'False' not in ele[idx['isKnownSS']]:
                    tot_iso_with_high_sj_known_ss += isoform_inc_cnt
                    tot_read_with_high_sj_known_ss += read_cnt
            if ele[idx['isFullLength']] == 'True':
                full_iso[iso] = 1
                tot_full_iso += isoform_inc_cnt
                tot_full_read += read_cnt
                if ele[idx['BSJCate']] == 'FSM':
                    tot_full_iso_bsj_fsm_iso += isoform_inc_cnt
                    tot_full_iso_bsj_fsm_read += read_cnt
                elif ele[idx['BSJCate']] == 'NIC':
                    tot_full_iso_bsj_nic_iso += isoform_inc_cnt
                    tot_full_iso_bsj_nic_read += read_cnt
                else:
                    tot_full_iso_bsj_nnc_iso += isoform_inc_cnt
                    tot_full_iso_bsj_nnc_read += read_cnt
                if ele[idx['FSJCate']] == 'FSM':
                    tot_full_iso_int_fsm_iso += isoform_inc_cnt
                    tot_full_iso_int_fsm_read += read_cnt
                elif ele[idx['FSJCate']] == 'NIC':
                    tot_full_iso_int_nic_iso += isoform_inc_cnt
                    tot_full_iso_int_nic_read += read_cnt
                else:
                    tot_full_iso_int_nnc_iso += isoform_inc_cnt
                    tot_full_iso_int_nnc_read += read_cnt

    ut.err_format_time('basic_stats_core', 'Writing basic stats to file ... ')
    with open(stats_out, 'w') as out:
        out.write('#' + __program__ + '\t' + __version__ + '\n')
        out.write('# cons=consensus sequence, high=high-confidence, cano=canonical, BSJ=back-splice junction, FSJ=forward-splice junction, SS=splice site\n')
        out.write('1_Total_reads\t{:,}\n'.format(tot_read_n))
        # out.write('Total_base\t{:,}\n'.format(tot_base))
        out.write('2_Total_reads_with_cons\t{:,}\n'.format(tot_read_cons_n))
        out.write('3_Total_mappable_reads_with_cons\t{:,}\n'.format(tot_map_read_n))
        out.write('4_Total_reads_with_candidate_BSJs\t{:,}\n'.format(all_bsj_stats_dict['read_with_bsj_n']))
        out.write('5_Total_candidate_BSJs\t{:,}\n'.format(all_bsj_stats_dict['bsj_n']))
        if len(all_bsj_stats_dict['known_bsj_n']) > 2:
            for i in all_bsj_stats_dict['known_bsj_n']:
                bsj_idx = 'all' if i == len(all_bsj_stats_dict['known_bsj_n']) - 1 else i
                out.write('6_Total_candidate_BSJs_known_in_{}\t{:,}\n'.format(bsj_idx, all_bsj_stats_dict['known_bsj_n'][i]))
        else:
            out.write('6_Total_known_candidate_BSJs\t{:,}\n'.format(all_bsj_stats_dict['known_bsj_n'][0]))
        out.write('7_Total_reads_with_high_confidence_BSJs\t{:,}\n'.format(tot_circRNA_read_n))
        out.write('8_Total_high_confidence_BSJs\t{:,}\n'.format(tot_bsj))
        if len(tot_known_bsj) > 2:
            for i in tot_known_bsj:
                bsj_idx = 'all' if i == len(tot_known_bsj) - 1 else i
                out.write('9_Total_high_confidence_BSJs_known_in_{}\t{:,}\n'.format(bsj_idx, tot_known_bsj[i]))
        else:
            out.write('9_Total_known_high_confidence_BSJs\t{:,}\n'.format(tot_known_bsj[0]))
        out.write('10_Total_isoforms_with_high_BSJs\t{:,}\n'.format(tot_isoform))
        # out.write('11_Total_isoforms_with_high_BSJs_cano_SJs\t{:,}\n'.format(tot_iso_with_cano_sj))
        out.write('11_Total_isoforms_with_high_BSJs_high_FSJs\t{:,}\n'.format(tot_iso_with_high_sj))
        out.write('12_Total_isoforms_with_high_BSJ_known_SSs\t{:,}\n'.format(tot_iso_with_known_ss))
        out.write('13_Total_isoforms_with_high_BSJs_high_FSJs_known_SSs\t{:,}\n'.format(tot_iso_with_high_sj_known_ss))
        out.write('14_Total_full_length_isoforms\t{:,}\n'.format(len(full_iso)))  # tot_full_iso
        out.write('15_Total_reads_for_full_length_isoforms\t{:,}\n'.format(tot_full_read))
        # FSM/NIC/NNC
        out.write('16_Total_full_length_isoforms_with_FSM_BSJ\t{:,}\n'.format(tot_full_iso_bsj_fsm_iso))
        out.write('17_Total_reads_for_full_length_isoforms_with_FSM_BSJ\t{:,}\n'.format(tot_full_iso_bsj_fsm_read))
        out.write('18_Total_full_length_isoforms_with_NIC_BSJ\t{:,}\n'.format(tot_full_iso_bsj_nic_iso))
        out.write('19_Total_reads_for_full_length_isoforms_with_NIC_BSJ\t{:,}\n'.format(tot_full_iso_bsj_nic_read))
        out.write('20_Total_full_length_isoforms_with_NNC_BSJ\t{:,}\n'.format(tot_full_iso_bsj_nnc_iso))
        out.write('21_Total_reads_for_full_length_isoforms_with_NNC_BSJ\t{:,}\n'.format(tot_full_iso_bsj_nnc_read))
        out.write('22_Total_full_length_isoform_with_FSM_FSJ\t{:,}\n'.format(tot_full_iso_int_fsm_iso))
        out.write('23_Total_reads_full_length_isoforms_with_FSM_FSJ\t{:,}\n'.format(tot_full_iso_int_fsm_read))
        out.write('24_Total_full_length_isoforms_with_NIC_FSJ\t{:,}\n'.format(tot_full_iso_int_nic_iso))
        out.write('25_Total_reads_for_full_length_isoforms_with_NIC_FSJ\t{:,}\n'.format(tot_full_iso_int_nic_read))
        out.write('26_Total_full_length_isoforms_with_NNC_FSJ\t{:,}\n'.format(tot_full_iso_int_nnc_iso))
        out.write('27_Total_reads_for_full_length_isoforms_with_NNC_FSJ\t{:,}\n'.format(tot_full_iso_int_nnc_read))
    ut.err_format_time('basic_stats_core', 'Writing basic stats to file done!')
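
# Illustrative sketch (toy data, not part of the pipeline): the single-pass counting
# pattern stats_core() uses for isoform rows. Each isoform key is counted once via a
# dict lookup (isoform_inc_cnt above), while supporting read counts accumulate on
# every row, so isoform totals and read totals come out of the same loop.
def _example_unique_isoform_counting():
    rows = [('iso_a', 5), ('iso_b', 2), ('iso_a', 3)]  # (isoform key, read count), hypothetical
    seen, tot_iso, tot_reads = dict(), 0, 0
    for iso, read_cnt in rows:
        inc = 0 if iso in seen else 1
        seen[iso] = 1
        tot_iso += inc          # counts iso_a only once
        tot_reads += read_cnt   # counts every supporting read
    return tot_iso, tot_reads   # (2, 10)
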
def restore_gff_db(gtf_fn):
    gtf_db = None
    if gtf_fn is not None:
        gtf_db_fn = gtf_fn + '.gffdb'
        if not os.path.isfile(gtf_db_fn):
            try:
                # check if 'gene' or 'transcript' is in GTF
                disable_gene, disable_trans = False, False
                with open(gtf_fn) as fp:
                    l = 0
                    for line in fp:
                        line = line.rstrip()
                        if len(line) < 1:
                            continue
                        if line[0] != '#':
                            if line.split()[2] == 'gene':
                                disable_gene = True
                            elif line.split()[2] == 'transcript':
                                disable_trans = True
                            l += 1
                            if (disable_gene and disable_trans) or l == 100:
                                break
                ut.err_format_time('restore_gtf_db', 'Creating GTF databases for {} ...'.format(gtf_fn))
                gtf_db = gu.create_db(gtf_fn, gtf_db_fn,
                                      disable_infer_genes=disable_gene,
                                      disable_infer_transcripts=disable_trans)
                ut.err_format_time('restore_gtf_db', 'Creating GTF databases for {} done!'.format(gtf_fn))
            except:
                ut.err_format_time('restore_gtf_db', 'Error in parsing {}\nCheck if annotation file format is correct'.format(gtf_fn))
                sys.exit(IOError)
        else:
            try:
                ut.err_format_time('restore_gtf_db', 'Retrieving gff database for {} ...'.format(gtf_fn))
                gtf_db = gu.FeatureDB(gtf_db_fn)
                ut.err_format_time('restore_gtf_db', 'Retrieving gff database for {} done!'.format(gtf_fn))
            except:
                ut.err_format_time('restore_gtf_db', 'Error in parsing {}\nTry to remove this db file and re-run'.format(gtf_db_fn))
                sys.exit(IOError)
    return gtf_db
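
# Illustrative usage sketch (hypothetical GTF path): build or load the gffutils database
# once, then query it. features_of_type() and the Feature attributes used here are
# standard gffutils FeatureDB/Feature API.
def _example_restore_gff_db_usage():
    gtf_db = restore_gff_db('anno.gtf')  # creates 'anno.gtf.gffdb' on first run, reuses it afterwards
    if gtf_db is not None:
        for trans in gtf_db.features_of_type('transcript'):
            print(trans.id, trans.seqid, trans.start, trans.end)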