Example #1
def get_transcript_from_gene_pred(in_gene_pred, bam_fn=""):
    trans = dd(lambda: [])
    ut.err_format_time('get_transcript_from_gene_pred',
                       'Loading transcript from {} ... '.format(in_gene_pred))
    header_ele = [
        'transID', 'chrom', 'strand', 'transStart', 'transEnd', 'cdsStart',
        'cdsEnd', 'blockCount', 'exonStarts', 'exonEnds', 'score', 'geneID',
        'cdsStartStatus', 'cdsEndStatus', 'exonFrame'
    ]
    pred_header = {header_ele[i]: i for i in range(len(header_ele))}
    # bam = ps.AlignmentFile(bam_fn) if bam_fn else None
    with open(in_gene_pred, 'r') as pred:
        for line in pred:
            line = line.rstrip()
            if len(line) < 1: continue
            if line.startswith('#'): continue
            ele = line.split('\t')
            # chrom = ele[bed_header['chrom']]
            # strand = ele[bed_header['strand']]
            start_array, end_array = ele[pred_header['exonStarts']].split(
                ','), ele[pred_header['exonEnds']].split(',')
            gene_id = ele[pred_header['geneID']]
            if '' in start_array: start_array.remove('')
            if '' in end_array: end_array.remove('')
            exon_start = [int(i) + 1 for i in start_array]
            exon_end = [int(i) for i in end_array]
            # tid = bam.get_tid(chrom) if bam else chrom
            # is_rev = strand == '-'
            coor = []
            for s, e in zip(exon_start, exon_end):
                coor.extend([s, e])
            trans[gene_id].append('_'.join(map(str, coor)))
    ut.err_format_time('get_transcript_from_gene_pred',
                       'Loading transcript from {} done!'.format(in_gene_pred))
    # if bam: bam.close()
    return trans
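
A minimal call sketch, assuming the surrounding module already imports `collections.defaultdict` as `dd` and the `ut` logging helper used above; the genePred file name is a placeholder:

from collections import defaultdict as dd  # assumed import, matching the dd() used above

# 'anno.genePred' is a placeholder path; each gene maps to a list of '_'-joined exon coordinate strings
trans = get_transcript_from_gene_pred('anno.genePred')
for gene_id, coor_strs in trans.items():
    print(gene_id, len(coor_strs), 'transcript(s)')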
Example #2
def bam_classify(in_bam_fn,
                 high_bam_fn,
                 low_bam_fn,
                 high_max_ratio=high_max_ratio,
                 high_min_ratio=high_min_ratio,
                 high_iden_ratio=high_iden_ratio,
                 high_repeat_ratio=high_repeat_ratio,
                 low_repeat_ratio=low_repeat_ratio):
    ut.err_format_time('classify_bam_core',
                       'Processing {} ... '.format(in_bam_fn))
    with ps.AlignmentFile(in_bam_fn) as in_bam, ps.AlignmentFile(high_bam_fn, 'wb', template=in_bam) as high_bam, \
            ps.AlignmentFile(low_bam_fn, 'wb', template=in_bam) as low_bam:
        cnt = 0
        r_array = []
        r_name = ''
        for r in in_bam:
            if pb.is_unmapped(r): continue
            if r.query_name != r_name:
                cla_record(r_array, high_bam, low_bam, high_max_ratio,
                           high_min_ratio, high_iden_ratio, high_repeat_ratio,
                           low_repeat_ratio)
                r_name = r.query_name
                r_array = [r]
            else:
                r_array.append(r)
            cnt += 1
            if cnt % 100000 == 0:
                ut.err_format_time('classify_bam_core',
                                   '{} BAM records done ... '.format(cnt))
        if r_array:
            cla_record(r_array, high_bam, low_bam, high_max_ratio,
                       high_min_ratio, high_iden_ratio, high_repeat_ratio,
                       low_repeat_ratio)
    ut.err_format_time('classify_bam_core',
                       'Processing {} done.'.format(in_bam_fn))
    return high_bam_fn, low_bam_fn
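
A call sketch, assuming `pysam` is imported as `ps` and the module-level default ratios referenced in the signature exist elsewhere in the source; the file names and threshold values below are placeholders, not the tool's real defaults:

# Illustrative thresholds and paths only; the real defaults come from
# module-level constants in the original source.
high_fn, low_fn = bam_classify('input.bam', 'high.bam', 'low.bam',
                               high_max_ratio=0.8, high_min_ratio=0.7,
                               high_iden_ratio=0.75, high_repeat_ratio=0.6,
                               low_repeat_ratio=0.9)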
Example #3
def isocirc_comp_core(args):
    a_fn = args.a_input
    a_type = args.a_type
    a_dict = get_input(a_fn, a_type, args.detailed)
    b_fn = args.b_input
    b_type = args.b_type
    b_dict = get_input(b_fn, b_type, args.detailed)
    if args.overlap:
        ut.err_format_time('get_overlap', '{} {} ... '.format(a_fn, b_fn))
        if args.detailed:
            get_detailed_overlap(a_dict, b_dict, args.back_dis, args.inter_dis,
                                 args.overlap_entry, args.overlap)
        else:
            get_overlap(a_dict, b_dict, args.back_dis, args.overlap_entry,
                        args.overlap)
        ut.err_format_time('get_overlap', '{} {} done!'.format(a_fn, b_fn))
    if args.a_only:
        ut.err_format_time('get_A_only', '{} {} ... '.format(a_fn, b_fn))
        get_only(a_dict, b_dict, args.back_dis, args.a_only)
        ut.err_format_time('get_A_only', '{} {} done!'.format(a_fn, b_fn))
    if args.b_only:
        ut.err_format_time('get_B_only', '{} {} ... '.format(a_fn, b_fn))
        get_only(b_dict, a_dict, args.back_dis, args.b_only)
        ut.err_format_time('get_B_only', '{} {} done!'.format(a_fn, b_fn))
    return
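
A sketch of driving this entry point without the CLI parser, assuming a namespace exposing the same attribute names read inside the function; every file name and distance value below is a placeholder:

import argparse

# Placeholder inputs; the attribute names mirror those read inside
# isocirc_comp_core, and the values are illustrative only.
args = argparse.Namespace(a_input='a.bed', a_type='bed',
                          b_input='b.bed', b_type='bed',
                          detailed=False, back_dis=500, inter_dis=10,
                          overlap='overlap.out', overlap_entry='all',
                          a_only='a_only.out', b_only='b_only.out')
isocirc_comp_core(args)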
Example #4
def stats_core(long_read_len, cons_info, cons_bam, isoform_out,
               all_bsj_stats_dict, stats_out):
    # basic stats of read/cons
    tot_read_n, tot_base, tot_read_cons_n, tot_cons_n, tot_cons_base, tot_map_cons_n, tot_map_cons_base = 0, 0, 0, 0, 0, 0, 0
    with open(long_read_len, 'r') as in_fp:
        for line in in_fp:
            [name, read_len] = line.rsplit()
            read_len = int(read_len)
            tot_read_n += 1
            tot_base += read_len
    tot_map_read_n, tot_map_cons_n, tot_map_cons_base = get_map_stats(cons_bam)
    with open(cons_info) as in_fp:
        cons_names = dict()
        for line in in_fp:
            ele = line.rsplit()
            cons_name = ele[0].rsplit('_cons')[0]
            cons_names[cons_name] = 1
            tot_cons_n += 1
            tot_cons_base += int(ele[2])
    tot_read_cons_n = len(cons_names)

    # detailed stats of circRNA
    # tot_known_circRNA
    tot_isoform, tot_bsj, tot_known_bsj, tot_circRNA_read_n, tot_iso_with_known_bsj, tot_known_bsj_read_n, tot_iso_with_cano_bsj, tot_cano_bsj_read_n = 0, 0, dd(
        lambda: 0), 0, 0, 0, 0, 0
    tot_iso_with_cano_sj, tot_read_with_cano_sj = 0, 0
    tot_iso_with_high_sj, tot_iso_with_known_ss, tot_iso_with_high_sj_known_ss = 0, 0, 0
    tot_read_with_high_sj, tot_read_with_known_ss, tot_read_with_high_sj_known_ss = 0, 0, 0
    tot_full_iso, tot_full_read = 0, 0
    tot_full_iso_bsj_fsm_iso, tot_full_iso_bsj_fsm_read, tot_full_iso_bsj_nic_iso, tot_full_iso_bsj_nic_read, tot_full_iso_bsj_nnc_iso, tot_full_iso_bsj_nnc_read = 0, 0, 0, 0, 0, 0
    tot_full_iso_int_fsm_iso, tot_full_iso_int_fsm_read, tot_full_iso_int_nic_iso, tot_full_iso_int_nic_read, tot_full_iso_int_nnc_iso, tot_full_iso_int_nnc_read = 0, 0, 0, 0, 0, 0
    bsj_dict = dict()
    iso_dict = dict()
    full_iso = dict()
    # non_full_iso = dict()
    with open(isoform_out) as in_fp:
        for line in in_fp:
            if line.startswith('#'): continue
            ele = line.rsplit()
            read_cnt = int(ele[idx['readCount']])
            bsj = (ele[idx['chrom']], ele[idx['startCoor0based']],
                   ele[idx['endCoor']])
            if bsj not in bsj_dict:
                bsj_dict[bsj] = 1
                tot_bsj += 1
                for i, known_bsj in enumerate(
                        ele[idx['isKnownBSJ']].rsplit(',')):
                    tot_known_bsj[i] += (known_bsj == 'True')
                tot_known_bsj[i + 1] += ('False' not in ele[idx['isKnownBSJ']])
            tot_circRNA_read_n += read_cnt

            iso = (ele[idx['chrom']], ele[idx['startCoor0based']],
                   ele[idx['endCoor']], ele[idx['blockCount']],
                   ele[idx['blockSize']], ele[idx['blockStarts']])
            # is_full = ele[idx['isFullLength']] == 'True'
            # if is_full and iso in non_full_iso:
            # print('Full\t{}'.format(ele[0]))
            # if not is_full and iso in full_iso:
            # print('Non-full\t{}'.format(ele[0]))
            # if is_full: full_iso[iso] = 1
            # else: non_full_iso[iso] = 1
            if iso not in iso_dict:
                isoform_inc_cnt = 1
                iso_dict[iso] = 1
            else:
                isoform_inc_cnt = 0
            tot_isoform += isoform_inc_cnt
            if 'False' not in ele[idx['isKnownSS']]:
                tot_read_with_known_ss += read_cnt
                tot_iso_with_known_ss += isoform_inc_cnt
            # if 'False' not in ele[idx['isCanoSJ']]:
            #     tot_iso_with_cano_sj += isoform_inc_cnt
            #     tot_read_with_cano_sj += read_cnt
            if 'False' not in ele[idx['isHighFSJ']]:
                tot_iso_with_high_sj += isoform_inc_cnt
                tot_read_with_high_sj += read_cnt
                if 'False' not in ele[idx['isKnownSS']]:
                    tot_iso_with_high_sj_known_ss += isoform_inc_cnt
                    tot_read_with_high_sj_known_ss += read_cnt
            if ele[idx['isFullLength']] == 'True':
                full_iso[iso] = 1
                tot_full_iso += isoform_inc_cnt
                tot_full_read += read_cnt
                if ele[idx['BSJCate']] == 'FSM':
                    tot_full_iso_bsj_fsm_iso += isoform_inc_cnt
                    tot_full_iso_bsj_fsm_read += read_cnt
                elif ele[idx['BSJCate']] == 'NIC':
                    tot_full_iso_bsj_nic_iso += isoform_inc_cnt
                    tot_full_iso_bsj_nic_read += read_cnt
                else:
                    tot_full_iso_bsj_nnc_iso += isoform_inc_cnt
                    tot_full_iso_bsj_nnc_read += read_cnt

                if ele[idx['FSJCate']] == 'FSM':
                    tot_full_iso_int_fsm_iso += isoform_inc_cnt
                    tot_full_iso_int_fsm_read += read_cnt
                elif ele[idx['FSJCate']] == 'NIC':
                    tot_full_iso_int_nic_iso += isoform_inc_cnt
                    tot_full_iso_int_nic_read += read_cnt
                else:
                    tot_full_iso_int_nnc_iso += isoform_inc_cnt
                    tot_full_iso_int_nnc_read += read_cnt

    ut.err_format_time('basic_stats_core', 'Writing basic stats to file ... ')
    with open(stats_out, 'w') as out:
        out.write('#' + __program__ + '\t' + __version__ + '\n')
        out.write(
            '# cons=consensus sequence, high=high-confidence, cano=canonical, BSJ=back-splice junction, FSJ=forward-splice junction, SS=splice site\n'
        )
        out.write('1_Total_reads\t{:,}\n'.format(tot_read_n))
        # out.write('Total_base\t{:,}\n'.format(tot_base))
        out.write('2_Total_reads_with_cons\t{:,}\n'.format(tot_read_cons_n))
        out.write(
            '3_Total_mappable_reads_with_cons\t{:,}\n'.format(tot_map_read_n))
        out.write('4_Total_reads_with_candidate_BSJs\t{:,}\n'.format(
            all_bsj_stats_dict['read_with_bsj_n']))
        out.write('5_Total_candidate_BSJs\t{:,}\n'.format(
            all_bsj_stats_dict['bsj_n']))
        if len(all_bsj_stats_dict['known_bsj_n']) > 2:
            for i in all_bsj_stats_dict['known_bsj_n']:
                bsj_idx = 'all' if i == len(
                    all_bsj_stats_dict['known_bsj_n']) - 1 else i
                out.write('6_Total_candidate_BSJs_known_in_{}\t{:,}\n'.format(
                    bsj_idx, all_bsj_stats_dict['known_bsj_n'][i]))
        else:
            out.write('6_Total_known_candidate_BSJs\t{:,}\n'.format(
                all_bsj_stats_dict['known_bsj_n'][0]))
        out.write('7_Total_reads_with_high_confidence_BSJs\t{:,}\n'.format(
            tot_circRNA_read_n))
        out.write('8_Total_high_confidence_BSJs\t{:,}\n'.format(tot_bsj))
        if len(tot_known_bsj) > 2:
            for i in tot_known_bsj:
                bsj_idx = 'all' if i == len(tot_known_bsj) - 1 else i
                out.write(
                    '9_Total_high_confidence_BSJs_known_in_{}\t{:,}\n'.format(
                        bsj_idx, tot_known_bsj[i]))
        else:
            out.write('9_Total_known_high_confidence_BSJs\t{:,}\n'.format(
                tot_known_bsj[0]))
        out.write(
            '10_Total_isoforms_with_high_BSJs\t{:,}\n'.format(tot_isoform))
        # out.write('11_Total_isoforms_with_high_BSJs_cano_SJs\t{:,}\n'.format(tot_iso_with_cano_sj))
        out.write('11_Total_isoforms_with_high_BSJs_high_FSJs\t{:,}\n'.format(
            tot_iso_with_high_sj))
        out.write('12_Total_isoforms_with_high_BSJ_known_SSs\t{:,}\n'.format(
            tot_iso_with_known_ss))
        out.write(
            '13_Total_isoforms_with_high_BSJs_high_FSJs_known_SSs\t{:,}\n'.
            format(tot_iso_with_high_sj_known_ss))
        out.write('14_Total_full_length_isoforms\t{:,}\n'.format(
            len(full_iso)))  #tot_full_iso))
        out.write('15_Total_reads_for_full_length_isoforms\t{:,}\n'.format(
            tot_full_read))
        # FSM/NIC/NNC
        out.write('16_Total_full_length_isoforms_with_FSM_BSJ\t{:,}\n'.format(
            tot_full_iso_bsj_fsm_iso))
        out.write(
            '17_Total_reads_for_full_length_isoforms_with_FSM_BSJ\t{:,}\n'.
            format(tot_full_iso_bsj_fsm_read))
        out.write('18_Total_full_length_isoforms_with_NIC_BSJ\t{:,}\n'.format(
            tot_full_iso_bsj_nic_iso))
        out.write(
            '19_Total_reads_for_full_length_isoforms_with_NIC_BSJ\t{:,}\n'.
            format(tot_full_iso_bsj_nic_read))
        out.write('20_Total_full_length_isoforms_with_NNC_BSJ\t{:,}\n'.format(
            tot_full_iso_bsj_nnc_iso))
        out.write(
            '21_Total_reads_for_full_length_isoforms_with_NNC_BSJ\t{:,}\n'.
            format(tot_full_iso_bsj_nnc_read))

        out.write('22_Total_full_length_isoform_with_FSM_FSJ\t{:,}\n'.format(
            tot_full_iso_int_fsm_iso))
        out.write(
            '23_Total_reads_full_length_isoforms_with_FSM_FSJ\t{:,}\n'.format(
                tot_full_iso_int_fsm_read))
        out.write('24_Total_full_length_isoforms_with_NIC_FSJ\t{:,}\n'.format(
            tot_full_iso_int_nic_iso))
        out.write(
            '25_Total_reads_for_full_length_isoforms_with_NIC_FSJ\t{:,}\n'.
            format(tot_full_iso_int_nic_read))
        out.write('26_Total_full_length_isoforms_with_NNC_FSJ\t{:,}\n'.format(
            tot_full_iso_int_nnc_iso))
        out.write(
            '27_Total_reads_for_full_length_isoforms_with_NNC_FSJ\t{:,}\n'.
            format(tot_full_iso_int_nnc_read))
    ut.err_format_time('basic_stats_core', 'Writing basic stats to file done!')
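
A call sketch with placeholder file names; `idx`, `__program__`, and `__version__` are assumed to be module-level globals defined elsewhere in the source, and `dd` is `collections.defaultdict` as above:

from collections import defaultdict as dd  # assumed import, matching the dd() used above

# Placeholder file names; the dict keys mirror those read inside stats_core.
all_bsj_stats = {'read_with_bsj_n': 0, 'bsj_n': 0, 'known_bsj_n': dd(lambda: 0)}
stats_core('read_len.list', 'cons.info', 'cons.sorted.bam',
           'isoform.out', all_bsj_stats, 'stats.out')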
Example #5
def restore_gff_db(gtf_fn):
    gtf_db = None
    if gtf_fn is not None:
        gtf_db_fn = gtf_fn + '.gffdb'
        if not os.path.isfile(gtf_db_fn):
            try:
                # check if 'gene' or 'transcript' is in GTF
                disable_gene, disable_trans = False, False
                with open(gtf_fn) as fp:
                    l = 0
                    for line in fp:
                        line = line.rstrip()
                        if len(line) < 1: continue
                        if line[0] != '#':
                            if line.split()[2] == 'gene':
                                disable_gene = True
                            elif line.split()[2] == 'transcript':
                                disable_trans = True
                            l += 1

                        if (disable_gene and disable_trans) or l == 100: break
                ut.err_format_time(
                    'restore_gtf_db',
                    'Creating GTF databases for {} ...'.format(gtf_fn))
                gtf_db = gu.create_db(gtf_fn,
                                      gtf_db_fn,
                                      disable_infer_genes=disable_gene,
                                      disable_infer_transcripts=disable_trans)
                ut.err_format_time(
                    'restore_gtf_db',
                    'Creating GTF databases for {} done!'.format(gtf_fn))

            except:
                ut.err_format_time(
                    'restore_gtf_db',
                    'Error in parsing {}\nCheck if annotation file format is correct'
                    .format(gtf_fn))
                sys.exit(IOError)
        else:
            try:
                ut.err_format_time(
                    'restore_gtf_db',
                    'Retrieving gff database for {} ...'.format(gtf_fn))
                gtf_db = gu.FeatureDB(gtf_db_fn)
                ut.err_format_time(
                    'restore_gtf_db',
                    'Retrieving gff database for {} done!'.format(gtf_fn))

            except:
                ut.err_format_time(
                    'restore_gtf_db',
                    'Error in parsing {}\nTry to remove this db file and re-run'
                    .format(gtf_db_fn))
                sys.exit(IOError)
    return gtf_db
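
A usage sketch, assuming `gffutils` is imported as `gu` and `ut` is the same logging helper as above; the GTF path is a placeholder:

# 'anno.gtf' is a placeholder; the first call builds 'anno.gtf.gffdb',
# later calls reload it through gffutils.FeatureDB.
gtf_db = restore_gff_db('anno.gtf')
if gtf_db is not None:
    for gene in gtf_db.features_of_type('gene'):
        print(gene.id)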