def _pred_gene(ps):  ### trans
    '''Predict ORFs in every transcript of one gene.

    This is the per-gene worker mapped over genes by the caller; it reads its
    configuration (bam paths, offsets, thresholds, flags) from module-level
    globals rather than from arguments.

    Parameters:
      ps: a 3-tuple ``(g, candidates, pf)``.
        g          - gene object; ``g.trans`` is iterated for transcripts.
        candidates - ``None`` to scan all possible ORFs, otherwise a mapping
                     ``transcript id -> iterable of (tis, stop)`` pairs to test.
        pf         - mapping ``transcript id -> (tis profile, ribo profile)``
                     of pre-computed count profiles (may be empty).
                     NOTE(review): ``len(pf)`` is taken unconditionally, so
                     the caller presumably never passes ``None`` - confirm.

    Returns:
      ``(es, j, tpfs, g)`` where
        es   - list of result records built by ``getResult`` for accepted ORFs,
        j    - ``[n_orf, n_tis]``: number of ORFs tested and number of start
               sites tested (both incremented once per user candidate),
        tpfs - ``{"gid\\ttid\\tsymbol": "tis_counts\\tribo_counts"}``, filled
               only when the global ``transprofile`` is not None,
        g    - the input gene, passed back unchanged.
    '''
    g, candidates, pf = ps
    es, j = [], [0, 0]
    tpfs = {}  #trans profiles
    genome = fa.Fa(genomefapath)
    has_tis = len(tisbampaths) > 0
    # Decide whether bam files must be read: skip loading when the supplied
    # profile has at least as many entries as candidates / transcripts.
    load = True
    if candidates is not None:
        if len(candidates) == 0:
            return es, j, tpfs, g
        if len(pf) >= len(candidates):
            load = False
    if len(pf) >= len(g.trans):
        load = False
    if load:
        tismbl = ribo.multiRiboGene(g,
                                    tisbampaths,
                                    offdict=tisoffdict,
                                    compatible=compatible,
                                    mis=compatiblemis,
                                    paired=paired)
        ribombl = ribo.multiRiboGene(g,
                                     ribobampaths,
                                     offdict=riboffdict,
                                     compatible=compatible,
                                     mis=compatiblemis,
                                     paired=paired)
    for t in g.trans:
        #if candidates is not None and t.id not in candidates : continue
        tl = t.cdna_length()
        # Transcripts shorter than the configured minimum are ignored.
        if tl < ribo.minTransLen:
            continue  # return es, j, tpfs, g ##
        #ttis = ribo.multiRibo(t, tisbampaths, offdict = tisoffdict, compatible = compatible)
        #tribo = ribo.multiRibo(t, ribobampaths, offdict = riboffdict, compatible = compatible)
        if not load:
            # Rebuild count profiles from the pre-computed input profile.
            if t.id in pf:
                tispf, ribopf = pf[t.id]
                ttis = ribo.Ribo(t)
                tribo = ribo.Ribo(t)
                if has_tis:
                    ttis.dict2cnts(tispf)
                tribo.dict2cnts(ribopf)
            else:
                print(
                    'Warning: transcript {} {} {} not in input trans profile! \n'
                    .format(t.gid, t.id, t.symbol))
                continue
        else:
            ttis = ribo.Ribo(t,
                             bamload=tismbl,
                             compatible=compatible,
                             mis=compatiblemis)
            tribo = ribo.Ribo(t,
                              bamload=ribombl,
                              compatible=compatible,
                              mis=compatiblemis)
        # ip indexes the global ``paras`` list; it is derived from the TIS
        # profile score and the global ``slp`` thresholds.
        score = ttis.abdscore()
        ip = ribo.pidx(score, slp)
        if verbose >= 2:
            print(io.tabjoin(g.id, t.id, ttis.total, tribo.total))
        cds1 = t.cds_start(cdna=True)
        cds2 = t.cds_stop(cdna=True)
        tsq = genome.transSeq(t)
        if transprofile is not None:
            # Record the profiles before any TIS->ribo merge below.
            tid = '{}\t{}\t{}'.format(t.gid, t.id, t.symbol)
            tpfs[tid] = '{}\t{}'.format(ttis.cnts_dict_str(),
                                        tribo.cnts_dict_str())
        if has_tis and tis2ribo:
            tribo.merge(ttis)  ##
        # user provided candidates
        if candidates is not None:
            if t.id not in candidates:
                continue
            for tis, stop in candidates[t.id]:
                # Each candidate counts as one ORF and one TIS tested.
                j[0] += 1
                j[1] += 1
                if has_tis:
                    tp = ttis.tis_test(tis, paras[ip][0], paras[ip][1])
                else:
                    tp = None
                if enrichtest:
                    rp = tribo.enrich_test(tis, stop)
                else:
                    rp = tribo.frame_test(tis, stop)
                # Threshold filters: TIS p-value, ribo p-value, then the
                # smaller of the two against minpth.
                if tp is not None and tp > tpth:
                    continue
                if rp > fpth:
                    continue  # or fisher > fspth
                minp = rp
                if tp is not None and tp < minp:
                    minp = tp
                if minp > minpth:
                    continue
                fsp, fss = stat.fisher_method([tp, rp])
                # if fsp > fspth: continue
                # The last three bases before ``stop`` are checked against
                # the stop-codon set.
                has_stop = tsq[stop - 3:stop] in orf.cstop
                e = getResult(t, tis, stop, cds1, cds2, tsq,
                              [ip, ttis.cnts[tis], tp, rp, 'N', fsp],
                              has_stop)
                es.append(e)
        else:  #all possible ORFs
            orfs = orf.orflist(tsq, minaalen=minaalen, tail=tl)
            for o in orfs:
                starts = o.starts
                if alt:
                    # NOTE: ``+=`` extends ``o.starts`` in place (same list).
                    starts += o.altstarts
                    starts.sort()
                if longest and not has_tis:
                    # Without TIS data only the first start is considered.
                    starts = starts[0:1]
                ol = len(starts)
                if ol == 0:
                    continue
                tps = [None] * ol
                rps = [1] * ol
                # Skip ORFs with no counts at every third position from the
                # first start to the stop, in both profiles.
                if has_tis:
                    allz_tis = max(
                        ttis.cnts[starts[0]:o.stop:3]) == 0  # all zeros
                else:
                    allz_tis = True
                allz_ribo = max(
                    tribo.cnts[starts[0]:o.stop:3]) == 0  # all zeros
                if allz_tis and allz_ribo:
                    continue
                for i, tis in enumerate(starts):
                    if has_tis:
                        tps[i] = ttis.tis_test(tis, paras[ip][0],
                                               paras[ip][1])
                    # Once the remaining region is all-zero, allz_ribo stays
                    # True and later starts keep the default p-value of 1.
                    if not allz_ribo:
                        allz_ribo = max(tribo.cnts[tis:o.stop:3]) == 0
                    if not allz_ribo:
                        if enrichtest:
                            rps[i] = tribo.enrich_test(tis, o.stop)
                        else:
                            rps[i] = tribo.frame_test(tis, o.stop)
                rst = pvalStatus(rps)
                for i, tis in enumerate(starts):
                    if tps[i] is not None and tps[i] > tpth:
                        continue
                    if rps[i] > fpth:
                        continue  # or fishers[i] > fspth
                    minp = rps[i]
                    if tps[i] is not None and tps[i] < minp:
                        minp = tps[i]
                    if minp > minpth:
                        continue
                    # Extra filters apply only when the TIS test is absent or
                    # not below minpth.
                    if tps[i] is None or tps[i] > minpth:
                        if longest:
                            if i > 0:
                                continue
                        else:
                            if framelocalbest and rst[i] == 'N':
                                continue
                            if framebest and rst[i][0] != 'T':
                                continue
                    fsp, fss = stat.fisher_method([tps[i], rps[i]])
                    # if fsp > fspth: continue
                    e = getResult(
                        t, tis, o.stop, cds1, cds2, tsq,
                        [ip, ttis.cnts[tis], tps[i], rps[i], rst[i], fsp],
                        o.has_stop_codon)
                    #tistype = tisType(tis, o.stop, cds1, cds2)
                    #orfstr = '{}\t{}\t{}'.format(tsq[tis:tis+3],tis,o.stop)
                    #tid = "%s\t%s\t%s\t%s\t%s:%d-%d:%s\t%s\t%s" % (t.gid, t.id, t.symbol, t.genetype, t.chr, t.genome_pos(tis), t.genome_pos(o.stop), t.strand, orfstr, tistype)
                    #values = [ip, ttis.cnts[tis], tps[i], rps[i], rst[i]] # , fishers[i]]
                    #e = exp.Exp(tid, values)
                    #e.length = (o.stop - tis) / 3 - 1
                    #e.sq = tsq[tis:o.stop]
                    #e.chr, e.strand, e.tistype = t.chr, t.strand, tistype
                    #if e.tistype == 'Extended' : e.cr = interval.cds_region_trans(t, tis, tis+3)
                    #else : e.cr = interval.cds_region_trans(t, tis, o.stop)
                    es.append(e)
                # Test counters for this ORF: ``ol`` start sites, one ORF.
                #if has_tis :
                j[1] += ol
                j[0] += 1
    return es, j, tpfs, g
def run(args):
    '''Main function for differential TIS.

    Reads the TIS prediction tables of two conditions (``args.tis1paths`` and
    ``args.tis2paths``), optionally an RNA-seq count table (``args.rnaseq``),
    fills in missing TIS counts from the bam files, and then either

    * exports the raw count table and returns (when any condition has more
      than one input table or ``args.export`` is set), or
    * estimates scale factors (TMM unless ``args.scalefactor`` /
      ``args.rnascale`` are given), tests every TIS for a difference between
      the two conditions, BH-corrects the p-values, writes the TISs passing
      ``args.foldchange`` / ``args.opth`` / ``args.oqth`` to ``args.output``
      and draws the optional plots (``args.plotout``, ``args.plotma``).

    Input TIS table columns used: 0 gene id, 1 transcript id, 4 genome
    position, 6 start, 8 TIS type, 10 count, 11 p-value, ``args.qi`` q-value;
    the first nine columns are copied to the output.

    Parameters:
      args: parsed command-line namespace.

    Returns:
      None.  Side effects: sets several module-level globals and attributes
      of the ``ribo``, ``bam`` and ``fa`` modules, and writes output files.

    Raises:
      SystemExit: on missing bam input or invalid RNA-seq values.
    '''
    global ipth, iqth, tis1bampaths, tis2bampaths, tis1offdict, tis2offdict, compatible, compatiblemis, paired
    ipth, iqth = args.ipth, args.iqth
    tis1bampaths = args.tis1bampaths
    tis2bampaths = args.tis2bampaths
    ribo.maxNH, ribo.minMapQ, ribo.secondary = args.maxNH, args.minMapQ, args.secondary
    compatible = not args.nocompatible
    compatiblemis = args.compatiblemis
    paired = args.paired
    if len(tis1bampaths) < len(args.tis1paths) or len(tis2bampaths) < len(
            args.tis2paths):  # == 0 :
        print('Missing bam file input!')
        sys.exit(1)
    if args.chrmap is not None:
        # Two-way chromosome-name mapping shared by the bam and fasta readers.
        chrmap = {}
        for lst in io.splitIter(args.chrmap):
            chrmap[lst[0]] = lst[1]
            chrmap[lst[1]] = lst[0]
        bam.chrmap = chrmap
        fa.chrmap = chrmap
    global tis1bampathslist, tis2bampathslist, tis1offdictlist, tis2offdictlist
    # Each entry may name several ';'-separated bam files (and parameters).
    tis1bampathslist = [s.split(';') for s in args.tis1bampaths]
    tis2bampathslist = [s.split(';') for s in args.tis2bampaths]
    if args.tis1para is None:
        tis1paralist = [None] * len(args.tis1paths)
    else:
        tis1paralist = [s.split(';') for s in args.tis1para]
    if args.tis2para is None:
        tis2paralist = [None] * len(args.tis2paths)
    else:
        tis2paralist = [s.split(';') for s in args.tis2para]
    tis1offdictlist = [
        find_offset(bampaths, para)
        for bampaths, para in zip(tis1bampathslist, tis1paralist)
    ]
    tis2offdictlist = [
        find_offset(bampaths, para)
        for bampaths, para in zip(tis2bampathslist, tis2paralist)
    ]
    # Samples without an explicit label are labelled by their input path.
    if len(args.tis1labels) < len(args.tis1paths):
        args.tis1labels.extend(args.tis1paths[len(args.tis1labels):])
    if len(args.tis2labels) < len(args.tis2paths):
        args.tis2labels.extend(args.tis2paths[len(args.tis2labels):])
    title = args.tis1labels + args.tis2labels
    tis_title = ['TIS_' + lab for lab in title]
    rna_title = ['RNA_' + lab for lab in title]
    l = len(title)

    if args.rnaseq is not None:
        if args.verbose:
            print("Loading RNASeq data...")
        rna_profile = exp.Profile()
        for lst in io.splitIter(args.rnaseq):
            try:
                # A list (not a lazy map) so min() below works on Python 3 too.
                values = [int(v) for v in lst[1:]]
            except (TypeError, ValueError):
                try:
                    for v in lst[1:]:
                        float(v)
                except ValueError:
                    # Not numeric at all (e.g. a header line): skip it.
                    continue
                # Numeric but not integral: reject the input.
                print('Error: RNASeq data should be integers {}.'.format(lst))
                sys.exit(1)
            m = min(values)
            if m < 0:
                print('Error: RNASeq data should be non-negative integers {}.'.
                      format(lst))
                sys.exit(1)
            e = exp.Exp(lst[0], values)
            rna_profile.add_exp(e)
        if args.verbose:
            print("{} genes.".format(len(rna_profile)))

    if args.verbose:
        print("Loading {} TIS data...".format(l))
    gname, gpos, gsig = {}, {}, {}
    tall = []  # per-sample {tis: (count, pvalue, qvalue)}
    gid = {}
    anno = {}  # annotated TIS
    for i, fname in enumerate(args.tis1paths + args.tis2paths):
        n = 0
        tdata = {}
        for lst in io.splitIter(fname):
            try:
                tis = (lst[1], int(lst[6]))
            except (IndexError, ValueError):
                # Header or malformed line.
                continue
            gid[lst[1]] = lst[0]
            cnt, pval, qval = int(lst[10]), float(lst[11]), float(lst[args.qi])
            tdata[tis] = cnt, pval, qval
            lst[4] = get_tis(lst[4])
            if lst[8] == 'Annotated':
                anno[lst[4]] = 1  # genome position
            if sig(tdata[tis]):
                n += 1
                gname[tis] = '\t'.join(lst[:9])  # information for the TIS
                gpos[tis] = lst[4]
                if tis not in gsig:
                    gsig[tis] = [0] * len(tis_title)
                gsig[tis][i] = 1
        if args.verbose:
            print("{} TISs in {}.".format(n, fname))
        tall.append(tdata)

    profile = exp.Profile()
    profile2 = exp.Profile()  # uniq TISs for TMM
    trans_for_bam = {}  # TIS genes need to be analyzed
    es = {}
    uniq_gpos = {}
    for tis in gname:
        values = []
        for i, tdata in enumerate(tall):
            if tis not in tdata:
                # Count unknown for this sample: mark the position so it is
                # read from the bam files later.
                if tis[0] not in trans_for_bam:
                    trans_for_bam[tis[0]] = [{} for _ in range(l)]
                trans_for_bam[tis[0]][i][tis[1]] = None
                values.append(None)
            else:
                values.append(tdata[tis][0])
        if args.rnaseq is not None:
            if tis[0] in rna_profile.exps:
                values += rna_profile.exps[tis[0]].data  # trans level
            elif gid[tis[0]] in rna_profile.exps:
                values += rna_profile.exps[gid[tis[0]]].data  # gene level
            else:
                print('Warning: transcript {} {} is not found in RNA file!'.
                      format(gid[tis[0]], tis[0]))
                values += [0] * l  # len(title)
        e = exp.Exp(gname[tis], values)
        e.tis = tis
        es[tis] = e
        profile.add_exp(e)
        # profile2 holds at most one TIS per genome position.
        if gpos[tis] not in uniq_gpos:
            if args.normanno and gpos[tis] not in anno:
                continue
            if args.normcomm:
                # Keep only TISs significant in every sample.
                for tdata in tall:
                    if tis not in tdata or not sig(tdata[tis]):
                        break
                else:
                    profile2.add_exp(e)
            else:
                profile2.add_exp(e)
            uniq_gpos[gpos[tis]] = 1
    if args.verbose:
        print("{} TISs in total.".format(len(profile)))

    if not args.normcomm:
        # Take, for every sample, the top ``m`` entries of profile2 after
        # sorting on (significant in that sample, count in that sample).
        uns = [
            sum(1 for e in profile2 if gsig[e.tis][i] == 1)
            for i in range(len(tis_title))
        ]
        m = min(uns)
        elst = list(profile2.exps.values())
        profile3 = exp.Profile()
        for i in range(len(tis_title)):
            for e in elst:
                e.value[0:2] = [gsig[e.tis][i], e.data[i]]
            elst.sort(reverse=True)
            for k in range(m):
                if elst[k].id not in profile3.exps:
                    profile3.add_exp(elst[k])
        profile2 = profile3

    if args.verbose:
        print("Reading bams...")
    trans_iter = io.transIter(args.genepath,
                              fileType=args.geneformat,
                              verbose=args.verbose,
                              filt=trans_for_bam)
    para_iter = transPara(trans_iter, trans_for_bam)
    if args.numProc <= 1:
        # itertools.imap exists only on Python 2; the builtin map is already
        # lazy on Python 3.
        pred_iter = getattr(itertools, 'imap', map)(_get_tis, para_iter)
    else:
        from multiprocessing import Pool
        pool = Pool(processes=args.numProc - 1)
        pred_iter = pool.imap_unordered(_get_tis, para_iter, chunksize=5)
    for result in pred_iter:
        tid, pos_cnt = result
        for i, pc in enumerate(pos_cnt):
            for pos in pc:
                tis = tid, pos
                es[tis].data[i] = pc[pos]

    if len(args.tis1paths) > 1 or len(
            args.tis2paths) > 1 or args.export is not None:
        # Replicates (or explicit export): only write the count table.
        if args.export is None:
            args.export = 'tisdiff_export.txt'
        if args.verbose:
            print('Export TIS counts table to {}.'.format(args.export))
        with open(args.export, 'w') as exfile:
            if args.rnaseq is None:
                exfile.write(io.tabjoin('TIS', tis_title) + '\n')
            else:
                exfile.write(io.tabjoin('TIS', tis_title, rna_title) + '\n')
            for tis in gname:
                s = '{}_{}_{}\t'.format(tis[0], tis[1], gpos[tis])
                s += io.tabjoin(es[tis].data)
                exfile.write(s + '\n')
        return

    # f / fr are the log2 TMM factors; they stay None when the scale is
    # supplied by the user and are derived from it only if a plot needs them.
    f = fr = None
    if args.scalefactor is not None:
        scale = args.scalefactor
    else:
        if args.verbose:
            print('Estimate scale factor...')
        f = profile2.TMM(i1=0, i2=1)
        if args.verbose:
            print('TIS TMM log2 f = {}'.format(f))
        scale = 2**(-f)
    if args.rnaseq is not None:
        if args.rnascale is not None:
            scale_r = args.rnascale
        else:
            fr = rna_profile.TMM(i1=0, i2=1)  # for only one replicate
            if args.verbose:
                print('RNASeq TMM log2 f = {}'.format(fr))
            scale_r = 2**(-fr)

    if args.verbose:
        print('Diff test...')
    exps = list(profile.exps.values())
    for e in exps:
        if args.rnaseq is None:
            # 1.0 keeps this a true division even for an integer scale.
            p = 1.0 / (scale + 1)
            x, y = e.data[0], e.data[1]
            n = x + y
            if x == 0:
                fc, alt = 'INF', 'less'
            elif y == 0:
                fc, alt = 0, 'greater'
            else:
                fc = y / (1.0 * x * scale)
                if scale * x <= y:
                    alt = 'less'
                else:
                    alt = 'greater'
            pv = stat.binom_test(n, x, p=p, alt=alt)
        else:
            x, y, r1, r2 = e.data[0:4]
            if x == 0:
                fc, alt = 'INF', 'less'
            elif y == 0 or r1 == 0:
                fc, alt = 0, 'greater'
            elif r2 == 0:
                fc, alt = 'INF', 'less'
            else:
                fc = y / (1.0 * x * scale) / (r2 / (1.0 * r1 * scale_r))
                if fc >= 1:
                    alt = 'less'  # test x
                else:
                    alt = 'greater'
            if args.chi2:
                pv = ribo.TIStest_chi2(x, y, r1, r2, scale, scale_r, alt=alt)
            elif args.betabinom:
                pv = ribo.TIStest_betaBinom(x,
                                            y,
                                            r1,
                                            r2,
                                            scale,
                                            scale_r,
                                            alt=alt)
            else:
                pv = ribo.TIStest_FisherExact(x,
                                              y,
                                              r1,
                                              r2,
                                              scale,
                                              scale_r,
                                              alt=alt)
        pv *= 2  # two tailed
        if pv > 1:
            pv = 1
        e.data.append(fc)
        e.data.append(pv)
    # Appends the q-value, so data ends with [fold change, p, q].
    result = profile.BHcorrection(-1, append=True)

    if args.verbose:
        print('Output...')
    s = "Gid\tTid\tSymbol\tGeneType\tGenomePos\tStartCodon\tStart\tStop\tTisType\t"
    s += '\t'.join(tis_title)
    if args.rnaseq is not None:
        s += '\t' + '\t'.join(rna_title)
    s += '\tFoldChange\tDiffPvalue\tDiffQvalue\n'
    with open(args.output, 'w') as outfile:
        outfile.write(s)
        for e in profile:
            fc = e.data[-3]
            # is_q / is_fc are also used to colour the scatter plot below.
            e.is_q = e.is_fc = True
            if fc != 'INF' and fc != 0 and max(fc, 1 / fc) < args.foldchange:
                e.is_fc = False
            if e.data[-2] > args.opth or e.data[-1] > args.oqth:
                e.is_q = False
            if e.is_q and e.is_fc:
                outfile.write(str(e) + '\n')

    # Plot
    if args.plotout is not None or args.plotma is not None:
        # Imported here so that --plotma also works without --plotout.
        from zbio import plot
        # Recover the log2 factors from user-supplied scales
        # (scale == 2 ** -f  <=>  f == -log2(scale)).
        if f is None:
            f = -math.log(scale, 2)
        if args.rnaseq is not None and fr is None:
            fr = -math.log(scale_r, 2)
    if args.plotout is not None:
        if args.verbose:
            print("Ploting...")
        plot.figure(figsize=args.figsize)
        if args.rnaseq is not None:
            # log2 TIS count relative to log2 RNA count, per condition.
            def _xval(e):
                return math.log(e.data[0] + 1, 2) - math.log(e.data[2] + 1, 2)

            def _yval(e):
                return math.log(e.data[1] + 1, 2) - math.log(e.data[3] + 1, 2)
        else:

            def _xval(e):
                return math.log(e.data[0] + 1, 2)

            def _yval(e):
                return math.log(e.data[1] + 1, 2)

        sig_fc = [e for e in exps if e.is_q and e.is_fc]
        sig_nofc = [e for e in exps if e.is_q and not e.is_fc]
        nonsig = [e for e in exps if not e.is_q]
        qd1 = [_xval(e) for e in sig_fc]
        qd2 = [_yval(e) for e in sig_fc]
        pd1 = [_xval(e) for e in sig_nofc]
        pd2 = [_yval(e) for e in sig_nofc]
        nd1 = [_xval(e) for e in nonsig]
        nd2 = [_yval(e) for e in nonsig]
        plot.scatter(qd1,
                     qd2,
                     alpha=0.1,
                     edgecolors='none',
                     color='r',
                     label='q < {} & FC > {}'.format(args.oqth,
                                                     args.foldchange))
        plot.scatter(pd1,
                     pd2,
                     alpha=0.1,
                     edgecolors='none',
                     color='y',
                     label='q < {} & FC <= {}'.format(args.oqth,
                                                      args.foldchange))
        plot.scatter(nd1,
                     nd2,
                     alpha=0.1,
                     edgecolors='none',
                     color='g',
                     label='q >= {}'.format(args.oqth))
        plot.legend(loc='upper left', frameon=False)
        plot.xlabel(title[0])
        plot.ylabel(title[1])
        if args.rnaseq is not None:
            d = (fr - f) / 2
        else:
            d = -f / 2
        m1 = max(min(qd1 + pd1 + nd1), min(qd2 + pd2 + nd2))
        m2 = min(max(qd1 + pd1 + nd1), max(qd2 + pd2 + nd2))
        # Black dotted line: no change; red dotted lines: fold-change cutoff.
        plot.plot([m1 - d, m2 - d], [m1 + d, m2 + d], color='k', linestyle=':')
        d2 = d - math.log(args.foldchange, 2) / 2
        plot.plot([m1 - d2, m2 - d2], [m1 + d2, m2 + d2],
                  color='r',
                  linestyle=':')
        d2 = d + math.log(args.foldchange, 2) / 2
        plot.plot([m1 - d2, m2 - d2], [m1 + d2, m2 + d2],
                  color='r',
                  linestyle=':')
        plot.save(args.plotout)
    if args.plotma is not None:
        # Only entries carrying a ``select`` attribute are drawn.
        # NOTE(review): presumably set by Profile.TMM - confirm.
        exps = list(profile2.exps.values())
        plot.figure(figsize=args.figsize)
        plot.axhline(f)
        ms = [e.M for e in exps if hasattr(e, 'select') and e.select]
        aa = [e.A for e in exps if hasattr(e, 'select') and e.select]
        plot.scatter(aa, ms, alpha=0.1, edgecolors='none', color='r')
        ms = [e.M for e in exps if hasattr(e, 'select') and not e.select]
        aa = [e.A for e in exps if hasattr(e, 'select') and not e.select]
        plot.scatter(aa, ms, alpha=0.1, edgecolors='none', color='b')
        plot.save(args.plotma)
def run(args):
    '''Main function for ORF finding.

    Steps, in order:
      1. copy the command-line options into module-level globals (read by the
         per-gene worker ``_pred_gene``) and into ``ribo`` / ``bam`` / ``fa`` /
         ``orf`` module attributes;
      2. obtain the TIS background parameters ``paras`` / ``slp``: defaults
         when no TIS bam is given, estimated with ``ribo.estimateTISbg`` and
         saved to ``args.estpath``, or loaded from ``args.inestpath``;
      3. load optional candidate ORFs (``args.input``) and pre-computed
         transcript profiles (``args.inprofile``);
      4. run ``_pred_gene`` over all genes (in a process pool when
         ``args.numProc > 1``), collecting results and known CDS regions;
      5. tag results that are known TISs or overlap known CDS in frame,
         BH-correct the p-values and write ``args.output`` (plus an
         "all results" table unless disabled).

    Parameters:
      args: parsed command-line namespace.

    Returns:
      None.

    Raises:
      SystemExit: when neither TIS nor ribo bam files are given.
    '''
    # prepare
    global tisbampaths, tisoffdict, ribobampaths, riboffdict, genomefapath, compatible, compatiblemis
    global minaalen, enrichtest, slp, paras, verbose, alt, title, tis2ribo, gfilter
    global tpth, fpth, minpth, fspth, framebest, framelocalbest, longest, transprofile, TIS_types  #fspth
    global paired, seq, aaseq, blocks  # showtime
    paired, seq, aaseq, blocks = args.paired, args.seq, args.aaseq, args.blocks
    ribo.maxNH, ribo.minMapQ, ribo.secondary = args.maxNH, args.minMapQ, args.secondary
    tisbampaths = args.tisbampaths
    ribobampaths = args.ribobampaths
    if len(tisbampaths) == 0 and len(ribobampaths) == 0:
        print('No bam file input!')
        # Same effect as exit(1) without relying on the site-provided builtin.
        raise SystemExit(1)
    genomefapath = args.genomefapath
    compatible = not args.nocompatible
    compatiblemis = args.compatiblemis
    minaalen = args.minaalen
    enrichtest = args.enrichtest
    transprofile = args.transprofile
    harrwidth = None
    TIS_types = [
        'Annotated', 'Truncated', 'Extended', "5'UTR", "3'UTR", 'Internal',
        'Novel'
    ]
    if args.chrmap is not None:
        # Two-way chromosome-name mapping shared by the bam and fasta readers.
        chrmap = {}
        for lst in io.splitIter(args.chrmap):
            chrmap[lst[0]] = lst[1]
            chrmap[lst[1]] = lst[0]
        bam.chrmap = chrmap
        fa.chrmap = chrmap
    if args.harrwidth is not None:
        harrwidth = args.harrwidth
    elif args.harr:
        harrwidth = 15
    verbose = args.verbose
    alt = args.alt
    if args.altcodons is not None:
        # Giving alternative codons implies alt-start mode.
        alt = True
        if args.altcodons[0].upper() == 'ALL':
            orf.cstartlike = orf.allcodons
        else:
            orf.cstartlike = [c.upper() for c in args.altcodons]
    tpth, fpth, minpth, framebest, framelocalbest = args.tpth, args.fpth, args.minpth, args.framebest, args.framelocalbest
    fspth = args.fspth
    longest = args.longest
    tis2ribo = args.tis2ribo
    parts = [0.1 * (i + 1) for i in range(args.nparts)]
    gfilter = None
    if args.genefilter is not None:
        gfilter = {}
        for gid in args.genefilter:
            gfilter[gid] = 1
    tisoffdict = find_offset(args.tisbampaths, args.tispara)
    riboffdict = find_offset(args.ribobampaths, args.ribopara)
    if len(args.ribobampaths) == 0:
        print(
            'No regular RiboSeq data input. TIS data will also be used as regular RiboSeq data.'
        )
        tis2ribo = True
    if len(args.tisbampaths) == 1:
        # With a single TIS bam, reuse <bam>.bgest.txt if it exists,
        # otherwise save the new estimate there.
        if args.inestpath is None:
            path = args.tisbampaths[0] + '.bgest.txt'
            if isfile(path):
                args.inestpath = path
            else:
                args.estpath = path
    if args.agenepath is None:
        args.agenepath = args.genepath
    # load genome, fasta file indexing
    if args.verbose:
        print("{} Loading genome...".format(time.ctime()))
    genome = fa.Fa(args.genomefapath, verbose=args.verbose)

    # TIS background estimation
    if len(args.tisbampaths) == 0:
        print('No input TIS data!')
        paras, slp = [(1, 0.5)], [1]  # No TIS input
    elif args.inestpath is None:
        print("{} Estimating TIS background parameters...".format(
            time.ctime()))
        if args.verbose:
            print(
                "TIS background estimation result will be saved to {}".format(
                    args.estpath))
        if args.numProc > 1:
            from multiprocessing import Process
            import multiprocessing.pool

            class NoDaemonProcess(Process):
                # make 'daemon' attribute always return False
                def _get_daemon(self):
                    return False

                def _set_daemon(self, value):
                    pass

                daemon = property(_get_daemon, _set_daemon)

            class MyPool(multiprocessing.pool.Pool):
                Process = NoDaemonProcess

            pool = MyPool(1)  # This is for memory efficiency
            paras, slp, data = pool.apply(ribo.estimateTISbg,
                                          args=(args.agenepath,
                                                args.tisbampaths,
                                                args.genomefapath),
                                          kwds={
                                              'parts': parts,
                                              'offdict': tisoffdict,
                                              'numProc': args.numProc,
                                              'verbose': args.verbose,
                                              'geneformat': args.geneformat,
                                              'harrwidth': harrwidth,
                                              'paired': paired
                                          })
            pool.close()
        else:
            # NOTE(review): this branch passes args.genepath while the pooled
            # branch above passes args.agenepath - confirm which is intended.
            paras, slp, data = ribo.estimateTISbg(args.genepath,
                                                  args.tisbampaths,
                                                  args.genomefapath,
                                                  parts=parts,
                                                  offdict=tisoffdict,
                                                  numProc=1,
                                                  verbose=verbose,
                                                  geneformat=args.geneformat,
                                                  harrwidth=harrwidth,
                                                  paired=paired)
        with open(args.estpath, 'w') as estfile:
            for i in range(len(parts)):
                estfile.write("{}\t{}\t{}\t{}\t{}\n".format(
                    paras[i][0], paras[i][1], parts[i], slp[i], data[i]))
    else:
        paras, slp = [], []
        with open(args.inestpath, 'r') as inestfile:
            for l in inestfile:
                lst = l.strip().split('\t')
                paras.append((float(lst[0]), float(lst[1])))
                # SECURITY: eval() executes whatever is in the estimate file;
                # only use files produced by this program.
                slp.append(eval(lst[3]))

    if args.inprofile is not None and not isfile(args.inprofile):
        print('inprofile {} not found!'.format(args.inprofile))
        # Missing input profile: write the profile to that path instead.
        if args.transprofile is None:
            transprofile = args.inprofile
    if args.numProc > 1:
        from multiprocessing import Pool
        pool = Pool(processes=args.numProc - 1)

    cds_regions = {}
    known_tis = {}

    def _register_gene(g):
        # Add the CDS regions and annotated start sites of every transcript
        # of gene ``g`` to cds_regions / known_tis.
        if g.chr not in cds_regions:
            cds_regions[g.chr] = {
                '+': [interval.Interval() for _ in range(3)],
                '-': [interval.Interval() for _ in range(3)]
            }
            known_tis[g.chr] = {'+': {}, '-': {}}
        for t in g.trans:
            cr = interval.cds_region_trans(t)
            for i in range(3):
                cds_regions[t.chr][t.strand][i].lst += cr[i].lst
            tis = t.cds_start(cdna=False)
            if tis is not None:
                known_tis[t.chr][t.strand][tis] = 1

    if args.agenepath != args.genepath:
        if verbose:
            print('Loading CDS annotation...')
        for g in io.geneIter(args.agenepath,
                             fileType=args.geneformat,
                             chrs=genome.idx,
                             verbose=args.verbose):
            _register_gene(g)

    inorf = None
    if args.input is not None:
        if verbose:
            print('Loading candidates...')
        # transcript id -> list of [tis, stop]
        inorf = {}
        with open(args.input, 'r') as infile:
            for l in infile:
                lst = l.strip().split()
                tid, tis, stop = lst[0], int(lst[1]), int(lst[2])
                #if gfilter is not None and tid not in gfilter : continue
                if tid not in inorf:
                    inorf[tid] = []
                inorf[tid].append([tis, stop])

    inprofile = None
    if args.inprofile is not None:
        if isfile(args.inprofile):
            if verbose:
                print('Loading transcript profile...')
            # gene id -> transcript id -> (tis profile, ribo profile)
            inprofile = {}
            for lst in io.splitIter(args.inprofile):
                try:
                    # SECURITY: eval() executes whatever is in the profile
                    # file; only use files produced by this program.
                    gid, tid, tispf, ribopf = lst[0], lst[1], eval(
                        lst[3]), eval(lst[4])
                except Exception:
                    # Header or malformed line.
                    continue
                if gid not in inprofile:
                    inprofile[gid] = {}
                inprofile[gid][tid] = tispf, ribopf

    print("{} Predicting...".format(time.ctime()))
    profile = exp.Profile()
    title = ['TISGroup', 'TISCounts', 'TISPvalue', 'RiboPvalue', 'RiboPStatus']
    j = [0, 0]  # total number of ORF/TIS for BH correction
    gene_iter = io.geneIter(args.genepath,
                            fileType=args.geneformat,
                            chrs=genome.idx,
                            verbose=args.verbose)
    para_iter = genePara(gene_iter, inorf, inprofile)
    if args.numProc <= 1:
        # itertools.imap exists only on Python 2; the builtin map is already
        # lazy on Python 3.
        pred_iter = getattr(itertools, 'imap', map)(_pred_gene, para_iter)
    else:
        pred_iter = pool.imap_unordered(_pred_gene, para_iter, chunksize=5)
    tpfile = None
    if transprofile is not None:
        tpfile = open(transprofile, 'w')
        tpfile.write('Gid\tTid\tSymbol\tTISProf\tRiboProf\n')
    try:
        for result in pred_iter:
            es, ji, tpfs, g = result
            j[0] += ji[0]
            j[1] += ji[1]
            for e in es:
                profile.add_exp(e)
                if verbose >= 2:
                    print('{} {}'.format(time.ctime(), str(e)))
            if tpfile is not None:
                for tid in tpfs:
                    tpfile.write(io.tabjoin(tid, tpfs[tid]) + '\n')
            _register_gene(g)
    finally:
        if tpfile is not None:
            tpfile.close()
    for chrom in cds_regions:
        for strand in cds_regions[chrom]:
            for i in range(3):
                cds_regions[chrom][strand][i].check()

    print("{} Checking overlap with known CDS..".format(time.ctime()))
    for e in profile:
        if e.tistype == 0:
            continue
        elif e.gtis in known_tis[e.chr][e.strand]:
            e.id += ':Known'
        elif e.tistype > 1:  # ["5'UTR", "3'UTR", "Inside", "Novel", 'Extended'] :
            # Tag once if any of the three frame-specific regions overlaps.
            for i in range(3):
                its = cds_regions[e.chr][e.strand][i].intersect(e.cr[i])
                if its.rlen() > 0:
                    e.id += ':CDSFrameOverlap'
                    break

    print("{} BH correcting...".format(time.ctime()))
    profile.BHcorrection(2, total=j[1],
                         append=True)  # Calculate BH FDR of TIS p value
    profile.BHcorrection(3, total=j[0], append=True)  # Frame p value
    # Fisher's p-value is corrected by the TIS total, or by the ORF total
    # when there is no TIS data.
    i = 1
    if len(tisbampaths) == 0:
        i = 0
    profile.BHcorrection(5, total=j[i],
                         append=True)  # Calculate BH FDR for Fisher's p value

    s = "Gid\tTid\tSymbol\tGeneType\tGenomePos\tStartCodon\tStart\tStop\tTisType\t"
    s += '\t'.join(title)
    s += '\tFisherPvalue\tTISQvalue\tFrameQvalue\tFisherQvalue\tAALen'
    if seq:
        s += '\tSeq'
    if aaseq:
        s += '\tAASeq'
    if blocks:
        s += '\tBlocks'
    s += '\n'
    outfile = open(args.output, 'w')
    allout = None
    try:
        outfile.write(s)
        # The unfiltered table is skipped when switched 'OFF' or when the
        # q-value cutoff of 1 already keeps everything in the main output.
        if args.allresult is not None and args.allresult.upper() == 'OFF':
            allout = None
        elif args.fsqth == 1:
            allout = None
        else:
            if args.allresult is None:
                lst = args.output.split('.')
                if lst[-1] == 'txt':
                    args.allresult = args.output[:-4] + '_all.txt'
                else:
                    args.allresult = args.output + '_all.txt'
            allout = open(args.allresult, 'w')
            allout.write(s)
        for e in profile:
            if len(tisbampaths) == 0:
                e.data[5], e.data[8] = None, None  # No Fisher's
            s = "%s\t%d" % (e, e.length)
            if seq:
                s += '\t' + e.sq
            if aaseq:
                s += '\t' + e.aa
            if blocks:
                s += '\t' + e.blocks
            s += '\n'
            if allout is not None:
                allout.write(s)
            if e.q <= args.fsqth:
                outfile.write(s)
    finally:
        outfile.close()
        if allout is not None:
            allout.close()