def _mean_edge_cost(edge):
    """Return the mean over the edge's upper and lower cost lists combined."""
    return np.mean(np.array(edge.uppercosts + edge.lowercosts))


def analyze_simulation(edges, refhistoryid, historyScores, datout_fh, stats_fh, breaks_fh):
    """Score every edge against the reference history and write the reports.

    Each edge is classified by whether ``refhistoryid`` appears in its
    ``histories`` list:

    * present together with other histories -> true positive (isTrue=1);
    * present as the only history           -> false negative (isTrue=-1),
      later possibly reclassified by ``checkForCancellingEdges``;
    * absent                                -> false positive (isTrue=0).

    The TP/FP/FN/TN counters are 4-element lists: index 0 is the total and
    the remaining slots are indexed by the edge's event type (the stats
    header labels them Amp, Del, Adj -- presumably types 1..3; confirm
    against ``EdgeSimulationData.type``).

    Args:
        edges: iterable of edge/event objects; they are modified in place
            (``histories`` is filled from ``histRanges`` when empty, and
            ``likelihood`` is overwritten).
        refhistoryid: id of the reference ("true") history.
        historyScores: 2-D numpy array whose first column holds history ids.
            It is copied, never modified.
        datout_fh: writable handle for the per-edge table, or a falsy value
            to skip it.
        stats_fh: writable handle for the TP/FP/FN/TN summary and F1 score,
            or a falsy value to skip it.
        breaks_fh: writable handle for the breakpoint report, or a falsy
            value to skip it.
    """
    # Zero the reference history's row (in a copy) so that it doesn't get
    # included in the likelihood calculation.
    myhistScores = np.copy(historyScores)
    myhistScores[np.where(historyScores[:, 0] == refhistoryid), :] = 0
    totalp = histseg.compute_likelihood_histories(myhistScores[:, 0], myhistScores)

    TP = [0, 0, 0, 0]
    FP = [0, 0, 0, 0]
    TN = [0, 0, 0, 0]
    FN = [0, 0, 0, 0]
    FNedges = []
    types = histseg.Global_EVENTTYPES
    myEdgeSimData = []  # one EdgeSimulationData per edge, in input order

    for edge in edges:
        if not edge.histories:
            edge.histories = histseg.listout_ranges(edge.histRanges)
        myedgesim = EdgeSimulationData(edge)
        etype = myedgesim.type

        if refhistoryid in edge.histories:
            refindex = edge.histories.index(refhistoryid)
            myedgesim.refindex = refindex
            if len(edge.histories) > 1:
                TP[0] += 1
                TP[etype] += 1
                myedgesim.isTrue = 1
                # Temporarily drop the reference entry so the likelihood and
                # timing statistics are computed from the other histories
                # only, then restore it at the same position.
                edge.histories.pop(refindex)
                myedgesim.refpreval = edge.prevals.pop(refindex)
                myedgesim.reforder = edge.orders.pop(refindex)
                edge.likelihood = histseg.compute_likelihood_histories(
                    edge.histories, myhistScores, totalp)
                edge.compute_timing_wmeansd(myhistScores)
                edge.histories.insert(refindex, refhistoryid)
                edge.prevals.insert(refindex, myedgesim.refpreval)
                edge.orders.insert(refindex, myedgesim.reforder)
                # Same trick for the costs: average without the reference.
                upperc = edge.uppercosts.pop(refindex)
                lowerc = edge.lowercosts.pop(refindex)
                myedgesim.avecost = _mean_edge_cost(edge)
                edge.uppercosts.insert(refindex, upperc)
                edge.lowercosts.insert(refindex, lowerc)
            else:
                # The edge occurs only in the reference history.
                FN[0] += 1
                FN[etype] += 1
                FNedges.append(myedgesim)
                myedgesim.isTrue = -1
                edge.likelihood = 1
                myedgesim.avecost = _mean_edge_cost(edge)
                myedgesim.refpreval = edge.prevals[refindex]
                myedgesim.reforder = edge.orders[refindex]
        else:
            FP[0] += 1
            FP[etype] += 1
            edge.likelihood = histseg.compute_likelihood_histories(
                edge.histories, myhistScores, totalp)
            if edge.likelihood > 1:
                sys.stderr.write("bad lscore: %s\t%s\t%d\n" % (str(edge.likelihood), str(totalp), len(edge.costs)))
            myedgesim.isTrue = 0
            myedgesim.avecost = _mean_edge_cost(edge)
        myEdgeSimData.append(myedgesim)

    if FNedges:
        # This also modifies the isTrue value of the edges in FNedges.
        TN = checkForCancellingEdges(FNedges)
        for i, cancelled in enumerate(TN):
            FN[i] -= cancelled

    if datout_fh:
        header = "event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
        datout_fh.write(header)
        for edgesim in myEdgeSimData:
            edge = edgesim.edge
            prevals = ",".join(map(str, [edgesim.refpreval, edge.prevalmean, edge.prevalsd]))
            orders = ",".join(map(str, [edgesim.reforder, edge.ordermean, edge.ordersd]))
            etype = edge.determineEventType()
            length = edge.get_Event_length()
            fields = [edge.id, types[etype], edgesim.avecost, edge.likelihood,
                      edge.cnval, edgesim.isTrue, length, prevals, orders,
                      len(edge.histories)]
            datout_fh.write("\t".join(map(str, fields)) + "\n")

    if stats_fh:
        stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
        stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\n" % ("\t".join(map(str, TP)), "\t".join(map(str, FP)), "\t".join(map(str, FN)), "\t".join(map(str, TN))))
        # F1 is undefined when nothing was classified; report 0.0 instead of
        # raising ZeroDivisionError (e.g. for an empty edge list).
        denominator = 2 * TP[0] + FN[0] + FP[0]
        if denominator:
            f1score = float(2 * TP[0]) / float(denominator)
        else:
            f1score = 0.0
        stats_fh.write("F1Score:\t%s\n" % (str(f1score)))

    if breaks_fh:
        breakpoints = histseg.get_breakpoints(edges, refhistoryid)
        for loc in breakpoints:
            (n, t) = breakpoints[loc]
            breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
        breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))
def run(self):
    """Build the per-sample output files and schedule the follow-up jobs.

    Reads its configuration from ``self.options``.  In order, it:

    1. creates the output directory;
    2. (re)creates the ``.pevnts`` file when requested or missing, then
       loads ``historystats.txt`` into ``self.historyScores`` and computes
       ``self.totalp``;
    3. creates the ``.pedgs`` file (when requested or missing) and the
       ``.mrgpedgs`` file (when missing);
    4. schedules a links-file child target when requested and missing;
    5. writes ``breakpoints.txt`` when missing;
    6. schedules gene-rank or annotation child targets, the MCMC setup, and
       the simulation-analysis jobs, depending on the options.

    Side effects: sets ``histseg.Global_BINWIDTH``, may overwrite
    ``opts.pevnts`` / ``opts.pedges`` with file paths before the MCMC setup,
    and may populate ``self.events``.
    """
    def _load_pickle(path):
        # Close the handle deterministically instead of leaking it.
        # NOTE(review): these pickles are expected to be files written by
        # this pipeline; pickle must not be used on untrusted input.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    self.logToMaster("Setting up...")
    opts = self.options
    histseg.Global_BINWIDTH = opts.binwidth
    sampleid = opts.sampleid
    outputdir = opts.outputdir
    # Argument list instead of a shell string, so paths containing spaces
    # or shell metacharacters are handled correctly.
    subprocess.call(["mkdir", "-p", outputdir])

    historystatsfile = os.path.join(outputdir, "historystats.txt")
    pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
    if opts.pevnts or not os.path.exists(pevntsfile):
        logger.info("pevntsfile: %s" % pevntsfile)
        CreatePevntsFile(pevntsfile, historystatsfile, opts).run()

    self.historyScores = np.loadtxt(historystatsfile, dtype=int)
    logger.info("Global_BINWIDTH: %d" % histseg.Global_BINWIDTH)
    self.totalp = histseg.compute_likelihood_histories(self.historyScores[:, 0], self.historyScores)
    logger.info("totalp is %s" % str(self.totalp))

    pedgesfile = os.path.join(outputdir, sampleid + ".pedgs")
    if opts.pedges or not os.path.exists(pedgesfile):
        logger.info("pedgesfile: %s" % pedgesfile)
        CreatePedgesFile(_load_pickle(pevntsfile), pedgesfile, self.historyScores, self.totalp, ignore_cn=False).run()

    mrgpedgesfile = os.path.join(outputdir, sampleid + ".mrgpedgs")
    if not os.path.exists(mrgpedgesfile):
        logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
        CreatePedgesFile(_load_pickle(pevntsfile), mrgpedgesfile, self.historyScores, self.totalp, ignore_cn=True).run()

    linksfile = os.path.join(outputdir, sampleid + ".links")
    if opts.links and not os.path.exists(linksfile):
        logger.info("linksfile: %s" % linksfile)
        self.addChildTarget(CreateLinksFile(pevntsfile, linksfile, self.totalp))

    breaksfile = os.path.join(outputdir, "breakpoints.txt")
    if not os.path.exists(breaksfile):
        breaklocs = histseg.get_breakpoints(_load_pickle(pedgesfile), opts.trueID)
        breaklocs2 = histseg.get_breakpoints(_load_pickle(mrgpedgesfile), opts.trueID)
        # 'with' guarantees the report is flushed and closed before any
        # later step runs.
        with open(breaksfile, 'w') as breaksfh:
            for loc in sorted(breaklocs.keys()):
                (n, t) = breaklocs[loc]
                # Assumes every location from the unmerged edges is also
                # present for the merged edges -- TODO confirm.
                (n2, t2) = breaklocs2[loc]
                breaksfh.write("%s\t%d\t%d\t%d\t%d\n" % (loc, n, t, n2, t2))

    annotationfile = os.path.join(outputdir, sampleid + ".ann")
    generankfile = os.path.join(outputdir, sampleid + ".gnrank")
    if opts.generank and not os.path.exists(generankfile):
        logger.info("generankfile: %s" % generankfile)
        if not self.events:
            self.events = _load_pickle(pevntsfile)
        self.addChildTarget(CreateGeneRankFile(self.events, opts.tabixfile, self.totalp, annotationfile, generankfile))
        logger.info("after creating generankfile")
    elif opts.ann and not opts.generank:
        logger.info("annotationfile: %s" % annotationfile)
        if not self.events:
            self.events = _load_pickle(pevntsfile)
        self.addChildTarget(CreateAnnotationFile(self.events, opts.tabixfile, annotationfile))

    if opts.mcmcmix:
        mcmcdir = os.path.join(outputdir, "mcmcdata")
        mcmcdat = os.path.join(mcmcdir, "edge_counts.dat")
        if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
            subprocess.call(["mkdir", "-p", mcmcdir])
            # The MCMC setup receives the file paths through the options.
            opts.pevnts = pevntsfile
            opts.pedges = pedgesfile
            self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))

    if opts.simulation:
        # One analysis job per input file; skipped when its .stats output
        # already exists and is non-empty.
        sim_inputs = [
            ("events", pevntsfile),
            ("edges", pedgesfile),
            ("mrgedges", mrgpedgesfile),
        ]
        for label, inputfile in sim_inputs:
            simoutput = os.path.join(outputdir, label + ".stats")
            if (not os.path.exists(simoutput)) or (os.path.getsize(simoutput) == 0):
                self.addChildTarget(SimAnalysisJob(inputfile, opts.trueID, self.historyScores, label, outputdir, opts.binwidth))