def analyze_simulation(events, refhistoryid, histScores, datout_fh, stats_fh, breaks_fh, outdir):
    """Score predicted events against a reference history and write reports.

    Every event is wrapped in an ``EdgeSimulationData`` and tallied according
    to its ``isTrue`` attribute: 1 is counted as a true positive, -1 as a
    false negative and 0 as a false positive.  Each tally array has one slot
    per entry of ``histseg.Global_EVENTTYPES``; slot 0 accumulates the total
    and slot ``event.determineEventType()`` the per-type count.

    Args:
        events: iterable of event objects; each must provide
            ``determineEventType()`` and be accepted by ``EdgeSimulationData``.
        refhistoryid: id of the reference history.  Rows of ``histScores``
            whose first column equals this id are zeroed.
        histScores: 2-D numpy array of history scores, first column being the
            history id.  NOTE: modified in place.
        datout_fh: writable file object for the per-event table, or a falsy
            value to skip it.
        stats_fh: writable file object for the TP/FP/FN/TN/EX summary and the
            F1 score, or a falsy value to skip it.
        breaks_fh: writable file object for the breakpoint report, or a falsy
            value to skip it.
        outdir: not used by the active code (only by a disabled call).
    """
    # Disabled alternatives kept for reference:
    # do_order_correction.main(events, 0, histScores, statsout=os.path.join(outdir, "historystats.txt"))
    # do_order_correction.main(events, 0, historyScores, usepreval=True)
    do_order_correction.main(events, 0, histScores)

    # Make the cost of the refhistoryid 0 so that it doesn't get included in
    # the likelihood calculation.
    histScores[np.where(histScores[:, 0] == refhistoryid), :] = 0
    totalp = histseg.compute_likelihood_histories(histScores[:, 0], histScores)
    sys.stderr.write("totalp is %f\n" % totalp)

    types = histseg.Global_EVENTTYPES
    TP = np.zeros(len(types), dtype=int)
    FP = np.zeros(len(types), dtype=int)
    TN = np.zeros(len(types), dtype=int)
    FN = np.zeros(len(types), dtype=int)
    explained = np.zeros(len(types), dtype=int)
    FNesims = []
    FPesims = []
    myEdgeSimData = []

    for event in events:
        myedgesim = EdgeSimulationData(event, histScores, totalp, refhistoryid)
        etype = event.determineEventType()
        if myedgesim.isTrue == 1:
            TP[0] += 1
            TP[etype] += 1
        elif myedgesim.isTrue == -1:
            FN[0] += 1
            FN[etype] += 1
            FNesims.append(myedgesim)
        elif myedgesim.isTrue == 0:
            FP[0] += 1
            FP[etype] += 1
            FPesims.append(myedgesim)
        myEdgeSimData.append(myedgesim)

    check_for_linear_decomp(FNesims, FPesims, explained)

    # Element-wise FN[i] = FN[i] - TN[i].  TN is never incremented in this
    # function, so this currently leaves FN unchanged.
    FN -= TN

    if datout_fh:
        header = "event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
        datout_fh.write(header)
        for edgesim in myEdgeSimData:
            datout_fh.write(str(edgesim))

    if stats_fh:
        stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
        stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\nEX\t%s\n" % (
            "\t".join(map(str, TP)),
            "\t".join(map(str, FP)),
            "\t".join(map(str, FN)),
            "\t".join(map(str, TN)),
            "\t".join(map(str, explained)),
        ))
        # False negatives that were "explained" are credited as true positives.
        tp = TP[0] + explained[0]
        fn = FN[0] - explained[0]
        denominator = 2 * tp + fn + FP[0]
        if denominator == 0:
            # No TP/FP/FN at all (e.g. no events): F1 is undefined, so report
            # nan instead of raising ZeroDivisionError.
            f1score = float("nan")
        else:
            f1score = float(2 * tp) / float(denominator)
        stats_fh.write("F1Score:\t%s\n" % (str(f1score)))

    if breaks_fh:
        # The original passed an undefined name `edges` here; the events
        # handed to this function are what is analysed.
        breakpoints = histseg.get_breakpoints(events, refhistoryid)
        for loc in breakpoints.keys():
            (n, t) = breakpoints[loc]
            breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
        breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))
def run(self):
    """Create any missing per-sample output files, then queue follow-up jobs.

    Working inside ``self.options.outputdir`` the steps are, in order:

    1. historystats.txt  - built via ``CombineHistoryStatsfiles`` if absent,
       then loaded into ``self.historyScores``; ``self.totalp`` is derived
       from it.
    2. <sampleid>.pevnts - must already exist, otherwise the process exits.
    3. <sampleid>.pedgs  - rebuilt if ``opts.pedges`` is set or it is absent.
    4. seghists.txt      - rebuilt if ``opts.sgh`` is set or it is absent.
    5. seghists.labeled / geneords.txt - only when ``opts.bedfile`` is given.
    6. <sampleid>.pmevnts and <sampleid>.pmedgs - built if absent.
    7. breakpoints.txt   - built if absent.
    8. Child targets for MCMC setup (``opts.mcmcmix``) and for simulation
       analysis (``opts.simulation``).

    NOTE(review): the nesting of the conditional sections was reconstructed
    from flattened source; confirm against the upstream file.
    """
    def _load_pickle(path):
        # Close the handle as soon as the object is loaded instead of
        # leaving it open until garbage collection.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    self.logToMaster("Setting up...")
    opts = self.options
    histseg.Global_BINWIDTH = opts.binwidth
    sampleid = opts.sampleid
    outputdir = opts.outputdir

    historystatsfile = os.path.join(outputdir, "historystats.txt")
    if not os.path.exists(historystatsfile):
        self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
        logger.info("historystatsfile: %s" % historystatsfile)
        pevntsjobtree.CombineHistoryStatsfiles(opts, historystatsfile).run()
    self.historyScores = np.loadtxt(historystatsfile, dtype=int)
    self.totalp = histseg.compute_likelihood_histories(
        self.historyScores[:, 0], self.historyScores)

    # Check that the *.pevnts file exists.
    pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
    if not os.path.exists(pevntsfile):
        sys.exit("The required %s file does not exist." % pevntsfile)

    pedgesfile = os.path.join(outputdir, sampleid + ".pedgs")
    if opts.pedges or not os.path.exists(pedgesfile):
        self.logToMaster("Creating pedgesfile...%s" % pedgesfile)
        logger.info("pedgesfile: %s" % pedgesfile)
        CreatePedgesFile(_load_pickle(pevntsfile), pedgesfile,
                         self.historyScores, self.totalp, False).run()

    seghistfile = os.path.join(outputdir, "seghists.txt")
    if opts.sgh or not os.path.exists(seghistfile):
        self.logToMaster("Creating seghists file ... %s" % seghistfile)
        make_seghists_from_edges.main(_load_pickle(pedgesfile),
                                      self.historyScores, seghistfile)

    # Label the seghists if an annotation file is given.
    if opts.bedfile:
        labeledfn = os.path.join(outputdir, "seghists.labeled")
        if not os.path.exists(labeledfn):
            pick_and_label_best_seghists.main(seghistfile, opts.bedfile, True, labeledfn)
        geneordfn = os.path.join(outputdir, "geneords.txt")
        if opts.geneords or not os.path.exists(geneordfn):
            seghists_to_gene_orders.main(seghistfile, opts.bedfile, geneordfn)

    mrgpeventsfile = os.path.join(outputdir, sampleid + ".pmevnts")
    if not os.path.exists(mrgpeventsfile):
        self.logToMaster("Creating mpevnts...%s" % mrgpeventsfile)
        logger.info("mrgpeventsfile: %s" % mrgpeventsfile)
        CreateMergedEventsFile(_load_pickle(pevntsfile), mrgpeventsfile,
                               self.historyScores).run()

    mrgpedgesfile = os.path.join(outputdir, sampleid + ".pmedgs")
    if not os.path.exists(mrgpedgesfile):
        self.logToMaster("Creating mrgpegesfile...%s" % mrgpedgesfile)
        logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
        CreateMergedEventsFile(_load_pickle(pedgesfile), mrgpedgesfile,
                               self.historyScores).run()

    breaksfile = os.path.join(outputdir, "breakpoints.txt")
    if not os.path.exists(breaksfile):
        self.logToMaster("Creating breaksfile...%s" % breaksfile)
        breaklocs = histseg.get_breakpoints(_load_pickle(pedgesfile), opts.trueID)
        breaklocs2 = histseg.get_breakpoints(_load_pickle(mrgpedgesfile), opts.trueID)
        # The context manager guarantees the report is flushed and closed;
        # the handle used to be left open.
        with open(breaksfile, 'w') as breaksfh:
            for loc in sorted(breaklocs.keys()):
                (n, t) = breaklocs[loc]
                # NOTE(review): assumes every location of the unmerged edges
                # is also present for the merged edges - confirm.
                (n2, t2) = breaklocs2[loc]
                breaksfh.write("%s\t%d\t%d\t%d\t%d\n" % (loc, n, t, n2, t2))

    # Creating links is no longer an option.
    # Annotating Events is no longer an option. Seghists are annotated instead.
    # generank isn't an option - geneords is done instead using seghists.

    if opts.mcmcmix:
        self.logToMaster("Setting up MCMC analysis")
        mcmcdir = os.path.join(outputdir, "mcmcdata")
        mcmcdat = os.path.join(mcmcdir, "edge_counts.dat")
        if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
            # Create the directory without a shell so that unusual characters
            # in the path cannot be misinterpreted; an already existing
            # directory is fine, as it was with `mkdir -p`.
            try:
                os.makedirs(mcmcdir)
            except OSError:
                if not os.path.isdir(mcmcdir):
                    raise
            # From here on opts.pevnts / opts.pedges carry file paths for the
            # MCMC setup job.
            opts.pevnts = pevntsfile
            opts.pedges = pedgesfile
            self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))

    if opts.simulation:
        self.logToMaster("Setting up Simulation analysis")
        # A stats file that is missing or empty triggers (re)analysis.
        simoutput = os.path.join(outputdir, "events.stats")
        if (not os.path.exists(simoutput)) or (os.path.getsize(simoutput) == 0):
            self.addChildTarget(
                SimAnalysisJob(pevntsfile, opts.trueID, self.historyScores,
                               "events", outputdir, opts.binwidth))
        simoutput2 = os.path.join(outputdir, "edges.stats")
        if (not os.path.exists(simoutput2)) or (os.path.getsize(simoutput2) == 0):
            self.addChildTarget(
                SimAnalysisJob(pedgesfile, opts.trueID, self.historyScores,
                               "edges", outputdir, opts.binwidth))
        simoutput3 = os.path.join(outputdir, "mrgedges.stats")
        if (not os.path.exists(simoutput3)) or (os.path.getsize(simoutput3) == 0):
            self.addChildTarget(
                SimAnalysisJob(mrgpedgesfile, opts.trueID, self.historyScores,
                               "mrgedges", outputdir, opts.binwidth))
def analyze_simulation(events, refhistoryid, histScores, datout_fh, stats_fh, breaks_fh, outdir):
    """Score predicted events against a reference history and write reports.

    Every event is wrapped in an ``EdgeSimulationData`` and tallied according
    to its ``isTrue`` attribute: 1 is counted as a true positive, -1 as a
    false negative and 0 as a false positive.  Each tally array has one slot
    per entry of ``histseg.Global_EVENTTYPES``; slot 0 accumulates the total
    and slot ``event.determineEventType()`` the per-type count.

    Args:
        events: iterable of event objects; each must provide
            ``determineEventType()`` and be accepted by ``EdgeSimulationData``.
        refhistoryid: id of the reference history.  Rows of ``histScores``
            whose first column equals this id are zeroed.
        histScores: 2-D numpy array of history scores, first column being the
            history id.  NOTE: modified in place.
        datout_fh: writable file object for the per-event table, or a falsy
            value to skip it.
        stats_fh: writable file object for the TP/FP/FN/TN/EX summary and the
            F1 score, or a falsy value to skip it.
        breaks_fh: writable file object for the breakpoint report, or a falsy
            value to skip it.
        outdir: not used by the active code (only by a disabled call).
    """
    # Disabled alternatives kept for reference:
    # do_order_correction.main(events, 0, histScores, statsout=os.path.join(outdir, "historystats.txt"))
    # do_order_correction.main(events, 0, historyScores, usepreval=True)
    do_order_correction.main(events, 0, histScores)

    # Make the cost of the refhistoryid 0 so that it doesn't get included in
    # the likelihood calculation.
    histScores[np.where(histScores[:, 0] == refhistoryid), :] = 0
    totalp = histseg.compute_likelihood_histories(histScores[:, 0], histScores)
    sys.stderr.write("totalp is %f\n" % totalp)

    types = histseg.Global_EVENTTYPES
    TP = np.zeros(len(types), dtype=int)
    FP = np.zeros(len(types), dtype=int)
    TN = np.zeros(len(types), dtype=int)
    FN = np.zeros(len(types), dtype=int)
    explained = np.zeros(len(types), dtype=int)
    FNesims = []
    FPesims = []
    myEdgeSimData = []

    for event in events:
        myedgesim = EdgeSimulationData(event, histScores, totalp, refhistoryid)
        etype = event.determineEventType()
        if myedgesim.isTrue == 1:
            TP[0] += 1
            TP[etype] += 1
        elif myedgesim.isTrue == -1:
            FN[0] += 1
            FN[etype] += 1
            FNesims.append(myedgesim)
        elif myedgesim.isTrue == 0:
            FP[0] += 1
            FP[etype] += 1
            FPesims.append(myedgesim)
        myEdgeSimData.append(myedgesim)

    check_for_linear_decomp(FNesims, FPesims, explained)

    # Element-wise FN[i] = FN[i] - TN[i].  TN is never incremented in this
    # function, so this currently leaves FN unchanged.
    FN -= TN

    if datout_fh:
        header = "event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
        datout_fh.write(header)
        for edgesim in myEdgeSimData:
            datout_fh.write(str(edgesim))

    if stats_fh:
        stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
        stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\nEX\t%s\n" % (
            "\t".join(map(str, TP)),
            "\t".join(map(str, FP)),
            "\t".join(map(str, FN)),
            "\t".join(map(str, TN)),
            "\t".join(map(str, explained)),
        ))
        # False negatives that were "explained" are credited as true positives.
        tp = TP[0] + explained[0]
        fn = FN[0] - explained[0]
        denominator = 2 * tp + fn + FP[0]
        if denominator == 0:
            # No TP/FP/FN at all (e.g. no events): F1 is undefined, so report
            # nan instead of raising ZeroDivisionError.
            f1score = float("nan")
        else:
            f1score = float(2 * tp) / float(denominator)
        stats_fh.write("F1Score:\t%s\n" % (str(f1score)))

    if breaks_fh:
        # The original passed an undefined name `edges` here; the events
        # handed to this function are what is analysed.
        breakpoints = histseg.get_breakpoints(events, refhistoryid)
        for loc in breakpoints.keys():
            (n, t) = breakpoints[loc]
            breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
        breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))
def run(self):
    """Create any missing per-sample output files, then queue follow-up jobs.

    Working inside ``self.options.outputdir`` the steps are, in order:

    1. historystats.txt  - built via ``CombineHistoryStatsfiles`` if absent,
       then loaded into ``self.historyScores``; ``self.totalp`` is derived
       from it.
    2. <sampleid>.pevnts - must already exist, otherwise the process exits.
    3. <sampleid>.pedgs  - rebuilt if ``opts.pedges`` is set or it is absent.
    4. seghists.txt      - rebuilt if ``opts.sgh`` is set or it is absent.
    5. seghists.labeled / geneords.txt - only when ``opts.bedfile`` is given.
    6. <sampleid>.pmevnts and <sampleid>.pmedgs - built if absent.
    7. breakpoints.txt   - built if absent.
    8. Child targets for MCMC setup (``opts.mcmcmix``) and for simulation
       analysis (``opts.simulation``).

    NOTE(review): the nesting of the conditional sections was reconstructed
    from flattened source; confirm against the upstream file.
    """
    def _load_pickle(path):
        # Close the handle as soon as the object is loaded instead of
        # leaving it open until garbage collection.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    self.logToMaster("Setting up...")
    opts = self.options
    histseg.Global_BINWIDTH = opts.binwidth
    sampleid = opts.sampleid
    outputdir = opts.outputdir

    historystatsfile = os.path.join(outputdir, "historystats.txt")
    if not os.path.exists(historystatsfile):
        self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
        logger.info("historystatsfile: %s" % historystatsfile)
        pevntsjobtree.CombineHistoryStatsfiles(opts, historystatsfile).run()
    self.historyScores = np.loadtxt(historystatsfile, dtype=int)
    self.totalp = histseg.compute_likelihood_histories(
        self.historyScores[:, 0], self.historyScores)

    # Check that the *.pevnts file exists.
    pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
    if not os.path.exists(pevntsfile):
        sys.exit("The required %s file does not exist." % pevntsfile)

    pedgesfile = os.path.join(outputdir, sampleid + ".pedgs")
    if opts.pedges or not os.path.exists(pedgesfile):
        self.logToMaster("Creating pedgesfile...%s" % pedgesfile)
        logger.info("pedgesfile: %s" % pedgesfile)
        CreatePedgesFile(_load_pickle(pevntsfile), pedgesfile,
                         self.historyScores, self.totalp, False).run()

    seghistfile = os.path.join(outputdir, "seghists.txt")
    if opts.sgh or not os.path.exists(seghistfile):
        self.logToMaster("Creating seghists file ... %s" % seghistfile)
        make_seghists_from_edges.main(_load_pickle(pedgesfile),
                                      self.historyScores, seghistfile)

    # Label the seghists if an annotation file is given.
    if opts.bedfile:
        labeledfn = os.path.join(outputdir, "seghists.labeled")
        if not os.path.exists(labeledfn):
            pick_and_label_best_seghists.main(seghistfile, opts.bedfile, True, labeledfn)
        geneordfn = os.path.join(outputdir, "geneords.txt")
        if opts.geneords or not os.path.exists(geneordfn):
            seghists_to_gene_orders.main(seghistfile, opts.bedfile, geneordfn)

    mrgpeventsfile = os.path.join(outputdir, sampleid + ".pmevnts")
    if not os.path.exists(mrgpeventsfile):
        self.logToMaster("Creating mpevnts...%s" % mrgpeventsfile)
        logger.info("mrgpeventsfile: %s" % mrgpeventsfile)
        CreateMergedEventsFile(_load_pickle(pevntsfile), mrgpeventsfile,
                               self.historyScores).run()

    mrgpedgesfile = os.path.join(outputdir, sampleid + ".pmedgs")
    if not os.path.exists(mrgpedgesfile):
        self.logToMaster("Creating mrgpegesfile...%s" % mrgpedgesfile)
        logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
        CreateMergedEventsFile(_load_pickle(pedgesfile), mrgpedgesfile,
                               self.historyScores).run()

    breaksfile = os.path.join(outputdir, "breakpoints.txt")
    if not os.path.exists(breaksfile):
        self.logToMaster("Creating breaksfile...%s" % breaksfile)
        breaklocs = histseg.get_breakpoints(_load_pickle(pedgesfile), opts.trueID)
        breaklocs2 = histseg.get_breakpoints(_load_pickle(mrgpedgesfile), opts.trueID)
        # The context manager guarantees the report is flushed and closed;
        # the handle used to be left open.
        with open(breaksfile, 'w') as breaksfh:
            for loc in sorted(breaklocs.keys()):
                (n, t) = breaklocs[loc]
                # NOTE(review): assumes every location of the unmerged edges
                # is also present for the merged edges - confirm.
                (n2, t2) = breaklocs2[loc]
                breaksfh.write("%s\t%d\t%d\t%d\t%d\n" % (loc, n, t, n2, t2))

    # Creating links is no longer an option.
    # Annotating Events is no longer an option. Seghists are annotated instead.
    # generank isn't an option - geneords is done instead using seghists.

    if opts.mcmcmix:
        self.logToMaster("Setting up MCMC analysis")
        mcmcdir = os.path.join(outputdir, "mcmcdata")
        mcmcdat = os.path.join(mcmcdir, "edge_counts.dat")
        if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
            # Create the directory without a shell so that unusual characters
            # in the path cannot be misinterpreted; an already existing
            # directory is fine, as it was with `mkdir -p`.
            try:
                os.makedirs(mcmcdir)
            except OSError:
                if not os.path.isdir(mcmcdir):
                    raise
            # From here on opts.pevnts / opts.pedges carry file paths for the
            # MCMC setup job.
            opts.pevnts = pevntsfile
            opts.pedges = pedgesfile
            self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))

    if opts.simulation:
        self.logToMaster("Setting up Simulation analysis")
        # A stats file that is missing or empty triggers (re)analysis.
        simoutput = os.path.join(outputdir, "events.stats")
        if (not os.path.exists(simoutput)) or (os.path.getsize(simoutput) == 0):
            self.addChildTarget(
                SimAnalysisJob(pevntsfile, opts.trueID, self.historyScores,
                               "events", outputdir, opts.binwidth))
        simoutput2 = os.path.join(outputdir, "edges.stats")
        if (not os.path.exists(simoutput2)) or (os.path.getsize(simoutput2) == 0):
            self.addChildTarget(
                SimAnalysisJob(pedgesfile, opts.trueID, self.historyScores,
                               "edges", outputdir, opts.binwidth))
        simoutput3 = os.path.join(outputdir, "mrgedges.stats")
        if (not os.path.exists(simoutput3)) or (os.path.getsize(simoutput3) == 0):
            self.addChildTarget(
                SimAnalysisJob(mrgpedgesfile, opts.trueID, self.historyScores,
                               "mrgedges", outputdir, opts.binwidth))