def score_edges_within_pevents(allevents, historyScores, totalp, prev_error=0.05, ignore_cn=True):
    """Split events into per-segment edges, merge equivalent edges, and score them.

    Args:
        allevents: list of event objects; each is unpacked and deep-copied
            once per segment so every edge carries the parent event's data.
        historyScores: numpy array of per-history statistics (histseg format).
        totalp: total probability of the histories; normalizer passed to
            compute_likelihood_histories.
        prev_error: prevalence tolerance. Kept for interface compatibility;
            it is not used inside this function (the original assigned it to
            an unused local).
        ignore_cn: when True, strip the strand sign from each edge's segstr so
            edges differing only by sign merge; the CN value is negated for
            '-' signs to preserve the original direction.

    Returns:
        list of unique edge objects, each updated, likelihood-scored, and
        trimmed.
    """
    sys.stderr.write("number of events: %d\n" % (len(allevents)))
    alledges = []
    for event in allevents:
        event.unpack()
        for seg in event.segs:
            # One edge per segment, inheriting everything else from the event.
            edge = copy.deepcopy(event)
            edge.segs = [seg]
            edge.make_segstr()
            if ignore_cn:
                (mysegstr, sign) = histseg.remove_signs_from_segstr(edge.segstr)
                edge.segstr = "+/" + mysegstr
                if sign == "-":
                    edge.cnval = -1 * event.cnval
            alledges.append(edge)
    # Sort so that equivalent edges are adjacent for the unique_* mergers.
    sortededges = sorted(alledges, key=lambda x: (x.segstr, x.cnval))
    if ignore_cn:
        unique_edges = unique_loc_edges(sortededges)
    else:
        unique_edges = unique_c_edges(sortededges)
    sys.stderr.write("totalp: %s\n" % (str(totalp)))
    for edge in unique_edges:
        edge.update(historyScores)
        edge.likelihood = histseg.compute_likelihood_histories(edge.histories, historyScores, totalp)
        edge.trim()
    return unique_edges
def score_and_link_cycles(args):
    """Load or build history statistics, collect events, and optionally
    pickle them, write them as text, and link them by within-history order.

    Args:
        args: argparse Namespace with (at least) binwidth, historystats,
            cnavg, totalp, inpickle, outpickle, events, links attributes.
    """
    if args.binwidth:
        histseg.Global_BINWIDTH = args.binwidth
    # Obtain history statistics: build from the cnavg dir (optionally caching
    # to --historystats), or load a previously cached file.
    historyScores = None
    if not args.historystats and args.cnavg:
        historyScores = histseg.combine_history_statsfiles(args.cnavg)
    elif not os.path.isfile(args.historystats) and args.cnavg:
        historyScores = histseg.combine_history_statsfiles(args.cnavg)
        np.savetxt(args.historystats, historyScores, fmt="%d", delimiter='\t')
    elif os.path.isfile(args.historystats):
        historyScores = np.loadtxt(args.historystats, dtype=int)
    # BUGFIX: the original tested `if not historyScores:`, which raises
    # ValueError on a multi-element numpy array ("truth value is ambiguous")
    # and NameError when no branch assigned it. Test explicitly instead.
    if historyScores is None or historyScores.size == 0:
        sys.exit("Need to use --historystats or --cnavg option")
    totalp = 0
    if args.totalp:
        totalp = args.totalp
    else:
        totalp = histseg.compute_likelihood_histories(historyScores[:, 0], historyScores)
    allevents = []
    if args.cnavg:
        sys.stderr.write("using cnavg dir: %s\n" % (args.cnavg))
        allevents = histseg.get_events_from_cnavgdir(args.cnavg, historyScores, totalp)
    elif args.inpickle and os.path.isfile(args.inpickle):
        sys.stderr.write("using pickle file\n")
        # NOTE: unpickling is only safe on trusted project-generated files.
        with open(args.inpickle, 'rb') as infh:
            allevents = pickle.load(infh)
    sys.stderr.write("there are %d events\n" % (len(allevents)))
    if args.outpickle:
        for event in allevents:
            event.trim()
        with open(args.outpickle, 'wb') as eventfile:
            pickle.dump(allevents, eventfile, pickle.HIGHEST_PROTOCOL)
    if args.events:
        with open(args.events, 'w') as eventfile:
            for evnt in allevents:
                eventfile.write("%s" % (str(evnt)))
    # link the events...
    if args.links:
        if not allevents:
            sys.exit("Need events to link! use --inpickle or --cnavg or --events")
        if not totalp:
            sys.exit("Need a --totalp or --cnavg or --historystats options")
        eventlinks = link_events_by_order_within_histories(allevents)
        with open(args.links, 'w') as linkfile:
            for link in eventlinks:
                # NOTE(review): totalp is deliberately not passed here,
                # matching the original call -- confirm against
                # compute_likelihood_histories' signature.
                link.likelihood = histseg.compute_likelihood_histories(link.histories, historyScores)
                linkfile.write("%s" % (str(link)))
def run(self):
    """Build history-stat files for the cnavg output dir, save the combined
    statistics table, extract events, and pickle them to self.pevntsfile.

    For simulations, first derive HISTORIES_0 / HISTORY_STATS_0 from the
    true.braney file so the true history participates in the statistics.
    """
    self.logToMaster("CreatePevntsFile\n")
    options = self.options
    cnavg_dir = options.cnavgout
    if options.simulation:
        true_braney = os.path.join(cnavg_dir, "true.braney")
        zipped_history = os.path.join(cnavg_dir, "HISTORIES_0.braney")
        # Drop blank lines from the true history and store it gzipped as
        # history 0 alongside the sampled histories.
        subprocess.call("grep -v ^$ %s | gzip > %s" % (true_braney, zipped_history), shell=True)
        make_STATS_from_truebraney(true_braney, os.path.join(cnavg_dir, "HISTORY_STATS_0"))
    historyScores = histseg.combine_history_statsfiles(cnavg_dir)
    np.savetxt(self.historystatsfile, historyScores, fmt='%d', delimiter='\t')
    totalp = histseg.compute_likelihood_histories(historyScores[:, 0], historyScores)
    events = histseg.get_events_from_cnavgdir(cnavg_dir, historyScores, totalp)
    pickle.dump(events, open(self.pevntsfile, 'wb'), pickle.HIGHEST_PROTOCOL)
def analyze_simulation(edges, refhistoryid, historyScores, datout_fh, stats_fh, breaks_fh):
    """Compare scored edges against the reference (true) history and tally
    TP/FP/TN/FN counts, writing per-edge data, summary stats, and breakpoints.

    Args:
        edges: list of edge objects carrying histories/prevals/orders/costs.
        refhistoryid: id of the true (simulated) history.
        historyScores: numpy array of per-history statistics.
        datout_fh, stats_fh, breaks_fh: optional open file handles; each
            output section is written only if its handle is truthy.

    Index 0 of each TP/FP/TN/FN list is the overall count; indices 1-3 are
    per event type (histseg.Global_EVENTTYPES -- presumably Amp/Del/Adj per
    the stats header below).
    """
    #make the cost of the refhistoryid 0 so that is doesn't get included in the likelihood calculation
    myhistScores = np.copy(historyScores)
    myhistScores[np.where(historyScores[:,0] == refhistoryid),:] = 0
    totalp = histseg.compute_likelihood_histories(myhistScores[:,0], myhistScores)
    TP = [0,0,0,0]
    FP = [0,0,0,0]
    TN = [0,0,0,0]
    FN = [0,0,0,0]
    FNedges = []
    types = histseg.Global_EVENTTYPES
    myEdgeSimData = []  # a list of tuples (edge, isTrue, refpreval, reforder)
    for edge in edges:
        # Materialize the history list if only ranges were stored.
        if not edge.histories:
            edge.histories = histseg.listout_ranges(edge.histRanges)
        myedgesim = EdgeSimulationData(edge)
        type = myedgesim.type  # NOTE: shadows the builtin `type`
        if refhistoryid in edge.histories:
            refindex = edge.histories.index(refhistoryid)
            myedgesim.refindex = refindex
            if len(edge.histories) > 1:
                # True positive: edge appears in the ref history AND others.
                TP[0] += 1
                TP[type] += 1
                myedgesim.isTrue = 1
                # Temporarily remove the ref history's entries so likelihood
                # and timing are computed over the sampled histories only,
                # then re-insert them at the same index (order matters).
                edge.histories.pop(refindex)
                myedgesim.refpreval = edge.prevals.pop(refindex)
                myedgesim.reforder = edge.orders.pop(refindex)
                edge.likelihood = histseg.compute_likelihood_histories(edge.histories, myhistScores, totalp)
                edge.compute_timing_wmeansd(myhistScores)
                edge.histories.insert(refindex, refhistoryid)
                edge.prevals.insert(refindex, myedgesim.refpreval)
                edge.orders.insert(refindex, myedgesim.reforder)
                # Same pop/compute/insert dance for the cost arrays.
                upperc = edge.uppercosts.pop(refindex)
                lowerc = edge.lowercosts.pop(refindex)
                myedgesim.avecost = np.mean(np.array(edge.uppercosts + edge.lowercosts))
                edge.uppercosts.insert(refindex, upperc)
                edge.lowercosts.insert(refindex, lowerc)
            else:
                # False negative: edge exists only in the ref history.
                FN[0] += 1
                FN[type] += 1
                FNedges.append(myedgesim)
                myedgesim.isTrue = -1
                edge.likelihood = 1
                myedgesim.avecost = np.mean(np.array(edge.uppercosts + edge.lowercosts))
                myedgesim.refpreval = edge.prevals[refindex]
                myedgesim.reforder = edge.orders[refindex]
        else:
            # False positive: edge never occurs in the ref history.
            FP[0] += 1
            FP[type] += 1
            edge.likelihood = histseg.compute_likelihood_histories(edge.histories, myhistScores, totalp)
            if edge.likelihood > 1:
                # Likelihoods should be normalized by totalp; >1 indicates a problem.
                sys.stderr.write("bad lscore: %s\t%s\t%d\n" % (str(edge.likelihood), str(totalp), len(edge.costs)))
            myedgesim.isTrue = 0
            myedgesim.avecost = np.mean(np.array(edge.uppercosts + edge.lowercosts))
        myEdgeSimData.append(myedgesim)
    if len(FNedges) > 0:
        # Reclassify FN edges that cancel each other as true negatives.
        TN = checkForCancellingEdges(FNedges)  #this will also modify the isTrue value of FNedges
        for i in xrange(len(TN)):
            FN[i] = FN[i] - TN[i]
    if datout_fh:
        # One row per edge: id, type name, mean cost, likelihood, CN value,
        # truth flag, length, "ref,mean,sd" prevalence and order triples,
        # and the number of histories containing the edge.
        header = "event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
        datout_fh.write(header)
        for edgesim in myEdgeSimData:
            edge = edgesim.edge
            prevals = ",".join(map(str, [edgesim.refpreval, edge.prevalmean, edge.prevalsd]))
            orders = ",".join(map(str, [edgesim.reforder, edge.ordermean, edge.ordersd]))
            type = edge.determineEventType()
            length = edge.get_Event_length()
            mystr = "\t".join(map(str, [edge.id, types[type], edgesim.avecost, edge.likelihood, edge.cnval, edgesim.isTrue, length, prevals, orders, len(edge.histories)])) + "\n"
            datout_fh.write(mystr)
    if stats_fh:
        stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
        stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\n" % ("\t".join(map(str, TP)), "\t".join(map(str, FP)), "\t".join(map(str, FN)), "\t".join(map(str, TN)) ))
        # Overall F1 from the totals (index 0); raises ZeroDivisionError if
        # TP, FN and FP are all zero.
        f1score = float(2*TP[0])/float(2*TP[0]+FN[0]+FP[0])
        stats_fh.write("F1Score:\t%s\n" % (str(f1score)))
    if breaks_fh:
        # Per-location breakpoint counts: (n, t) -- presumably total vs true
        # counts; confirm against histseg.get_breakpoints.
        breakpoints = histseg.get_breakpoints(edges, refhistoryid)
        for loc in breakpoints.keys():
            (n, t) = breakpoints[loc]
            breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
        breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))
def run(self):
    """Top-level jobTree setup: build every missing output for a sample
    (pevnts, pedges, merged pedges, links, breakpoints, annotations/gene
    ranks, MCMC data, simulation stats), regenerating a file only when it
    is absent or its regeneration flag is set in self.options.
    """
    self.logToMaster("Setting up...")
    opts = self.options
    histseg.Global_BINWIDTH = opts.binwidth
    sampleid = opts.sampleid
    outputdir = opts.outputdir
    subprocess.call("mkdir -p %s" % outputdir, shell=True)
    historystatsfile = os.path.join(outputdir, "historystats.txt")
    pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
    # Run synchronously (.run(), not addChildTarget) because the steps below
    # need the pevnts and historystats files to exist.
    if opts.pevnts or not os.path.exists(pevntsfile):
        logger.info("pevntsfile: %s" % pevntsfile)
        CreatePevntsFile(pevntsfile, historystatsfile, opts).run()
    self.historyScores = np.loadtxt(historystatsfile, dtype=int)
    logger.info("Global_BINWIDTH: %d" % histseg.Global_BINWIDTH)
    self.totalp = histseg.compute_likelihood_histories(self.historyScores[:,0], self.historyScores)
    logger.info("totalp is %s" % str(self.totalp))
    pedgesfile = os.path.join(outputdir, sampleid + ".pedgs")
    if opts.pedges or not os.path.exists(pedgesfile):
        logger.info("pedgesfile: %s" % pedgesfile)
        # ignore_cn=False: keep CN-distinct edges separate.
        CreatePedgesFile(pickle.load(open(pevntsfile, 'rb')), pedgesfile, self.historyScores, self.totalp, ignore_cn=False).run()
    mrgpedgesfile = os.path.join(outputdir, sampleid + ".mrgpedgs")
    if not os.path.exists(mrgpedgesfile):
        logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
        # ignore_cn=True: merge edges regardless of CN value.
        CreatePedgesFile(pickle.load(open(pevntsfile, 'rb')), mrgpedgesfile, self.historyScores, self.totalp, ignore_cn=True).run()
    linksfile = os.path.join(outputdir, sampleid + ".links")
    if opts.links and not os.path.exists(linksfile):
        logger.info("linksfile: %s" % linksfile)
        self.addChildTarget(CreateLinksFile(pevntsfile, linksfile, self.totalp))
    breaksfile = os.path.join(outputdir, "breakpoints.txt")
    if not os.path.exists(breaksfile):
        # Breakpoints from both the plain and CN-merged edge sets, written
        # side by side per location.
        breaklocs = histseg.get_breakpoints(pickle.load(open(pedgesfile, 'rb')), opts.trueID)
        breaklocs2 = histseg.get_breakpoints(pickle.load(open(mrgpedgesfile, 'rb')), opts.trueID)
        breaksfh = open(breaksfile, 'w')
        for loc in sorted(breaklocs.keys()):
            (n, t) = breaklocs[loc]
            # NOTE(review): assumes every loc in breaklocs also appears in
            # breaklocs2 -- a KeyError otherwise; confirm get_breakpoints.
            (n2, t2) = breaklocs2[loc]
            breaksfh.write("%s\t%d\t%d\t%d\t%d\n" % (loc, n, t, n2, t2))
    annotationfile = os.path.join(outputdir, sampleid + ".ann")
    generankfile = os.path.join(outputdir, sampleid + ".gnrank")
    if opts.generank and not os.path.exists(generankfile):
        logger.info("generankfile: %s" % generankfile)
        # NOTE(review): assumes self.events was initialized (e.g. to None in
        # __init__, outside this view); AttributeError otherwise -- confirm.
        if not self.events:
            self.events = pickle.load(open(pevntsfile, 'rb'))
        self.addChildTarget(CreateGeneRankFile(self.events, opts.tabixfile, self.totalp, annotationfile, generankfile))
        logger.info("after creating generankfile")
    elif opts.ann and not opts.generank:
        logger.info("annotationfile: %s" % annotationfile)
        if not self.events:
            self.events = pickle.load(open(pevntsfile, 'rb'))
        self.addChildTarget(CreateAnnotationFile(self.events, opts.tabixfile, annotationfile))
    if opts.mcmcmix:
        mcmcdir = os.path.join(outputdir, "mcmcdata")
        mcmcdat = os.path.join(mcmcdir, "edge_counts.dat")
        if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
            subprocess.call("mkdir -p %s" % mcmcdir, shell=True)
            # Pass file locations to the MCMC setup via the shared options.
            opts.pevnts = pevntsfile
            opts.pedges = pedgesfile
            self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))
    if opts.simulation:
        # Run the simulation analysis for events, edges, and merged edges
        # whenever the corresponding .stats file is missing or empty.
        simoutput = os.path.join(outputdir, "events.stats")
        if ((not os.path.exists(simoutput)) or (os.path.getsize(simoutput) == 0)):
            self.addChildTarget(SimAnalysisJob(pevntsfile, opts.trueID, self.historyScores, "events", outputdir, opts.binwidth))
        simoutput2 = os.path.join(outputdir, "edges.stats")
        if ((not os.path.exists(simoutput2)) or (os.path.getsize(simoutput2) == 0)):
            self.addChildTarget(SimAnalysisJob(pedgesfile, opts.trueID, self.historyScores, "edges", outputdir, opts.binwidth))
        simoutput3 = os.path.join(outputdir, "mrgedges.stats")
        if ((not os.path.exists(simoutput3)) or (os.path.getsize(simoutput3) == 0)):
            self.addChildTarget(SimAnalysisJob(mrgpedgesfile, opts.trueID, self.historyScores, "mrgedges", outputdir, opts.binwidth))
    # Tail of an option-registration helper whose `def` line is above this
    # chunk -- presumably add_score_edges_options(parser), given the call in
    # the __main__ block below; confirm against the full file.
    parser.add_argument("--edges", help="write edges to this file.")
    parser.add_argument(
        "--prevalence_error", help="the difference in prevalences to be considered the same.", type=float, default=0.05
    )
    parser.add_argument(
        "--ignore_cn", help="merge together edges with different CN values.", default=False, action="store_true"
    )
    parser.add_argument("--totalp", help="total probability of the histories", type=float)


if __name__ == "__main__":
    # Stand-alone entry point: load events, score their edges, and write the
    # results as text (--edges) and/or a pickle (--outpickle).
    parser = argparse.ArgumentParser(
        description="Given an .pevnts file, it will split events into edges, combine equivalent edges (segments or adjacencies), and score them by likelihood."
    )
    add_score_edges_options(parser)
    args = parser.parse_args()
    allevents = pickle.load(open(args.pevnts, "rb"))
    historyScores = np.loadtxt(args.historystats, dtype=int)
    totalp = 0
    # Use the supplied total probability, or derive it from the stats table.
    if args.totalp:
        totalp = args.totalp
    else:
        totalp = histseg.compute_likelihood_histories(historyScores[:, 0], historyScores)
    alledges = score_edges_within_pevents(allevents, historyScores, totalp, args.prevalence_error, args.ignore_cn)
    if args.edges:
        outfile = open(args.edges, "w")
        for edge in alledges:
            outfile.write(str(edge))
    if args.outpickle:
        pickle.dump(alledges, open(args.outpickle, "wb"), pickle.HIGHEST_PROTOCOL)