def score_edges_within_pevents(allevents, historyScores, totalp, prev_error=0.05, ignore_cn=True):
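    """Split each event into single-segment edges, merge equivalent edges
    (by genomic location only when ignore_cn is True, otherwise by location and
    CN value), and score each unique edge's likelihood across histories.
    Note: prev_error is stored in prevalence_error but not used further here."""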
    prevalence_error = prev_error
    sys.stderr.write("number of events: %d\n" % (len(allevents)))
    alledges = []
    for event in allevents:
        event.unpack()
        # if event.segs == []:
        #     event.make_segs_from_str()
        for seg in event.segs:
            edge = copy.deepcopy(event)
            edge.segs = [seg]
            edge.make_segstr()
            if ignore_cn:
                (mysegstr, sign) = histseg.remove_signs_from_segstr(edge.segstr)
                edge.segstr = "+/" + mysegstr
                if sign == "-":
                    edge.cnval = -1 * event.cnval
            alledges.append(edge)
            # sys.stderr.write("number of edges is: %d\n" % (len(alledges)))
    sortededges = sorted(alledges, key=lambda x: (x.segstr, x.cnval))
    if ignore_cn:
        unique_edges = unique_loc_edges(sortededges)
    else:
        unique_edges = unique_c_edges(sortededges)
    sys.stderr.write("totalp: %s\n" % (str(totalp)))
    for edge in unique_edges:
        edge.update(historyScores)
        edge.likelihood = histseg.compute_likelihood_histories(edge.histories, historyScores, totalp)
        edge.trim()
    return unique_edges
def score_and_link_cycles(args):
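	"""Load or build per-history statistics, compute the total history likelihood,
	read events from a CN-AVG directory or a pevnts pickle, and optionally write the
	events (pickled and/or as text) and the event links with their likelihoods."""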
	if args.binwidth:
		histseg.Global_BINWIDTH=args.binwidth
	historyScores=None  # initialize so the check below works even when no branch runs
	if args.cnavg and not args.historystats:
		historyScores=histseg.combine_history_statsfiles(args.cnavg)
	elif args.cnavg and args.historystats and not os.path.isfile(args.historystats):
		historyScores=histseg.combine_history_statsfiles(args.cnavg)
		np.savetxt(args.historystats, historyScores, fmt="%d", delimiter='\t')
	elif args.historystats and os.path.isfile(args.historystats):
		historyScores=np.loadtxt(args.historystats, dtype=int)
	if historyScores is None:
		sys.exit("Need to use --historystats or --cnavg option")
	totalp=0
	if args.totalp: 
		totalp=args.totalp
	else: 
		totalp = histseg.compute_likelihood_histories(historyScores[:,0], historyScores)
	allevents=[]
	if args.cnavg: 
		sys.stderr.write("using cnavg dir: %s\n" % (args.cnavg))
		allevents=histseg.get_events_from_cnavgdir(args.cnavg, historyScores, totalp)
	elif args.inpickle and os.path.isfile(args.inpickle): 
		sys.stderr.write("using pickle file\n")
		allevents=pickle.load(open(args.inpickle, 'rb'))
	sys.stderr.write("there are %d events\n" % (len(allevents)))
	if args.outpickle: 
		for event in allevents: 
			event.trim()
		eventfile= open(args.outpickle, 'wb')
		pickle.dump(allevents, eventfile, pickle.HIGHEST_PROTOCOL)
	if args.events: 
		eventfile=open(args.events, 'w')
		for evnt in allevents: 
			eventfile.write("%s" % (str(evnt)))
	# link the events...
	if args.links: 
		if not allevents: 
			sys.exit("Need events to link!  use --inpickle or --cnavg or --events")
		if not totalp: 
			sys.exit("Need a --totalp or --cnavg or --historystats options")	
		eventlinks = link_events_by_order_within_histories(allevents)
		linkfile=open(args.links, 'w')
		for link in eventlinks: 
			link.likelihood=histseg.compute_likelihood_histories(link.histories, historyScores, totalp)
			linkfile.write("%s" % (str(link)))
	def run(self):
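		"""run() for the CreatePevntsFile jobTree Target (the class header is not shown
		in this excerpt): build history statistics for the CN-AVG output directory,
		compute the total history likelihood, extract events, and pickle them."""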
		self.logToMaster("CreatePevntsFile\n") 
		opts=self.options
		if opts.simulation:
			truefile=os.path.join(opts.cnavgout, "true.braney")
			truehist=os.path.join(opts.cnavgout, "HISTORIES_0.braney") 
			subprocess.call("grep -v ^$ %s | gzip > %s" % (truefile, truehist), shell=True)
			make_STATS_from_truebraney(truefile, os.path.join(opts.cnavgout, "HISTORY_STATS_0"))
		historyScores=histseg.combine_history_statsfiles(opts.cnavgout)
		np.savetxt(self.historystatsfile, historyScores, fmt='%d', delimiter='\t')	
		totalp=histseg.compute_likelihood_histories(historyScores[:,0], historyScores)
		events=histseg.get_events_from_cnavgdir(opts.cnavgout, historyScores, totalp)
		pickle.dump(events, open(self.pevntsfile, 'wb'), pickle.HIGHEST_PROTOCOL)
Example #4
def analyze_simulation(edges, refhistoryid, historyScores, datout_fh, stats_fh, breaks_fh):
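	"""Compare the scored edges against the reference (true) history refhistoryid,
	tallying TP/FP/TN/FN counts overall and per event type, and write per-edge data,
	summary statistics (including an F1 score), and breakpoint counts to the given
	file handles (each output is skipped if its handle is falsy)."""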
	# zero out the refhistoryid row in the history scores so that it doesn't get included in the likelihood calculation
	myhistScores=np.copy(historyScores)
	myhistScores[np.where(historyScores[:,0] == refhistoryid),:]=0	 
	totalp=histseg.compute_likelihood_histories(myhistScores[:,0], myhistScores)
	TP=[0,0,0,0]
	FP=[0,0,0,0]
	TN=[0,0,0,0]
	FN=[0,0,0,0]
	FNedges=[]
	types=histseg.Global_EVENTTYPES
	myEdgeSimData=[] # a list of EdgeSimulationData objects (edge, isTrue, refpreval, reforder, ...)
	for edge in edges:
		if not edge.histories: edge.histories=histseg.listout_ranges(edge.histRanges)
		myedgesim=EdgeSimulationData(edge)
		type=myedgesim.type
		if refhistoryid in edge.histories: 
			refindex=edge.histories.index(refhistoryid)
			myedgesim.refindex=refindex
			if len(edge.histories)>1:
				TP[0]+=1
				TP[type]+=1
				myedgesim.isTrue=1
				edge.histories.pop(refindex)
				myedgesim.refpreval=edge.prevals.pop(refindex)
				myedgesim.reforder=edge.orders.pop(refindex)
				edge.likelihood = histseg.compute_likelihood_histories(edge.histories, myhistScores, totalp)	
				edge.compute_timing_wmeansd(myhistScores)
				edge.histories.insert(refindex, refhistoryid)	
				edge.prevals.insert(refindex, myedgesim.refpreval)	
				edge.orders.insert(refindex, myedgesim.reforder)	
				upperc=edge.uppercosts.pop(refindex)
				lowerc=edge.lowercosts.pop(refindex)
				myedgesim.avecost=np.mean(np.array(edge.uppercosts+edge.lowercosts))
				edge.uppercosts.insert(refindex, upperc)	
				edge.lowercosts.insert(refindex, lowerc)	
			else: 
				FN[0]+=1
				FN[type]+=1
				FNedges.append(myedgesim)
				myedgesim.isTrue=-1
				edge.likelihood=1
				myedgesim.avecost=np.mean(np.array(edge.uppercosts+edge.lowercosts))
				myedgesim.refpreval=edge.prevals[refindex]
				myedgesim.reforder=edge.orders[refindex]
		else: 
			FP[0]+=1
			FP[type]+=1
			edge.likelihood = histseg.compute_likelihood_histories(edge.histories, myhistScores, totalp)	
			if edge.likelihood >1: 
				sys.stderr.write("bad lscore: %s\t%s\t%d\n" % (str(edge.likelihood), str(totalp), len(edge.costs)))
			myedgesim.isTrue=0
			myedgesim.avecost=np.mean(np.array(edge.uppercosts+edge.lowercosts))
		myEdgeSimData.append(myedgesim)
	if len(FNedges) >0: 
		TN=checkForCancellingEdges(FNedges) #this will also modify the isTrue value of FNedges
		for i in xrange(len(TN)): 
			FN[i]=FN[i]-TN[i]	
	
	if datout_fh: 
		header="event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
		datout_fh.write(header)
		for edgesim in myEdgeSimData:
			edge=edgesim.edge 
			prevals=",".join(map(str, [edgesim.refpreval, edge.prevalmean, edge.prevalsd]))
			orders=",".join(map(str, [edgesim.reforder, edge.ordermean, edge.ordersd]))
			type=edge.determineEventType()
			length=edge.get_Event_length()
			mystr="\t".join(map(str, [edge.id, types[type], edgesim.avecost, edge.likelihood, edge.cnval, edgesim.isTrue, length, prevals, orders, len(edge.histories)])) + "\n"	
			datout_fh.write(mystr)
	
	if stats_fh: 
		stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
		stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\n" % ("\t".join(map(str, TP)), "\t".join(map(str, FP)), "\t".join(map(str, FN)), "\t".join(map(str, TN)) ))
		f1score = float(2*TP[0])/float(2*TP[0]+FN[0]+FP[0])
		stats_fh.write("F1Score:\t%s\n" % (str(f1score)))
	
	if breaks_fh: 
		breakpoints=histseg.get_breakpoints(edges, refhistoryid)
		for loc in breakpoints.keys(): 
			(n, t) = breakpoints[loc]
			breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
		breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))
	def run(self):
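		"""Top-level setup run(): create the .pevnts file, derive scored edges (with and
		without CN merging), write breakpoint counts, and spawn child jobs for links,
		gene ranking/annotation, MCMC mixing analysis, and simulation statistics as
		requested by the options."""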
		self.logToMaster("Setting up...") 
		opts=self.options
		histseg.Global_BINWIDTH=opts.binwidth
		sampleid=opts.sampleid
		outputdir=opts.outputdir
		subprocess.call("mkdir -p %s" % outputdir, shell=True)
		historystatsfile=os.path.join(outputdir, "historystats.txt")
		pevntsfile=os.path.join(outputdir, opts.sampleid + ".pevnts")
		if opts.pevnts or not os.path.exists(pevntsfile): 
			logger.info("pevntsfile: %s" % pevntsfile)
			CreatePevntsFile(pevntsfile, historystatsfile, opts).run()
		self.historyScores=np.loadtxt(historystatsfile, dtype=int)
		logger.info("Global_BINWIDTH: %d" % histseg.Global_BINWIDTH)
		self.totalp=histseg.compute_likelihood_histories(self.historyScores[:,0], self.historyScores)
		logger.info("totalp is %s" % str(self.totalp))	
		
		pedgesfile=os.path.join(outputdir, sampleid + ".pedgs")
		if opts.pedges or not os.path.exists(pedgesfile):
			logger.info("pedgesfile: %s" % pedgesfile)
			CreatePedgesFile(pickle.load(open(pevntsfile, 'rb')), pedgesfile, self.historyScores, self.totalp, ignore_cn=False).run()
		
		mrgpedgesfile=os.path.join(outputdir, sampleid + ".mrgpedgs")
		if not os.path.exists(mrgpedgesfile):
			logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
			CreatePedgesFile(pickle.load(open(pevntsfile, 'rb')), mrgpedgesfile, self.historyScores, self.totalp, ignore_cn=True).run()
		
		linksfile =os.path.join(outputdir, sampleid +".links")
		if opts.links and not os.path.exists(linksfile): 
			logger.info("linksfile: %s" % linksfile)
			self.addChildTarget(CreateLinksFile(pevntsfile, linksfile, self.totalp))		
	
		breaksfile=os.path.join(outputdir, "breakpoints.txt")
		if not os.path.exists(breaksfile): 
			breaklocs=histseg.get_breakpoints(pickle.load(open(pedgesfile, 'rb')), opts.trueID)
			breaklocs2=histseg.get_breakpoints(pickle.load(open(mrgpedgesfile, 'rb')), opts.trueID)
			breaksfh=open(breaksfile, 'w')
			for loc in sorted(breaklocs.keys()):
				(n, t) = breaklocs[loc]
				(n2, t2) = breaklocs2[loc]
				breaksfh.write("%s\t%d\t%d\t%d\t%d\n" % (loc, n, t, n2, t2))	
	
		annotationfile=os.path.join(outputdir, sampleid + ".ann")
		generankfile=os.path.join(outputdir, sampleid + ".gnrank")
		if opts.generank and not os.path.exists(generankfile): 
			logger.info("generankfile: %s" % generankfile)
			if not self.events: 
				self.events=pickle.load(open(pevntsfile, 'rb'))
			self.addChildTarget(CreateGeneRankFile(self.events, opts.tabixfile, self.totalp, annotationfile, generankfile))	
			logger.info("after creating generankfile")
		elif opts.ann and not opts.generank: 
			logger.info("annotationfile: %s" % annotationfile)
			if not self.events: 
				self.events=pickle.load(open(pevntsfile, 'rb'))
			self.addChildTarget(CreateAnnotationFile(self.events, opts.tabixfile, annotationfile))
		
		if opts.mcmcmix: 	
			mcmcdir=os.path.join(outputdir, "mcmcdata")
			mcmcdat=os.path.join(mcmcdir, "edge_counts.dat")
			if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
				subprocess.call("mkdir -p %s" % mcmcdir, shell=True)	
				opts.pevnts=pevntsfile 
				opts.pedges=pedgesfile
				self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))

		if opts.simulation: 
			simoutput=os.path.join(outputdir, "events.stats")
			if ((not os.path.exists(simoutput)) or (os.path.getsize(simoutput) == 0)): 
				self.addChildTarget(SimAnalysisJob(pevntsfile, opts.trueID, self.historyScores, "events", outputdir, opts.binwidth))
			simoutput2=os.path.join(outputdir, "edges.stats")
			if ((not os.path.exists(simoutput2)) or (os.path.getsize(simoutput2) == 0)): 
				self.addChildTarget(SimAnalysisJob(pedgesfile, opts.trueID, self.historyScores, "edges", outputdir, opts.binwidth))
			simoutput3=os.path.join(outputdir, "mrgedges.stats")
			if ((not os.path.exists(simoutput3)) or (os.path.getsize(simoutput3) == 0)): 
				self.addChildTarget(SimAnalysisJob(mrgpedgesfile, opts.trueID, self.historyScores, "mrgedges", outputdir, opts.binwidth))
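    # The add_argument calls below belong to add_score_edges_options(parser), which is
    # called from __main__; the options referenced there but not shown here (--pevnts,
    # --historystats, --outpickle) are presumably defined in the part of that helper
    # not included in this excerpt.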
    parser.add_argument("--edges", help="write edges to this file.")
    parser.add_argument(
        "--prevalence_error", help="the difference in prevalences to be considered the same.", type=float, default=0.05
    )
    parser.add_argument(
        "--ignore_cn", help="merge together edges with different CN values.", default=False, action="store_true"
    )
    parser.add_argument("--totalp", help="total probability of the histories", type=float)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Given an .pevnts file, it will split events into edges, combine equivalent edges (segments or adjacencies), and score them by likelihood."
    )
    add_score_edges_options(parser)
    args = parser.parse_args()
    allevents = pickle.load(open(args.pevnts, "rb"))
    historyScores = np.loadtxt(args.historystats, dtype=int)
    totalp = 0
    if args.totalp:
        totalp = args.totalp
    else:
        totalp = histseg.compute_likelihood_histories(historyScores[:, 0], historyScores)
    alledges = score_edges_within_pevents(allevents, historyScores, totalp, args.prevalence_error, args.ignore_cn)
    if args.edges:
        outfile = open(args.edges, "w")
        for edge in alledges:
            outfile.write(str(edge))
    if args.outpickle:
        pickle.dump(alledges, open(args.outpickle, "wb"), pickle.HIGHEST_PROTOCOL)