Ejemplo n.º 1
0
def analyze_simulation(events, refhistoryid, histScores, datout_fh, stats_fh,
                       breaks_fh, outdir):
    """Score events against a reference history and write the reports.

    Each event is wrapped in an ``EdgeSimulationData`` and tallied by its
    ``isTrue`` flag: 1 is counted as a true positive, -1 as a false negative
    and 0 as a false positive.  Slot 0 of every tally array holds the overall
    total; the slot returned by ``event.determineEventType()`` holds the
    per-type count.

    Args:
        events: iterable of event objects.  They are first passed to
            ``do_order_correction.main`` and then classified one by one.
        refhistoryid: id of the reference history.  Rows of ``histScores``
            whose first column equals this id are set to zero.
        histScores: 2-D numpy array of history scores.  Modified in place.
        datout_fh: writable file object for the per-event table, or a falsy
            value to skip that report.
        stats_fh: writable file object for the TP/FP/FN/TN/EX summary and
            the F1 score, or a falsy value to skip that report.
        breaks_fh: writable file object for the breakpoint listing, or a
            falsy value to skip that report.
        outdir: not used by this function; kept so callers need not change.
    """
    do_order_correction.main(events, 0, histScores)
    # Make the cost of the reference history 0 so that it doesn't get
    # included in the likelihood calculation.
    histScores[np.where(histScores[:, 0] == refhistoryid), :] = 0
    totalp = histseg.compute_likelihood_histories(histScores[:, 0], histScores)
    sys.stderr.write("totalp is %f\n" % totalp)

    numtypes = len(histseg.Global_EVENTTYPES)
    TP = np.zeros(numtypes, dtype=int)
    FP = np.zeros(numtypes, dtype=int)
    TN = np.zeros(numtypes, dtype=int)
    FN = np.zeros(numtypes, dtype=int)
    explained = np.zeros(numtypes, dtype=int)
    FNesims = []
    FPesims = []
    myEdgeSimData = []
    for event in events:
        myedgesim = EdgeSimulationData(event, histScores, totalp, refhistoryid)
        etype = event.determineEventType()
        if myedgesim.isTrue == 1:
            TP[0] += 1
            TP[etype] += 1
        elif myedgesim.isTrue == -1:
            FN[0] += 1
            FN[etype] += 1
            FNesims.append(myedgesim)
        elif myedgesim.isTrue == 0:
            FP[0] += 1
            FP[etype] += 1
            FPesims.append(myedgesim)
        myEdgeSimData.append(myedgesim)
    check_for_linear_decomp(FNesims, FPesims, explained)
    # TN is never incremented in this function, so this currently leaves FN
    # unchanged; the subtraction is kept in case TN gets populated later.
    FN -= TN

    if datout_fh:
        header = "event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
        datout_fh.write(header)
        for edgesim in myEdgeSimData:
            datout_fh.write(str(edgesim))

    if stats_fh:
        stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
        rows = ["\t".join(map(str, counts))
                for counts in (TP, FP, FN, TN, explained)]
        stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\nEX\t%s\n" %
                       tuple(rows))
        # "Explained" false negatives are credited as true positives.
        tp = TP[0] + explained[0]
        fn = FN[0] - explained[0]
        denominator = 2 * tp + fn + FP[0]
        # With nothing to score the ratio is 0/0; report 0.0 instead of
        # raising ZeroDivisionError.
        if denominator:
            f1score = float(2 * tp) / float(denominator)
        else:
            f1score = 0.0
        stats_fh.write("F1Score:\t%s\n" % (str(f1score)))

    if breaks_fh:
        # This used to reference an undefined name ``edges``; the events
        # handed to this function are the only candidates available here.
        breakpoints = histseg.get_breakpoints(events, refhistoryid)
        for loc, (n, t) in breakpoints.items():
            breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
        breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))
Ejemplo n.º 2
0
    def run(self):
        """Create any missing per-sample output files and queue child jobs.

        Working inside ``self.options.outputdir``, this builds (when absent,
        or when the matching option forces it) historystats.txt, the
        ``.pedgs`` file, seghists.txt, the labeled seghists and gene orders
        (only if a bed file is given), the merged ``.pmevnts`` / ``.pmedgs``
        files and breakpoints.txt.  It then adds child targets for the MCMC
        and simulation analyses when those options are set.

        Sets ``self.historyScores`` and ``self.totalp`` as a side effect.
        Exits the process via ``sys.exit`` if the ``.pevnts`` file is missing.
        """

        def load_pickle(path):
            # Close the handle promptly instead of leaking it until GC.
            # NOTE(review): pickle can execute arbitrary code on load; only
            # point this at files written by trusted pipeline steps.
            with open(path, 'rb') as fh:
                return pickle.load(fh)

        self.logToMaster("Setting up...")
        opts = self.options
        histseg.Global_BINWIDTH = opts.binwidth
        sampleid = opts.sampleid
        outputdir = opts.outputdir

        historystatsfile = os.path.join(outputdir, "historystats.txt")
        if not os.path.exists(historystatsfile):
            self.logToMaster("Creating historystats.txt...%s" %
                             historystatsfile)
            logger.info("historystatsfile: %s" % historystatsfile)
            pevntsjobtree.CombineHistoryStatsfiles(opts,
                                                   historystatsfile).run()
        self.historyScores = np.loadtxt(historystatsfile, dtype=int)
        self.totalp = histseg.compute_likelihood_histories(
            self.historyScores[:, 0], self.historyScores)

        # Everything below is derived from the *.pevnts file, so it must
        # already exist.
        pevntsfile = os.path.join(outputdir, sampleid + ".pevnts")
        if not os.path.exists(pevntsfile):
            sys.exit("The required %s file does not exist." % pevntsfile)

        pedgesfile = os.path.join(outputdir, sampleid + ".pedgs")
        if opts.pedges or not os.path.exists(pedgesfile):
            self.logToMaster("Creating pedgesfile...%s" % pedgesfile)
            logger.info("pedgesfile: %s" % pedgesfile)
            CreatePedgesFile(load_pickle(pevntsfile), pedgesfile,
                             self.historyScores, self.totalp, False).run()

        seghistfile = os.path.join(outputdir, "seghists.txt")
        if opts.sgh or not os.path.exists(seghistfile):
            self.logToMaster("Creating seghists file ... %s" % seghistfile)
            make_seghists_from_edges.main(load_pickle(pedgesfile),
                                          self.historyScores, seghistfile)

        # Label the seghists if an annotation file is given.
        if opts.bedfile:
            labeledfn = os.path.join(outputdir, "seghists.labeled")
            if not os.path.exists(labeledfn):
                pick_and_label_best_seghists.main(seghistfile, opts.bedfile,
                                                  True, labeledfn)
            geneordfn = os.path.join(outputdir, "geneords.txt")
            if opts.geneords or not os.path.exists(geneordfn):
                seghists_to_gene_orders.main(seghistfile, opts.bedfile,
                                             geneordfn)

        mrgpeventsfile = os.path.join(outputdir, sampleid + ".pmevnts")
        if not os.path.exists(mrgpeventsfile):
            self.logToMaster("Creating mpevnts...%s" % mrgpeventsfile)
            logger.info("mrgpeventsfile: %s" % mrgpeventsfile)
            CreateMergedEventsFile(load_pickle(pevntsfile), mrgpeventsfile,
                                   self.historyScores).run()

        mrgpedgesfile = os.path.join(outputdir, sampleid + ".pmedgs")
        if not os.path.exists(mrgpedgesfile):
            self.logToMaster("Creating mrgpegesfile...%s" % mrgpedgesfile)
            logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
            CreateMergedEventsFile(load_pickle(pedgesfile), mrgpedgesfile,
                                   self.historyScores).run()

        breaksfile = os.path.join(outputdir, "breakpoints.txt")
        if not os.path.exists(breaksfile):
            self.logToMaster("Creating breaksfile...%s" % breaksfile)
            breaklocs = histseg.get_breakpoints(load_pickle(pedgesfile),
                                                opts.trueID)
            breaklocs2 = histseg.get_breakpoints(load_pickle(mrgpedgesfile),
                                                 opts.trueID)
            # One row per location: counts from the plain edges followed by
            # counts from the merged edges.  The with-block guarantees the
            # file is flushed and closed.
            with open(breaksfile, 'w') as breaksfh:
                for loc in sorted(breaklocs.keys()):
                    (n, t) = breaklocs[loc]
                    (n2, t2) = breaklocs2[loc]
                    breaksfh.write("%s\t%d\t%d\t%d\t%d\n" %
                                   (loc, n, t, n2, t2))

        # Links, per-event annotation and generank outputs are no longer
        # options: seghists are annotated instead, and geneords (built from
        # the seghists above) replaces gene ranking.

        if opts.mcmcmix:
            self.logToMaster("Setting up MCMC analysis")
            mcmcdir = os.path.join(outputdir, "mcmcdata")
            mcmcdat = os.path.join(mcmcdir, "edge_counts.dat")
            if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
                # Create the directory directly rather than through a shell
                # string, which would break on paths containing spaces or
                # shell metacharacters.
                if not os.path.isdir(mcmcdir):
                    os.makedirs(mcmcdir)
                # Hand the file paths to the child job through the options
                # object; this overwrites the opts.pedges value tested above.
                opts.pevnts = pevntsfile
                opts.pedges = pedgesfile
                self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))

        if opts.simulation:
            self.logToMaster("Setting up Simulation analysis")
            # (input file, label); each job is tied to "<label>.stats" and is
            # queued only when that file is missing or empty.
            simjobs = (
                (pevntsfile, "events"),
                (pedgesfile, "edges"),
                (mrgpedgesfile, "mrgedges"),
            )
            for infile, label in simjobs:
                simoutput = os.path.join(outputdir, label + ".stats")
                if ((not os.path.exists(simoutput))
                        or (os.path.getsize(simoutput) == 0)):
                    self.addChildTarget(
                        SimAnalysisJob(infile, opts.trueID,
                                       self.historyScores, label, outputdir,
                                       opts.binwidth))
Ejemplo n.º 3
0
def analyze_simulation(events, refhistoryid, histScores, datout_fh, stats_fh, breaks_fh, outdir):
	"""Score events against a reference history and write the reports.

	Each event is wrapped in an ``EdgeSimulationData`` and tallied by its
	``isTrue`` flag: 1 is counted as a true positive, -1 as a false negative
	and 0 as a false positive.  Slot 0 of every tally array holds the overall
	total; the slot returned by ``event.determineEventType()`` holds the
	per-type count.

	Args:
		events: iterable of event objects.  They are first passed to
			``do_order_correction.main`` and then classified one by one.
		refhistoryid: id of the reference history.  Rows of ``histScores``
			whose first column equals this id are set to zero.
		histScores: 2-D numpy array of history scores.  Modified in place.
		datout_fh: writable file object for the per-event table, or a falsy
			value to skip that report.
		stats_fh: writable file object for the TP/FP/FN/TN/EX summary and
			the F1 score, or a falsy value to skip that report.
		breaks_fh: writable file object for the breakpoint listing, or a
			falsy value to skip that report.
		outdir: not used by this function; kept so callers need not change.
	"""
	do_order_correction.main(events, 0, histScores)
	# Make the cost of the reference history 0 so that it doesn't get
	# included in the likelihood calculation.
	histScores[np.where(histScores[:,0] == refhistoryid),:]=0
	totalp=histseg.compute_likelihood_histories(histScores[:,0], histScores)
	sys.stderr.write("totalp is %f\n" % totalp)

	numtypes=len(histseg.Global_EVENTTYPES)
	TP=np.zeros(numtypes, dtype=int)
	FP=np.zeros(numtypes, dtype=int)
	TN=np.zeros(numtypes, dtype=int)
	FN=np.zeros(numtypes, dtype=int)
	explained=np.zeros(numtypes, dtype=int)
	FNesims=[]
	FPesims=[]
	myEdgeSimData=[]
	for event in events:
		myedgesim=EdgeSimulationData(event, histScores, totalp, refhistoryid)
		etype=event.determineEventType()
		if myedgesim.isTrue==1:
			TP[0]+=1
			TP[etype]+=1
		elif myedgesim.isTrue==-1:
			FN[0]+=1
			FN[etype]+=1
			FNesims.append(myedgesim)
		elif myedgesim.isTrue==0:
			FP[0]+=1
			FP[etype]+=1
			FPesims.append(myedgesim)
		myEdgeSimData.append(myedgesim)
	check_for_linear_decomp(FNesims, FPesims, explained)
	# TN is never incremented in this function, so this currently leaves FN
	# unchanged; the subtraction is kept in case TN gets populated later.
	FN-=TN

	if datout_fh:
		header="event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
		datout_fh.write(header)
		for edgesim in myEdgeSimData:
			datout_fh.write(str(edgesim))

	if stats_fh:
		stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
		rows=["\t".join(map(str, counts)) for counts in (TP, FP, FN, TN, explained)]
		stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\nEX\t%s\n" % tuple(rows))
		# "Explained" false negatives are credited as true positives.
		tp=TP[0]+explained[0]
		fn=FN[0]-explained[0]
		denominator=2*tp+fn+FP[0]
		# With nothing to score the ratio is 0/0; report 0.0 instead of
		# raising ZeroDivisionError.
		if denominator:
			f1score = float(2*tp)/float(denominator)
		else:
			f1score = 0.0
		stats_fh.write("F1Score:\t%s\n" % (str(f1score)))

	if breaks_fh:
		# This used to reference an undefined name ``edges``; the events
		# handed to this function are the only candidates available here.
		breakpoints=histseg.get_breakpoints(events, refhistoryid)
		for loc, (n, t) in breakpoints.items():
			breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
		breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))
Ejemplo n.º 4
0
	def run(self):
		"""Create any missing per-sample output files and queue child jobs.

		Working inside ``self.options.outputdir``, this builds (when absent,
		or when the matching option forces it) historystats.txt, the
		``.pedgs`` file, seghists.txt, the labeled seghists and gene orders
		(only if a bed file is given), the merged ``.pmevnts`` / ``.pmedgs``
		files and breakpoints.txt.  It then adds child targets for the MCMC
		and simulation analyses when those options are set.

		Sets ``self.historyScores`` and ``self.totalp`` as a side effect.
		Exits the process via ``sys.exit`` if the ``.pevnts`` file is missing.
		"""

		def load_pickle(path):
			# Close the handle promptly instead of leaking it until GC.
			# NOTE(review): pickle can execute arbitrary code on load; only
			# point this at files written by trusted pipeline steps.
			with open(path, 'rb') as fh:
				return pickle.load(fh)

		self.logToMaster("Setting up...")
		opts=self.options
		histseg.Global_BINWIDTH=opts.binwidth
		sampleid=opts.sampleid
		outputdir=opts.outputdir

		historystatsfile=os.path.join(outputdir, "historystats.txt")
		if not os.path.exists(historystatsfile):
			self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
			logger.info("historystatsfile: %s" % historystatsfile)
			pevntsjobtree.CombineHistoryStatsfiles(opts, historystatsfile).run()
		self.historyScores=np.loadtxt(historystatsfile, dtype=int)
		self.totalp=histseg.compute_likelihood_histories(self.historyScores[:,0], self.historyScores)

		# Everything below is derived from the *.pevnts file, so it must
		# already exist.
		pevntsfile=os.path.join(outputdir, sampleid + ".pevnts")
		if not os.path.exists(pevntsfile):
			sys.exit("The required %s file does not exist." % pevntsfile)

		pedgesfile=os.path.join(outputdir, sampleid + ".pedgs")
		if opts.pedges or not os.path.exists(pedgesfile):
			self.logToMaster("Creating pedgesfile...%s" % pedgesfile)
			logger.info("pedgesfile: %s" % pedgesfile)
			CreatePedgesFile(load_pickle(pevntsfile), pedgesfile, self.historyScores, self.totalp, False).run()

		seghistfile=os.path.join(outputdir, "seghists.txt")
		if opts.sgh or not os.path.exists(seghistfile):
			self.logToMaster("Creating seghists file ... %s" % seghistfile)
			make_seghists_from_edges.main(load_pickle(pedgesfile), self.historyScores, seghistfile)

		# Label the seghists if an annotation file is given.
		if opts.bedfile:
			labeledfn=os.path.join(outputdir, "seghists.labeled")
			if not os.path.exists(labeledfn):
				pick_and_label_best_seghists.main(seghistfile, opts.bedfile, True, labeledfn)
			geneordfn=os.path.join(outputdir, "geneords.txt")
			if opts.geneords or not os.path.exists(geneordfn):
				seghists_to_gene_orders.main(seghistfile, opts.bedfile, geneordfn)

		mrgpeventsfile=os.path.join(outputdir, sampleid + ".pmevnts")
		if not os.path.exists(mrgpeventsfile):
			self.logToMaster("Creating mpevnts...%s" % mrgpeventsfile)
			logger.info("mrgpeventsfile: %s" % mrgpeventsfile)
			CreateMergedEventsFile(load_pickle(pevntsfile), mrgpeventsfile, self.historyScores).run()

		mrgpedgesfile=os.path.join(outputdir, sampleid + ".pmedgs")
		if not os.path.exists(mrgpedgesfile):
			self.logToMaster("Creating mrgpegesfile...%s" % mrgpedgesfile)
			logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
			CreateMergedEventsFile(load_pickle(pedgesfile), mrgpedgesfile, self.historyScores).run()

		breaksfile=os.path.join(outputdir, "breakpoints.txt")
		if not os.path.exists(breaksfile):
			self.logToMaster("Creating breaksfile...%s" % breaksfile)
			breaklocs=histseg.get_breakpoints(load_pickle(pedgesfile), opts.trueID)
			breaklocs2=histseg.get_breakpoints(load_pickle(mrgpedgesfile), opts.trueID)
			# One row per location: counts from the plain edges followed by
			# counts from the merged edges.  The with-block guarantees the
			# file is flushed and closed.
			with open(breaksfile, 'w') as breaksfh:
				for loc in sorted(breaklocs.keys()):
					(n, t) = breaklocs[loc]
					(n2, t2) = breaklocs2[loc]
					breaksfh.write("%s\t%d\t%d\t%d\t%d\n" % (loc, n, t, n2, t2))

		# Links, per-event annotation and generank outputs are no longer
		# options: seghists are annotated instead, and geneords (built from
		# the seghists above) replaces gene ranking.

		if opts.mcmcmix:
			self.logToMaster("Setting up MCMC analysis")
			mcmcdir=os.path.join(outputdir, "mcmcdata")
			mcmcdat=os.path.join(mcmcdir, "edge_counts.dat")
			if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
				# Create the directory directly rather than through a shell
				# string, which would break on paths containing spaces or
				# shell metacharacters.
				if not os.path.isdir(mcmcdir):
					os.makedirs(mcmcdir)
				# Hand the file paths to the child job through the options
				# object; this overwrites the opts.pedges value tested above.
				opts.pevnts=pevntsfile
				opts.pedges=pedgesfile
				self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))

		if opts.simulation:
			self.logToMaster("Setting up Simulation analysis")
			# (input file, label); each job is tied to "<label>.stats" and is
			# queued only when that file is missing or empty.
			simjobs=(
				(pevntsfile, "events"),
				(pedgesfile, "edges"),
				(mrgpedgesfile, "mrgedges"),
			)
			for infile, label in simjobs:
				simoutput=os.path.join(outputdir, label + ".stats")
				if ((not os.path.exists(simoutput)) or (os.path.getsize(simoutput) == 0)):
					self.addChildTarget(SimAnalysisJob(infile, opts.trueID, self.historyScores, label, outputdir, opts.binwidth))