Example #1
def score_and_link_cycles(args):
    if args.binwidth:
        histseg.Global_BINWIDTH = args.binwidth
    # historyScores starts as None so we can tell whether any input option supplied it
    historyScores = None
    if not args.historystats and args.cnavg:
        historyScores = histseg.combine_history_statsfiles(args.cnavg)
    elif not os.path.isfile(args.historystats) and args.cnavg:
        historyScores = histseg.combine_history_statsfiles(args.cnavg)
        np.savetxt(args.historystats, historyScores, fmt="%d", delimiter='\t')
    elif os.path.isfile(args.historystats):
        historyScores = np.loadtxt(args.historystats, dtype=int)
    if historyScores is None:
        sys.exit("Need to use --historystats or --cnavg option")
    totalp = 0
    if args.totalp:
        totalp = args.totalp
    else:
        totalp = histseg.compute_likelihood_histories(historyScores[:, 0],
                                                      historyScores)
    allevents = []
    if args.cnavg:
        sys.stderr.write("using cnavg dir: %s\n" % (args.cnavg))
        allevents = histseg.get_events_from_cnavgdir(args.cnavg, historyScores,
                                                     totalp)
    elif args.inpickle and os.path.isfile(args.inpickle):
        sys.stderr.write("using pickle file\n")
        allevents = pickle.load(open(args.inpickle, 'rb'))
    sys.stderr.write("there are %d events\n" % (len(allevents)))
    if args.outpickle:
        for event in allevents:
            event.trim()
        eventfile = open(args.outpickle, 'wb')
        pickle.dump(allevents, eventfile, pickle.HIGHEST_PROTOCOL)
    if args.events:
        eventfile = open(args.events, 'w')
        for evnt in allevents:
            eventfile.write("%s" % (str(evnt)))
    # link the events...
    if args.links:
        if not allevents:
            sys.exit(
                "Need events to link!  use --inpickle or --cnavg or --events")
        if not totalp:
            sys.exit("Need a --totalp or --cnavg or --historystats options")
        eventlinks = link_events_by_order_within_histories(allevents)
        linkfile = open(args.links, 'w')
        for link in eventlinks:
            link.likelihood = histseg.compute_likelihood_histories(
                link.histories, historyScores)
            linkfile.write("%s" % (str(link)))
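A note on driving the function above: it only reads attributes off an args namespace, so a thin argparse wrapper is enough to run it from the command line. The sketch below is ours, not part of the project; the helper name add_score_cycles_options is hypothetical, and it assumes score_and_link_cycles and the project's histseg module are importable. The option names simply mirror the attributes the function reads.

import argparse

def add_score_cycles_options(parser):
    # hypothetical helper: every option maps to an attribute read by score_and_link_cycles()
    parser.add_argument('--cnavg', help='CN-AVG output directory')
    parser.add_argument('--historystats', help='history stats file to read or create')
    parser.add_argument('--inpickle', help='load events from this pickle')
    parser.add_argument('--outpickle', help='write trimmed events to this pickle')
    parser.add_argument('--events', help='write events as text to this file')
    parser.add_argument('--links', help='write linked events to this file')
    parser.add_argument('--totalp', type=float, help='precomputed total probability')
    parser.add_argument('--binwidth', type=int, help='history id multiplier between independent runs')

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='score and link CN-AVG cycles')
    add_score_cycles_options(parser)
    score_and_link_cycles(parser.parse_args())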
Example #2
def analyze_simulation(events, refhistoryid, histScores, datout_fh, stats_fh,
                       breaks_fh, outdir):
    #	do_order_correction.main(events, 0, histScores, statsout=os.path.join(outdir, "historystats.txt"))
    #do_order_correction.main(events, 0, historyScores, usepreval=True)
    do_order_correction.main(events, 0, histScores)
    # make the cost of the refhistoryid 0 so that it doesn't get included in the likelihood calculation
    histScores[np.where(histScores[:, 0] == refhistoryid), :] = 0
    totalp = histseg.compute_likelihood_histories(histScores[:, 0], histScores)
    sys.stderr.write("totalp is %f\n" % totalp)
    types = histseg.Global_EVENTTYPES
    TP = np.zeros(len(types), dtype=int)
    FP = np.zeros(len(types), dtype=int)
    TN = np.zeros(len(types), dtype=int)
    FN = np.zeros(len(types), dtype=int)
    explained = np.zeros(len(types), dtype=int)
    FNesims = []
    FPesims = []
    myEdgeSimData = []
    for event in events:
        myedgesim = EdgeSimulationData(event, histScores, totalp, refhistoryid)
        etype = event.determineEventType()
        if myedgesim.isTrue == 1:
            TP[0] += 1
            TP[etype] += 1
        elif myedgesim.isTrue == -1:
            FN[0] += 1
            FN[etype] += 1
            FNesims.append(myedgesim)
        elif myedgesim.isTrue == 0:
            FP[0] += 1
            FP[etype] += 1
            FPesims.append(myedgesim)
        myEdgeSimData.append(myedgesim)
    check_for_linear_decomp(FNesims, FPesims, explained)
    for i in xrange(len(TN)):
        FN[i] = FN[i] - TN[i]

    if datout_fh:
        header = "event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
        datout_fh.write(header)
        for edgesim in myEdgeSimData:
            datout_fh.write(str(edgesim))

    if stats_fh:
        stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
        stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\nEX\t%s\n" %
                       ("\t".join(map(str, TP)), "\t".join(map(
                           str, FP)), "\t".join(map(str, FN)), "\t".join(
                               map(str, TN)), "\t".join(map(str, explained))))
        tp = TP[0] + explained[0]
        fn = FN[0] - explained[0]
        f1score = float(2 * tp) / float(2 * tp + fn + FP[0])
        stats_fh.write("F1Score:\t%s\n" % (str(f1score)))

    if breaks_fh:
        breakpoints = histseg.get_breakpoints(events, refhistoryid)
        for loc in breakpoints.keys():
            (n, t) = breakpoints[loc]
            breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
        breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))
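The bookkeeping above credits "explained" false negatives back to the true positives before computing F1. A self-contained sketch of that arithmetic with made-up counts (column 0 of TP/FP/FN/explained holds the totals over all event types):

TP, FP, FN, explained = 40, 10, 20, 5

tp = TP + explained            # explained false negatives count as recovered
fn = FN - explained
f1score = float(2 * tp) / float(2 * tp + fn + FP)
print("F1 = %.3f" % f1score)   # 0.783 for these made-up counts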
    def run(self):
        self.logToMaster("Setting up...")
        opts = self.options
        histseg.Global_BINWIDTH = opts.binwidth
        sampleid = opts.sampleid
        outputdir = opts.outputdir
        subprocess.call("mkdir -p %s" % outputdir, shell=True)

        historystatsfile = os.path.join(outputdir, "historystats.txt")
        if not os.path.exists(historystatsfile):
            self.logToMaster("Creating historystats.txt...%s" %
                             historystatsfile)
            logger.info("historystatsfile: %s" % historystatsfile)
            pevntsjobtree.CombineHistoryStatsfiles(opts,
                                                   historystatsfile).run()
        self.historyScores = np.loadtxt(historystatsfile, dtype=int)
        self.totalp = histseg.compute_likelihood_histories(
            self.historyScores[:, 0], self.historyScores)
        logger.info("Global_BINWIDTH: %d" % histseg.Global_BINWIDTH)
        logger.info("totalp is %s" % str(self.totalp))

        pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
        if opts.pevnts or not os.path.exists(pevntsfile):
            self.logToMaster("Creating pevntsfile...%s" % pevntsfile)
            logger.info("pevntsfile: %s" % pevntsfile)
            self.addChildTarget(
                pevntsjobtree.CreatePevntsFile(pevntsfile, self.historyScores,
                                               self.totalp, opts))
        self.setFollowOnTarget(DoAnalysisOfMergedEvents(opts))
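The setup step persists the combined history statistics as a plain integer table and reloads it with np.loadtxt, slicing column 0 for the history ids. A self-contained round trip with a made-up table; only the file format (integer fields, tab delimiter) is taken from the code above:

import numpy as np

historyScores = np.array([[0, 12, 3, 7],
                          [1, 15, 2, 9],
                          [2, 11, 4, 6]])          # made-up rows, column 0 = history id
np.savetxt("historystats.txt", historyScores, fmt="%d", delimiter='\t')

loaded = np.loadtxt("historystats.txt", dtype=int)
assert (loaded == historyScores).all()
print(loaded[:, 0])                                # the history ids sliced throughout these examples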
def score_edges_within_pevents(allevents,
                               historyScores,
                               totalp,
                               prev_error=0.05,
                               ignore_cn=True):
    prevalence_error = prev_error
    sys.stderr.write("number of events: %d\n" % (len(allevents)))
    sys.stderr.write("ignore_cn: %s\n" % ignore_cn)
    alledges = []
    for event in allevents:
        event.unpack()
        for seg in event.segs:
            edge = copy.deepcopy(event)
            edge.segs = [seg]
            edge.make_segstr()
            alledges.append(edge)
    if ignore_cn:
        unique_edges = unique_loc_edges(alledges)
    else:
        unique_edges = histseg.unique_c_events_list(alledges)
    sys.stderr.write("totalp: %s\n" % (str(totalp)))
    for edge in unique_edges:
        edge.update(historyScores)
        edge.likelihood = histseg.compute_likelihood_histories(
            edge.histories, historyScores, totalp)
        edge.trim()
    return unique_edges
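score_edges_within_pevents turns every multi-segment event into one single-segment "edge" per segment by deep-copying the event and overwriting segs. A self-contained sketch of that split with a toy class (ToyEvent is illustrative, not the project's event type):

import copy

class ToyEvent(object):
    def __init__(self, name, segs):
        self.name = name
        self.segs = segs

events = [ToyEvent("e1", ["segA", "segB"]), ToyEvent("e2", ["segC"])]
alledges = []
for event in events:
    for seg in event.segs:
        edge = copy.deepcopy(event)   # keep every other attribute of the event
        edge.segs = [seg]             # but restrict the copy to a single segment
        alledges.append(edge)

print([(e.name, e.segs) for e in alledges])
# [('e1', ['segA']), ('e1', ['segB']), ('e2', ['segC'])]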
Example #5
    def update_for_true_event(self, event, refhistoryid, histScores, totalp):
        refindex = event.histories.index(refhistoryid)
        # need to pop off the simulated history values and recompute the likelihood for the event
        if len(event.histories) > 1:
            event.histories.pop(refindex)
            refpreval = event.prevals.pop(refindex)
            reforder = event.orders.pop(refindex)
            event.likelihood = histseg.compute_likelihood_histories(
                event.histories, histScores, totalp)
            event.numhists = len(event.histories)
            event.compute_timing_wmeansd(histScores)
            event.histories.insert(refindex, refhistoryid)  # reinsert these
            event.prevals.insert(refindex, refpreval)
            event.orders.insert(refindex, reforder)
            upperc = event.uppercosts.pop(refindex)
            lowerc = event.lowercosts.pop(refindex)
            avecost = np.mean(np.array(event.uppercosts + event.lowercosts))
            event.uppercosts.insert(refindex, upperc)
            event.lowercosts.insert(refindex, lowerc)
        else:
            refpreval = event.prevals[refindex]
            reforder = event.orders[refindex]
            avecost = np.mean(np.array(event.uppercosts + event.lowercosts))
            event.likelihood = 0
        (self.refpreval, self.reforder, self.avecost) = (refpreval, reforder, avecost)
def create_CNprofiles_from_Edges(self, histScores, totalp=0):
    edgelist = self.esegs
    mycnprofs = []
    if totalp == 0:
        totalp = ecycles.compute_totalp(histScores)
    (hprofiles, pprofiles) = create_profile_matrices(edgelist, histScores)
    (ridx, profiles) = get_unique_rows(hprofiles)
    goodi = np.where(np.sum(profiles != 0, axis=1) > 0)[0]
    for i in goodi:
        mycnp = CNprofile()
        mycnp.likelihood = ecycles.compute_likelihood_histories(
            histScores[ridx == i, 0], histScores, totalp)
        mycnp.numhists = np.sum(ridx == i)
        cnvals = profiles[i, :]
        pvals = pprofiles[ridx == i, :]
        pvalsd = np.std(pvals, axis=0)
        pvalsm = np.mean(pvals, axis=0)
        mycnp.pvals = pvalsm[cnvals != 0].tolist()
        mycnp.pvalsd = pvalsd[cnvals != 0].tolist()
        mycnp.cnvals = cnvals[cnvals != 0].tolist()
        mycnprofs.append(mycnp)
    self.CNprofiles = mycnprofs
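get_unique_rows is not shown in these examples, but the way its output is used (selecting rows of histScores with ridx == i) matches numpy's unique-rows-with-inverse-index idiom. A self-contained sketch of that grouping, assuming nothing about the project's own helper:

import numpy as np

hprofiles = np.array([[1, 0, 2],
                      [1, 0, 2],
                      [0, 3, 0]])

# profiles holds the distinct rows; ridx[j] says which distinct row original row j collapsed into
profiles, ridx = np.unique(hprofiles, axis=0, return_inverse=True)

for i in range(profiles.shape[0]):
    members = np.where(ridx == i)[0]
    print(i, profiles[i], "from original rows", members)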
    def run(self):
        self.logToMaster("Merging .braney files...")
        opts = self.options
        outputdir = self.outputdir
        #global_dir=self.getGlobalTempDir()
        global_dir = self.outputdir
        cmd = "mkdir -p %s" % global_dir
        logger.info(cmd)
        subprocess.call(cmd, shell=True)
        # get historystats
        historystatsfile = os.path.join(outputdir, "historystats.txt")
        if not os.path.exists(historystatsfile):
            self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
            logger.info("historystatsfile: %s" % historystatsfile)
            CombineHistoryStatsfiles(opts, historystatsfile).run()
        historyScores = np.loadtxt(historystatsfile, dtype=int)
        totalp = histseg.compute_likelihood_histories(historyScores[:, 0], historyScores)
        pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
        self.addChildTarget(CreatePevntsFile(pevntsfile, historyScores, totalp, opts))
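Several of these jobtree targets shell out to "mkdir -p" to make sure an output directory exists. A pure-Python equivalent, sketched here as an alternative rather than as what the project does:

import errno
import os

def ensure_dir(path):
    # same effect as "mkdir -p": create the directory, ignore "already exists"
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise

ensure_dir("output/mcmcdata")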
Example #12
    def __init__(self, event, histScores, totalp, refhistoryid=0):
        self.event = event
        #(segstr, self.sign)=histseg.remove_signs_from_segstr(event.segstr)
        self.sign = 1
        if event.cnval < 0:
            self.sign = -1
        segstr = event.segstr
        self.cnval = event.cnval * self.sign
        # 0 if the edge is a FP, 1 if TP, 2 if TN, -1 if FN, and 3 if it is a
        # linear combination of true events.
        self.isTrue = 0
        self.refindex = -1
        self.refpreval = -1
        self.reforder = -1
        self.avecost = -1
        if refhistoryid in event.histories:
            self.update_for_true_event(event, refhistoryid, histScores, totalp)
            if len(event.histories) > 1:
                self.isTrue = 1
            else:
                self.isTrue = -1
        else:
            self.isTrue = 0
            event.likelihood = histseg.compute_likelihood_histories(
                event.histories, histScores, totalp)
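The constructor above splits the copy-number value into a sign and a magnitude so that downstream comparisons can ignore direction. A self-contained sketch of just that split:

def split_sign(cnval):
    # returns (sign, magnitude); sign is -1 for deletions, +1 otherwise
    sign = -1 if cnval < 0 else 1
    return sign, cnval * sign

print(split_sign(-3))   # (-1, 3)
print(split_sign(2))    # (1, 2)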
    def run(self):
        self.logToMaster("Setting up...")
        opts = self.options
        histseg.Global_BINWIDTH = opts.binwidth
        sampleid = opts.sampleid
        outputdir = opts.outputdir

        historystatsfile = os.path.join(outputdir, "historystats.txt")
        if not os.path.exists(historystatsfile):
            self.logToMaster("Creating historystats.txt...%s" %
                             historystatsfile)
            logger.info("historystatsfile: %s" % historystatsfile)
            pevntsjobtree.CombineHistoryStatsfiles(opts,
                                                   historystatsfile).run()
        self.historyScores = np.loadtxt(historystatsfile, dtype=int)
        self.totalp = histseg.compute_likelihood_histories(
            self.historyScores[:, 0], self.historyScores)
        #check that the *.pevnts file exists.
        pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
        if not os.path.exists(pevntsfile):
            sys.exit("The required %s file does not exist." % pevntsfile)

        pedgesfile = os.path.join(outputdir, sampleid + ".pedgs")
        if opts.pedges or not os.path.exists(pedgesfile):
            self.logToMaster("Creating pedgesfile...%s" % pedgesfile)
            logger.info("pedgesfile: %s" % pedgesfile)
            CreatePedgesFile(pickle.load(open(pevntsfile, 'rb')), pedgesfile,
                             self.historyScores, self.totalp, False).run()

        seghistfile = os.path.join(outputdir, "seghists.txt")
        if opts.sgh or not os.path.exists(seghistfile):
            self.logToMaster("Creating seghists file ... %s" % seghistfile)
            make_seghists_from_edges.main(pickle.load(open(pedgesfile, 'rb')),
                                          self.historyScores, seghistfile)

        # label the seghists if an annotation file is given
        if opts.bedfile:
            labeledfn = os.path.join(outputdir, "seghists.labeled")
            if not os.path.exists(labeledfn):
                pick_and_label_best_seghists.main(seghistfile, opts.bedfile,
                                                  True, labeledfn)
            geneordfn = os.path.join(outputdir, "geneords.txt")
            if opts.geneords or not os.path.exists(geneordfn):
                seghists_to_gene_orders.main(seghistfile, opts.bedfile,
                                             geneordfn)

        mrgpeventsfile = os.path.join(outputdir, sampleid + ".pmevnts")
        if not os.path.exists(mrgpeventsfile):
            self.logToMaster("Creating mpevnts...%s" % mrgpeventsfile)
            logger.info("mrgpeventsfile: %s" % mrgpeventsfile)
            CreateMergedEventsFile(pickle.load(open(pevntsfile, 'rb')),
                                   mrgpeventsfile, self.historyScores).run()

        mrgpedgesfile = os.path.join(outputdir, sampleid + ".pmedgs")
        if not os.path.exists(mrgpedgesfile):
            self.logToMaster("Creating mrgpegesfile...%s" % mrgpedgesfile)
            logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
            CreateMergedEventsFile(pickle.load(open(pedgesfile, 'rb')),
                                   mrgpedgesfile, self.historyScores).run()

        breaksfile = os.path.join(outputdir, "breakpoints.txt")
        if not os.path.exists(breaksfile):
            self.logToMaster("Creating breaksfile...%s" % breaksfile)
            breaklocs = histseg.get_breakpoints(
                pickle.load(open(pedgesfile, 'rb')), opts.trueID)
            breaklocs2 = histseg.get_breakpoints(
                pickle.load(open(mrgpedgesfile, 'rb')), opts.trueID)
            breaksfh = open(breaksfile, 'w')
            for loc in sorted(breaklocs.keys()):
                (n, t) = breaklocs[loc]
                (n2, t2) = breaklocs2[loc]
                breaksfh.write("%s\t%d\t%d\t%d\t%d\n" % (loc, n, t, n2, t2))

        # Creating links is no longer an option.
        #linksfile =os.path.join(outputdir, sampleid +".links")
        #if opts.links and not os.path.exists(linksfile):
        #	self.logToMaster("Creating linksfile...%s" % linksfile)
        #	logger.info("linksfile: %s" % linksfile)
        #	self.addChildTarget(CreateLinksFile(pevntsfile, linksfile, self.totalp))

        #Annotating Events is no longer an option. Seghists are annotated instead.
        #annotationfile=os.path.join(outputdir, "evnts.ann")
        #only create the annotations file here if we aren't doing gene ranking.  Otherwise the gene rank option will create the annotation file for itself.
        #if opts.ann and not opts.generank and not os.path.exists(annotationsfile):
        #	logger.info("annotationfile: %s" % annotationfile)
        #	if not self.events:
        #		self.events=pickle.load(open(pevntsfile, 'rb'))
        #	self.addChildTarget(CreateAnnotationFile(self.events, opts.tabixfile, annotationfile))

        # generank isn't an option - geneords is done instead using seghists.
        #generankfile=os.path.join(outputdir, "generanks.txt")
        # annotation file comes before generankfile (gene ranking depends on annotations.)
        #if opts.generank and not os.path.exists(generankfile):
        #	self.logToMaster("Creating generankfile: %s" % generankfile)
        #	logger.info("generankfile: %s" % generankfile)
        #	if not self.events:
        #		self.events=pickle.load(open(pevntsfile, 'rb'))
        #		self.historyScores=np.loadtxt(historystatsfile, dtype=int)

        if opts.mcmcmix:
            self.logToMaster("Setting up MCMC analysis")
            mcmcdir = os.path.join(outputdir, "mcmcdata")
            mcmcdat = os.path.join(mcmcdir, "edge_counts.dat")
            if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
                subprocess.call("mkdir -p %s" % mcmcdir, shell=True)
                opts.pevnts = pevntsfile
                opts.pedges = pedgesfile
                self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))

        if opts.simulation:
            self.logToMaster("Setting up Simulation analysis")
            simoutput = os.path.join(outputdir, "events.stats")
            if ((not os.path.exists(simoutput))
                    or (os.path.getsize(simoutput) == 0)):
                self.addChildTarget(
                    SimAnalysisJob(pevntsfile, opts.trueID, self.historyScores,
                                   "events", outputdir, opts.binwidth))
            simoutput2 = os.path.join(outputdir, "edges.stats")
            if ((not os.path.exists(simoutput2))
                    or (os.path.getsize(simoutput2) == 0)):
                self.addChildTarget(
                    SimAnalysisJob(pedgesfile, opts.trueID, self.historyScores,
                                   "edges", outputdir, opts.binwidth))
            simoutput3 = os.path.join(outputdir, "mrgedges.stats")
            if ((not os.path.exists(simoutput3))
                    or (os.path.getsize(simoutput3) == 0)):
                self.addChildTarget(
                    SimAnalysisJob(mrgpedgesfile, opts.trueID,
                                   self.historyScores, "mrgedges", outputdir,
                                   opts.binwidth))
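Each analysis step in this run method is guarded so it only executes when its output is missing (or present but empty), which makes the whole pipeline safe to re-run after a partial failure. The guard, pulled out as a small helper whose name is ours, not the project's:

import os

def needs_rebuild(path):
    # rebuild when the file is missing, or was created but left empty
    return (not os.path.exists(path)) or (os.path.getsize(path) == 0)

if needs_rebuild("output/events.stats"):
    print("would launch the events SimAnalysisJob here")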
def add_score_edges_options(parser):
    parser.add_argument('pevnts', help='a .pevnts file.')
    parser.add_argument('historystats', help='The file with historystats')
    parser.add_argument('--outpickle', help='pickle the edges and write them to this file.')
    parser.add_argument('--edges', help='write edges to this file in text (not pickled).')
    parser.add_argument('--prevalence_error', help='the difference in prevalences to be considered the same.', type=float, default=0.05)
    parser.add_argument('--ignore_cn', help='merge together edges with different CN values.', default=False, action='store_true')
    parser.add_argument('--totalp', help='total probability of the histories', type=float)
    parser.add_argument('--binwidth', help='the multiplier between history ids of independent runs', default=histseg.Global_BINWIDTH, type=int)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Given a .pevnts file, split events into edges, combine equivalent edges (segments or adjacencies), and score them by likelihood.')
    add_score_edges_options(parser)
    args = parser.parse_args()
    histseg.Global_BINWIDTH = args.binwidth
    allevents = pickle.load(open(args.pevnts, 'rb'))
    historyScores = np.loadtxt(args.historystats, dtype=int)
    totalp = 0
    if args.totalp:
        totalp = args.totalp
    else:
        totalp = histseg.compute_likelihood_histories(historyScores[:, 0], historyScores)
    alledges = score_edges_within_pevents(allevents, historyScores, totalp, args.prevalence_error, args.ignore_cn)
    if args.edges:
        outfile = open(args.edges, 'w')
        for edge in alledges:
            outfile.write(str(edge))
    if args.outpickle:
        pickle.dump(alledges, open(args.outpickle, 'wb'), pickle.HIGHEST_PROTOCOL)
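For quick testing, the same option set can be exercised without a shell by handing parse_args an explicit argument list. A minimal sketch, assuming add_score_edges_options and histseg are importable; the file names are placeholders:

import argparse

parser = argparse.ArgumentParser()
add_score_edges_options(parser)
args = parser.parse_args([
    "sample.pevnts", "historystats.txt",
    "--edges", "sample.edges.txt",
    "--ignore_cn",
])
print(args.prevalence_error)   # 0.05, the default defined above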

