def score_and_link_cycles(args):
    """Load history scores and events, then optionally dump and link them.

    Attributes read from ``args``: ``binwidth``, ``historystats``, ``cnavg``,
    ``totalp``, ``inpickle``, ``outpickle``, ``events`` and ``links``.

    Steps:
      1. If ``args.binwidth`` is truthy it overrides
         ``histseg.Global_BINWIDTH``.
      2. History scores are loaded from ``args.historystats`` when that file
         exists; otherwise they are combined from the ``args.cnavg``
         directory (and saved to ``args.historystats`` if a path was given).
      3. ``totalp`` is taken from ``args.totalp`` or computed from the scores.
      4. Events come from the cnavg directory or from ``args.inpickle``.
      5. Events are optionally pickled (``outpickle``), written as text
         (``events``) and linked (``links``).

    Exits via ``sys.exit`` when no history scores can be obtained, or when
    linking is requested without events or without a total probability.
    """
    if args.binwidth:
        histseg.Global_BINWIDTH = args.binwidth

    # Start from None so a missing source is detected explicitly instead of
    # surfacing as a NameError / ambiguous-truth-value error further down.
    historyScores = None
    if args.historystats and os.path.isfile(args.historystats):
        historyScores = np.loadtxt(args.historystats, dtype=int)
    elif args.cnavg:
        historyScores = histseg.combine_history_statsfiles(args.cnavg)
        if args.historystats:
            # Cache the combined scores for later runs.
            np.savetxt(args.historystats, historyScores, fmt="%d",
                       delimiter='\t')
    if historyScores is None or np.size(historyScores) == 0:
        sys.exit("Need to use --historystats or --cnavg option")

    if args.totalp:
        totalp = args.totalp
    else:
        totalp = histseg.compute_likelihood_histories(historyScores[:, 0],
                                                      historyScores)

    allevents = []
    if args.cnavg:
        sys.stderr.write("using cnavg dir: %s\n" % (args.cnavg))
        allevents = histseg.get_events_from_cnavgdir(args.cnavg,
                                                     historyScores, totalp)
    elif args.inpickle and os.path.isfile(args.inpickle):
        sys.stderr.write("using pickle file\n")
        # NOTE(review): unpickling runs arbitrary code; only load trusted files.
        with open(args.inpickle, 'rb') as infile:
            allevents = pickle.load(infile)
    sys.stderr.write("there are %d events\n" % (len(allevents)))

    if args.outpickle:
        for event in allevents:
            event.trim()
        with open(args.outpickle, 'wb') as eventfile:
            pickle.dump(allevents, eventfile, pickle.HIGHEST_PROTOCOL)

    if args.events:
        with open(args.events, 'w') as eventfile:
            for evnt in allevents:
                eventfile.write("%s" % (str(evnt)))

    # link the events...
    if args.links:
        if not allevents:
            sys.exit(
                "Need events to link! use --inpickle or --cnavg or --events")
        if not totalp:
            sys.exit("Need a --totalp or --cnavg or --historystats options")
        eventlinks = link_events_by_order_within_histories(allevents)
        with open(args.links, 'w') as linkfile:
            for link in eventlinks:
                # NOTE(review): totalp is not passed here, unlike other call
                # sites of compute_likelihood_histories -- confirm intent.
                link.likelihood = histseg.compute_likelihood_histories(
                    link.histories, historyScores)
                linkfile.write("%s" % (str(link)))
def analyze_simulation(events, refhistoryid, histScores, datout_fh, stats_fh,
                       breaks_fh, outdir):
    """Classify events against a reference history and write summary files.

    Args:
        events: iterable of event objects; each is wrapped in an
            ``EdgeSimulationData`` and asked for ``determineEventType()``.
        refhistoryid: id of the reference (simulated/true) history.
        histScores: 2-D array of history scores; column 0 is compared against
            ``refhistoryid``. **Modified in place**: the matching row(s) are
            zeroed.
        datout_fh: open file for per-event rows, or a falsy value to skip.
        stats_fh: open file for the TP/FP/FN/TN/EX table and F1 score, or a
            falsy value to skip.
        breaks_fh: open file for breakpoint counts, or a falsy value to skip.
        outdir: unused by this function; kept for interface compatibility.
    """
    do_order_correction.main(events, 0, histScores)
    # Make the cost of the reference history 0 so that it doesn't get
    # included in the likelihood calculation.
    histScores[np.where(histScores[:, 0] == refhistoryid), :] = 0
    totalp = histseg.compute_likelihood_histories(histScores[:, 0], histScores)
    sys.stderr.write("totalp is %f\n" % totalp)

    ntypes = len(histseg.Global_EVENTTYPES)
    TP = np.zeros(ntypes, dtype=int)
    FP = np.zeros(ntypes, dtype=int)
    TN = np.zeros(ntypes, dtype=int)  # never incremented below
    FN = np.zeros(ntypes, dtype=int)
    explained = np.zeros(ntypes, dtype=int)
    FNesims = []
    FPesims = []
    myEdgeSimData = []
    for event in events:
        myedgesim = EdgeSimulationData(event, histScores, totalp, refhistoryid)
        etype = event.determineEventType()
        # Index 0 holds the overall total; index etype the per-type count.
        if myedgesim.isTrue == 1:
            TP[0] += 1
            TP[etype] += 1
        elif myedgesim.isTrue == -1:
            FN[0] += 1
            FN[etype] += 1
            FNesims.append(myedgesim)
        elif myedgesim.isTrue == 0:
            FP[0] += 1
            FP[etype] += 1
            FPesims.append(myedgesim)
        myEdgeSimData.append(myedgesim)
    check_for_linear_decomp(FNesims, FPesims, explained)
    # Element-wise equivalent of the former xrange loop.
    FN -= TN

    if datout_fh:
        header = "event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
        datout_fh.write(header)
        for edgesim in myEdgeSimData:
            datout_fh.write(str(edgesim))

    if stats_fh:
        def tabbed(counts):
            return "\t".join(map(str, counts))

        stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
        stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\nEX\t%s\n" %
                       (tabbed(TP), tabbed(FP), tabbed(FN), tabbed(TN),
                        tabbed(explained)))
        tp = TP[0] + explained[0]
        fn = FN[0] - explained[0]
        denom = 2 * tp + fn + FP[0]
        # With no events at all the denominator is 0; report 0.0 rather than
        # crashing with ZeroDivisionError.
        f1score = float(2 * tp) / float(denom) if denom else 0.0
        stats_fh.write("F1Score:\t%s\n" % (str(f1score)))

    if breaks_fh:
        # The original referenced an undefined name ``edges`` here; the
        # collection passed to this function is ``events``.
        breakpoints = histseg.get_breakpoints(events, refhistoryid)
        for loc in breakpoints.keys():
            (n, t) = breakpoints[loc]
            breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
        breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))
def run(self):
    """Prepare history statistics and schedule .pevnts creation and analysis.

    Reads ``binwidth``, ``sampleid``, ``outputdir`` and ``pevnts`` from
    ``self.options``. Sets ``self.historyScores`` and ``self.totalp``, adds a
    ``CreatePevntsFile`` child target when requested or when the file is
    missing, and always sets ``DoAnalysisOfMergedEvents`` as the follow-on.
    """
    self.logToMaster("Setting up...")
    opts = self.options
    histseg.Global_BINWIDTH = opts.binwidth
    sampleid = opts.sampleid
    outputdir = opts.outputdir
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed to mkdir verbatim.
    subprocess.call(["mkdir", "-p", outputdir])

    historystatsfile = os.path.join(outputdir, "historystats.txt")
    if not os.path.exists(historystatsfile):
        self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
        logger.info("historystatsfile: %s" % historystatsfile)
        pevntsjobtree.CombineHistoryStatsfiles(opts, historystatsfile).run()
    self.historyScores = np.loadtxt(historystatsfile, dtype=int)
    self.totalp = histseg.compute_likelihood_histories(
        self.historyScores[:, 0], self.historyScores)
    logger.info("Global_BINWIDTH: %d" % histseg.Global_BINWIDTH)
    logger.info("totalp is %s" % str(self.totalp))

    pevntsfile = os.path.join(outputdir, sampleid + ".pevnts")
    if opts.pevnts or not os.path.exists(pevntsfile):
        self.logToMaster("Creating pevntsfile...%s" % pevntsfile)
        logger.info("pevntsfile: %s" % pevntsfile)
        self.addChildTarget(
            pevntsjobtree.CreatePevntsFile(pevntsfile, self.historyScores,
                                           self.totalp, opts))
    self.setFollowOnTarget(DoAnalysisOfMergedEvents(opts))
def score_edges_within_pevents(allevents, historyScores, totalp,
                               prev_error=0.05, ignore_cn=True):
    """Split events into single-segment edges, merge duplicates and score them.

    Args:
        allevents: iterable of event objects exposing ``unpack()`` and
            ``segs``.
        historyScores: history score table passed to ``edge.update`` and to
            ``histseg.compute_likelihood_histories``.
        totalp: total probability used to normalise each edge likelihood.
        prev_error: accepted for interface compatibility; this function does
            not currently use it.
        ignore_cn: when true, merge with ``unique_loc_edges``; otherwise with
            ``histseg.unique_c_events_list``.

    Returns:
        The list of unique edges, each updated, scored and trimmed.
    """
    sys.stderr.write("number of events: %d\n" % (len(allevents)))
    sys.stderr.write("ignore_cn: %s\n" % ignore_cn)

    alledges = []
    for event in allevents:
        event.unpack()
        for seg in event.segs:
            # Each edge is a full copy of its event restricted to one segment.
            edge = copy.deepcopy(event)
            edge.segs = [seg]
            edge.make_segstr()
            alledges.append(edge)

    if ignore_cn:
        unique_edges = unique_loc_edges(alledges)
    else:
        unique_edges = histseg.unique_c_events_list(alledges)

    sys.stderr.write("totalp: %s\n" % (str(totalp)))
    for edge in unique_edges:
        edge.update(historyScores)
        edge.likelihood = histseg.compute_likelihood_histories(
            edge.histories, historyScores, totalp)
        edge.trim()
    return unique_edges
def update_for_true_event(self, event, refhistoryid, histScores, totalp):
    """Record reference-history values for an event that occurs in it.

    Stores the event's prevalence and order in the reference history, plus
    the mean of its upper/lower costs, on ``self.refpreval``,
    ``self.reforder`` and ``self.avecost``.

    When the event is seen in more than one history, the reference entry is
    temporarily removed so that likelihood, ``numhists``, the timing
    statistics and the average cost are computed without it; the removed
    entries are then put back in their original positions. When the
    reference history is the only one, the likelihood is set to 0 and the
    cost average covers the event's costs as they are.
    """
    pos = event.histories.index(refhistoryid)

    if len(event.histories) <= 1:
        # Only the reference history contains this event.
        ref_prevalence = event.prevals[pos]
        ref_order = event.orders[pos]
        mean_cost = np.mean(np.array(event.uppercosts + event.lowercosts))
        event.likelihood = 0
    else:
        # Take the reference history out while recomputing the statistics.
        event.histories.pop(pos)
        ref_prevalence = event.prevals.pop(pos)
        ref_order = event.orders.pop(pos)
        event.likelihood = histseg.compute_likelihood_histories(
            event.histories, histScores, totalp)
        event.numhists = len(event.histories)
        event.compute_timing_wmeansd(histScores)
        # Restore the reference entries at their original index.
        event.histories.insert(pos, refhistoryid)
        event.prevals.insert(pos, ref_prevalence)
        event.orders.insert(pos, ref_order)

        # Same remove / compute / restore pattern for the costs.
        held_upper = event.uppercosts.pop(pos)
        held_lower = event.lowercosts.pop(pos)
        remaining = event.uppercosts + event.lowercosts
        mean_cost = np.mean(np.array(remaining))
        event.uppercosts.insert(pos, held_upper)
        event.lowercosts.insert(pos, held_lower)

    self.refpreval = ref_prevalence
    self.reforder = ref_order
    self.avecost = mean_cost
def score_and_link_cycles(args):
    """Load history scores and events, then optionally dump and link them.

    Attributes read from ``args``: ``binwidth``, ``historystats``, ``cnavg``,
    ``totalp``, ``inpickle``, ``outpickle``, ``events`` and ``links``.

    History scores are loaded from ``args.historystats`` when that file
    exists; otherwise they are combined from the ``args.cnavg`` directory
    (and written to ``args.historystats`` if a path was given). Events come
    from the cnavg directory or from ``args.inpickle``.

    Exits via ``sys.exit`` when no history scores can be obtained, or when
    linking is requested without events or without a total probability.
    """
    if args.binwidth:
        histseg.Global_BINWIDTH = args.binwidth

    # None marks "no source found"; testing a NumPy array with ``not`` would
    # raise ValueError, and an unassigned name would raise NameError.
    historyScores = None
    if args.historystats and os.path.isfile(args.historystats):
        historyScores = np.loadtxt(args.historystats, dtype=int)
    elif args.cnavg:
        historyScores = histseg.combine_history_statsfiles(args.cnavg)
        if args.historystats:
            np.savetxt(args.historystats, historyScores, fmt="%d",
                       delimiter='\t')
    if historyScores is None or np.size(historyScores) == 0:
        sys.exit("Need to use --historystats or --cnavg option")

    if args.totalp:
        totalp = args.totalp
    else:
        totalp = histseg.compute_likelihood_histories(historyScores[:, 0],
                                                      historyScores)

    allevents = []
    if args.cnavg:
        sys.stderr.write("using cnavg dir: %s\n" % (args.cnavg))
        allevents = histseg.get_events_from_cnavgdir(args.cnavg,
                                                     historyScores, totalp)
    elif args.inpickle and os.path.isfile(args.inpickle):
        sys.stderr.write("using pickle file\n")
        # NOTE(review): unpickling runs arbitrary code; only load trusted files.
        with open(args.inpickle, 'rb') as infile:
            allevents = pickle.load(infile)
    sys.stderr.write("there are %d events\n" % (len(allevents)))

    if args.outpickle:
        for event in allevents:
            event.trim()
        with open(args.outpickle, 'wb') as eventfile:
            pickle.dump(allevents, eventfile, pickle.HIGHEST_PROTOCOL)

    if args.events:
        with open(args.events, 'w') as eventfile:
            for evnt in allevents:
                eventfile.write("%s" % (str(evnt)))

    # link the events...
    if args.links:
        if not allevents:
            sys.exit("Need events to link! use --inpickle or --cnavg or --events")
        if not totalp:
            sys.exit("Need a --totalp or --cnavg or --historystats options")
        eventlinks = link_events_by_order_within_histories(allevents)
        with open(args.links, 'w') as linkfile:
            for link in eventlinks:
                # NOTE(review): totalp is not passed here, unlike other call
                # sites of compute_likelihood_histories -- confirm intent.
                link.likelihood = histseg.compute_likelihood_histories(
                    link.histories, historyScores)
                linkfile.write("%s" % (str(link)))
def run(self):
    """Ensure history statistics exist, then schedule .pevnts creation.

    Uses ``self.options`` and ``self.outputdir``. Creates the output
    directory, builds ``historystats.txt`` if it is missing, loads it, computes
    the total probability over all histories and adds a ``CreatePevntsFile``
    child target for ``<sampleid>.pevnts``.
    """
    self.logToMaster("Merging .braney files...")
    opts = self.options
    outputdir = self.outputdir
    # The global temp dir is deliberately not used; work in the output dir.
    global_dir = self.outputdir
    logger.info("mkdir -p %s" % global_dir)
    # Argument list instead of a shell string so that unusual characters in
    # the path cannot be interpreted by a shell.
    subprocess.call(["mkdir", "-p", global_dir])

    # get historystats
    historystatsfile = os.path.join(outputdir, "historystats.txt")
    if not os.path.exists(historystatsfile):
        self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
        logger.info("historystatsfile: %s" % historystatsfile)
        CombineHistoryStatsfiles(opts, historystatsfile).run()
    historyScores = np.loadtxt(historystatsfile, dtype=int)
    totalp = histseg.compute_likelihood_histories(historyScores[:, 0],
                                                  historyScores)

    pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
    self.addChildTarget(
        CreatePevntsFile(pevntsfile, historyScores, totalp, opts))
def create_CNprofiles_from_Edges(self, histScores, totalp=0):
    """Build the distinct copy-number profiles found across histories.

    Profile matrices are built from ``self.esegs`` and ``histScores``; the
    history profile rows are de-duplicated, and every unique row with at
    least one non-zero entry becomes a ``CNprofile``. The result is stored in
    ``self.CNprofiles``.

    Args:
        histScores: history score table; column 0 is passed to
            ``ecycles.compute_likelihood_histories`` as the history ids.
        totalp: total probability used for normalisation. When 0 it is
            computed from ``histScores``.
    """
    edgelist = self.esegs
    mycnprofs = []
    if totalp == 0:
        # The original passed an undefined name (``historyScores``) here.
        totalp = ecycles.compute_totalp(histScores)
    (hprofiles, pprofiles) = create_profile_matrices(edgelist, histScores)
    # ridx labels each history row with the index of its unique profile.
    (ridx, profiles) = get_unique_rows(hprofiles)
    # Keep only the profiles that have at least one non-zero value.
    goodi = np.where(np.sum(profiles != 0, axis=1) > 0)[0]
    for i in goodi:
        members = ridx == i
        mycnp = CNprofile()
        mycnp.likelihood = ecycles.compute_likelihood_histories(
            histScores[members, 0], histScores, totalp)
        mycnp.numhists = np.sum(members)
        cnvals = profiles[i, :]
        pvals = pprofiles[members, :]
        pvalsd = np.std(pvals, axis=0)
        pvalsm = np.mean(pvals, axis=0)
        # Report values only at positions with a non-zero copy-number value.
        nonzero = cnvals != 0
        mycnp.pvals = pvalsm[nonzero].tolist()
        mycnp.pvalsd = pvalsd[nonzero].tolist()
        mycnp.cnvals = cnvals[nonzero].tolist()
        mycnprofs.append(mycnp)
    self.CNprofiles = mycnprofs
def create_CNprofiles_from_Edges(self, histScores, totalp=0):
    """Build the distinct copy-number profiles found across histories.

    Profile matrices are built from ``self.esegs`` and ``histScores``; the
    history profile rows are de-duplicated, and every unique row with at
    least one non-zero entry becomes a ``CNprofile``. The result is stored in
    ``self.CNprofiles``.

    Args:
        histScores: history score table; column 0 is passed to
            ``ecycles.compute_likelihood_histories`` as the history ids.
        totalp: total probability used for normalisation. When 0 it is
            computed from ``histScores``.
    """
    edgelist = self.esegs
    mycnprofs = []
    if totalp == 0:
        # Fixed: the original referenced ``historyScores``, which is not
        # defined in this scope, so the default totalp raised NameError.
        totalp = ecycles.compute_totalp(histScores)
    (hprofiles, pprofiles) = create_profile_matrices(edgelist, histScores)
    # ridx labels each history row with the index of its unique profile.
    (ridx, profiles) = get_unique_rows(hprofiles)
    # Keep only the profiles that have at least one non-zero value.
    goodi = np.where(np.sum(profiles != 0, axis=1) > 0)[0]
    for i in goodi:
        members = ridx == i
        mycnp = CNprofile()
        mycnp.likelihood = ecycles.compute_likelihood_histories(
            histScores[members, 0], histScores, totalp)
        mycnp.numhists = np.sum(members)
        cnvals = profiles[i, :]
        pvals = pprofiles[members, :]
        pvalsd = np.std(pvals, axis=0)
        pvalsm = np.mean(pvals, axis=0)
        # Report values only at positions with a non-zero copy-number value.
        nonzero = cnvals != 0
        mycnp.pvals = pvalsm[nonzero].tolist()
        mycnp.pvalsd = pvalsd[nonzero].tolist()
        mycnp.cnvals = cnvals[nonzero].tolist()
        mycnprofs.append(mycnp)
    self.CNprofiles = mycnprofs
def run(self):
    """Ensure history statistics exist, then schedule .pevnts creation.

    Uses ``self.options`` and ``self.outputdir``. Creates the output
    directory, builds ``historystats.txt`` if it is missing, loads it, computes
    the total probability over all histories and adds a ``CreatePevntsFile``
    child target for ``<sampleid>.pevnts``.
    """
    self.logToMaster("Merging .braney files...")
    opts = self.options
    outputdir = self.outputdir
    # The global temp dir is deliberately not used; work in the output dir.
    global_dir = self.outputdir
    logger.info("mkdir -p %s" % global_dir)
    # Pass mkdir its arguments as a list: no shell is involved, so spaces or
    # metacharacters in the path are handled literally.
    subprocess.call(["mkdir", "-p", global_dir])

    # get historystats
    historystatsfile = os.path.join(outputdir, "historystats.txt")
    if not os.path.exists(historystatsfile):
        self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
        logger.info("historystatsfile: %s" % historystatsfile)
        CombineHistoryStatsfiles(opts, historystatsfile).run()
    historyScores = np.loadtxt(historystatsfile, dtype=int)
    totalp = histseg.compute_likelihood_histories(historyScores[:, 0],
                                                  historyScores)

    pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
    self.addChildTarget(
        CreatePevntsFile(pevntsfile, historyScores, totalp, opts))
def score_edges_within_pevents(allevents, historyScores, totalp,
                               prev_error=0.05, ignore_cn=True):
    """Split events into single-segment edges, merge duplicates and score them.

    Args:
        allevents: iterable of event objects exposing ``unpack()`` and
            ``segs``.
        historyScores: history score table passed to ``edge.update`` and to
            ``histseg.compute_likelihood_histories``.
        totalp: total probability used to normalise each edge likelihood.
        prev_error: accepted for interface compatibility; this function does
            not currently use it (the original only copied it to an unused
            local).
        ignore_cn: when true, merge with ``unique_loc_edges``; otherwise with
            ``histseg.unique_c_events_list``.

    Returns:
        The list of unique edges, each updated, scored and trimmed.
    """
    sys.stderr.write("number of events: %d\n" % (len(allevents)))
    sys.stderr.write("ignore_cn: %s\n" % ignore_cn)

    alledges = []
    for event in allevents:
        event.unpack()
        for seg in event.segs:
            # Each edge is a full copy of its event restricted to one segment.
            edge = copy.deepcopy(event)
            edge.segs = [seg]
            edge.make_segstr()
            alledges.append(edge)

    if ignore_cn:
        unique_edges = unique_loc_edges(alledges)
    else:
        unique_edges = histseg.unique_c_events_list(alledges)

    sys.stderr.write("totalp: %s\n" % (str(totalp)))
    for edge in unique_edges:
        edge.update(historyScores)
        edge.likelihood = histseg.compute_likelihood_histories(
            edge.histories, historyScores, totalp)
        edge.trim()
    return unique_edges
def __init__(self, event, histScores, totalp, refhistoryid=0):
    """Wrap an event and classify it against the reference history.

    ``self.isTrue`` encodes the classification: 0 = false positive,
    1 = true positive, 2 = true negative, -1 = false negative, 3 = linear
    combination of true events. This constructor only assigns 0, 1 or -1.

    ``self.sign`` is -1 for a negative ``event.cnval`` and 1 otherwise;
    ``self.cnval`` is the event's value multiplied by that sign.
    """
    self.event = event
    self.sign = -1 if event.cnval < 0 else 1
    segstr = event.segstr  # read as in the original; the value is not used
    self.cnval = event.cnval * self.sign

    # Defaults, overwritten below for events found in the reference history.
    self.isTrue = 0
    self.refindex = -1
    self.refpreval = -1
    self.reforder = -1
    self.avecost = -1

    if refhistoryid not in event.histories:
        # Not part of the reference history: a false positive.
        self.isTrue = 0
        event.likelihood = histseg.compute_likelihood_histories(
            event.histories, histScores, totalp)
        return

    self.update_for_true_event(event, refhistoryid, histScores, totalp)
    # Seen in other histories too -> TP; only in the reference -> FN.
    found_elsewhere = len(event.histories) > 1
    self.isTrue = 1 if found_elsewhere else -1
def __init__(self, event, histScores, totalp, refhistoryid=0):
    """Wrap an event and classify it against the reference history.

    ``self.isTrue`` encodes the classification: 0 = false positive,
    1 = true positive, 2 = true negative, -1 = false negative, 3 = linear
    combination of true events. This constructor only assigns 0, 1 or -1.

    ``self.sign`` is -1 for a negative ``event.cnval`` and 1 otherwise;
    ``self.cnval`` is the event's value multiplied by that sign.
    """
    self.event = event
    self.sign = -1 if event.cnval < 0 else 1
    segstr = event.segstr  # read as in the original; the value is not used
    self.cnval = event.cnval * self.sign

    # Defaults, overwritten below for events found in the reference history.
    self.isTrue = 0
    self.refindex = -1
    self.refpreval = -1
    self.reforder = -1
    self.avecost = -1

    in_reference = refhistoryid in event.histories
    if in_reference:
        self.update_for_true_event(event, refhistoryid, histScores, totalp)
        # Seen in other histories too -> TP; only in the reference -> FN.
        if len(event.histories) > 1:
            self.isTrue = 1
        else:
            self.isTrue = -1
        return

    # Not part of the reference history: a false positive.
    self.isTrue = 0
    event.likelihood = histseg.compute_likelihood_histories(
        event.histories, histScores, totalp)
def run(self):
    """Prepare history statistics and schedule .pevnts creation and analysis.

    Reads ``binwidth``, ``sampleid``, ``outputdir`` and ``pevnts`` from
    ``self.options``. Sets ``self.historyScores`` and ``self.totalp``, adds a
    ``CreatePevntsFile`` child target when requested or when the file is
    missing, and always sets ``DoAnalysisOfMergedEvents`` as the follow-on.
    """
    self.logToMaster("Setting up...")
    opts = self.options
    histseg.Global_BINWIDTH = opts.binwidth
    sampleid = opts.sampleid
    outputdir = opts.outputdir
    # No shell: the directory name reaches mkdir as a single literal argument
    # even if it contains spaces or shell metacharacters.
    subprocess.call(["mkdir", "-p", outputdir])

    historystatsfile = os.path.join(outputdir, "historystats.txt")
    if not os.path.exists(historystatsfile):
        self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
        logger.info("historystatsfile: %s" % historystatsfile)
        pevntsjobtree.CombineHistoryStatsfiles(opts, historystatsfile).run()
    self.historyScores = np.loadtxt(historystatsfile, dtype=int)
    self.totalp = histseg.compute_likelihood_histories(
        self.historyScores[:, 0], self.historyScores)
    logger.info("Global_BINWIDTH: %d" % histseg.Global_BINWIDTH)
    logger.info("totalp is %s" % str(self.totalp))

    pevntsfile = os.path.join(outputdir, sampleid + ".pevnts")
    if opts.pevnts or not os.path.exists(pevntsfile):
        self.logToMaster("Creating pevntsfile...%s" % pevntsfile)
        logger.info("pevntsfile: %s" % pevntsfile)
        self.addChildTarget(
            pevntsjobtree.CreatePevntsFile(pevntsfile, self.historyScores,
                                           self.totalp, opts))
    self.setFollowOnTarget(DoAnalysisOfMergedEvents(opts))
def update_for_true_event(self, event, refhistoryid, histScores, totalp):
    """Record reference-history values for an event that occurs in it.

    Stores the event's prevalence and order in the reference history, plus
    the mean of its upper/lower costs, on ``self.refpreval``,
    ``self.reforder`` and ``self.avecost``.

    When the event is seen in more than one history, the reference entry is
    temporarily removed so that likelihood, ``numhists``, the timing
    statistics and the average cost are computed without it; the removed
    entries are then put back in their original positions. When the
    reference history is the only one, the likelihood is set to 0 and the
    cost average covers the event's costs as they are.
    """
    where = event.histories.index(refhistoryid)
    only_in_reference = not (len(event.histories) > 1)

    if only_in_reference:
        saved_preval = event.prevals[where]
        saved_order = event.orders[where]
        cost_mean = np.mean(np.array(event.uppercosts + event.lowercosts))
        event.likelihood = 0
    else:
        # Drop the simulated history's values while recomputing the scores.
        event.histories.pop(where)
        saved_preval = event.prevals.pop(where)
        saved_order = event.orders.pop(where)
        event.likelihood = histseg.compute_likelihood_histories(
            event.histories, histScores, totalp)
        event.numhists = len(event.histories)
        event.compute_timing_wmeansd(histScores)
        # Put the reference entries back where they were.
        event.histories.insert(where, refhistoryid)
        event.prevals.insert(where, saved_preval)
        event.orders.insert(where, saved_order)

        # Average the costs with the reference entry excluded, then restore.
        saved_upper = event.uppercosts.pop(where)
        saved_lower = event.lowercosts.pop(where)
        all_costs = np.array(event.uppercosts + event.lowercosts)
        cost_mean = np.mean(all_costs)
        event.uppercosts.insert(where, saved_upper)
        event.lowercosts.insert(where, saved_lower)

    self.refpreval = saved_preval
    self.reforder = saved_order
    self.avecost = cost_mean
def run(self):
    """Create every derived analysis file that is missing or re-requested.

    Working from ``self.options`` (``outputdir``, ``sampleid``, ``binwidth``,
    ``trueID`` and the ``pedges``/``sgh``/``bedfile``/``geneords``/
    ``mcmcmix``/``simulation`` switches), this:

      * builds and loads ``historystats.txt`` and computes ``self.totalp``;
      * requires ``<sampleid>.pevnts`` to exist (exits otherwise);
      * creates ``.pedgs``, ``seghists.txt``, optional labelled seghists and
        gene orders, ``.pmevnts``, ``.pmedgs`` and ``breakpoints.txt``;
      * optionally schedules MCMC set-up and simulation-analysis child
        targets.

    Link files, event annotation and gene ranking are no longer produced
    here; seghists are annotated and gene orders derived from them instead.
    """
    self.logToMaster("Setting up...")
    opts = self.options
    histseg.Global_BINWIDTH = opts.binwidth
    sampleid = opts.sampleid
    outputdir = opts.outputdir

    def load_pickle(path):
        # Close the handle promptly instead of leaking it.
        # NOTE(review): unpickling runs arbitrary code; these files are
        # presumably produced by this pipeline -- confirm they are trusted.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    historystatsfile = os.path.join(outputdir, "historystats.txt")
    if not os.path.exists(historystatsfile):
        self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
        logger.info("historystatsfile: %s" % historystatsfile)
        pevntsjobtree.CombineHistoryStatsfiles(opts, historystatsfile).run()
    self.historyScores = np.loadtxt(historystatsfile, dtype=int)
    self.totalp = histseg.compute_likelihood_histories(
        self.historyScores[:, 0], self.historyScores)

    # check that the *.pevnts file exists.
    pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
    if not os.path.exists(pevntsfile):
        sys.exit("The required %s file does not exist." % pevntsfile)

    pedgesfile = os.path.join(outputdir, sampleid + ".pedgs")
    if opts.pedges or not os.path.exists(pedgesfile):
        self.logToMaster("Creating pedgesfile...%s" % pedgesfile)
        logger.info("pedgesfile: %s" % pedgesfile)
        CreatePedgesFile(load_pickle(pevntsfile), pedgesfile,
                         self.historyScores, self.totalp, False).run()

    seghistfile = os.path.join(outputdir, "seghists.txt")
    if opts.sgh or not os.path.exists(seghistfile):
        self.logToMaster("Creating seghists file ... %s" % seghistfile)
        make_seghists_from_edges.main(load_pickle(pedgesfile),
                                      self.historyScores, seghistfile)

    # label the seghists if an annotation file is given
    if opts.bedfile:
        labeledfn = os.path.join(outputdir, "seghists.labeled")
        if not os.path.exists(labeledfn):
            pick_and_label_best_seghists.main(seghistfile, opts.bedfile,
                                              True, labeledfn)
        geneordfn = os.path.join(outputdir, "geneords.txt")
        if opts.geneords or not os.path.exists(geneordfn):
            seghists_to_gene_orders.main(seghistfile, opts.bedfile, geneordfn)

    mrgpeventsfile = os.path.join(outputdir, sampleid + ".pmevnts")
    if not os.path.exists(mrgpeventsfile):
        self.logToMaster("Creating mpevnts...%s" % mrgpeventsfile)
        logger.info("mrgpeventsfile: %s" % mrgpeventsfile)
        CreateMergedEventsFile(load_pickle(pevntsfile), mrgpeventsfile,
                               self.historyScores).run()

    mrgpedgesfile = os.path.join(outputdir, sampleid + ".pmedgs")
    if not os.path.exists(mrgpedgesfile):
        self.logToMaster("Creating mrgpegesfile...%s" % mrgpedgesfile)
        logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
        CreateMergedEventsFile(load_pickle(pedgesfile), mrgpedgesfile,
                               self.historyScores).run()

    breaksfile = os.path.join(outputdir, "breakpoints.txt")
    if not os.path.exists(breaksfile):
        self.logToMaster("Creating breaksfile...%s" % breaksfile)
        breaklocs = histseg.get_breakpoints(load_pickle(pedgesfile),
                                            opts.trueID)
        breaklocs2 = histseg.get_breakpoints(load_pickle(mrgpedgesfile),
                                             opts.trueID)
        # The file is closed (and therefore flushed) on leaving the block.
        with open(breaksfile, 'w') as breaksfh:
            for loc in sorted(breaklocs.keys()):
                (n, t) = breaklocs[loc]
                # NOTE(review): raises KeyError if a location is missing from
                # the merged edges -- confirm both always share the same keys.
                (n2, t2) = breaklocs2[loc]
                breaksfh.write("%s\t%d\t%d\t%d\t%d\n" % (loc, n, t, n2, t2))

    if opts.mcmcmix:
        self.logToMaster("Setting up MCMC analysis")
        mcmcdir = os.path.join(outputdir, "mcmcdata")
        mcmcdat = os.path.join(mcmcdir, "edge_counts.dat")
        if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
            # Argument list rather than a shell string.
            subprocess.call(["mkdir", "-p", mcmcdir])
            opts.pevnts = pevntsfile
            opts.pedges = pedgesfile
            self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))

    if opts.simulation:
        self.logToMaster("Setting up Simulation analysis")
        sim_inputs = (
            (pevntsfile, "events"),
            (pedgesfile, "edges"),
            (mrgpedgesfile, "mrgedges"),
        )
        for infile, label in sim_inputs:
            statsfile = os.path.join(outputdir, label + ".stats")
            # Re-run when the stats file is absent or was left empty.
            if (not os.path.exists(statsfile)
                    or os.path.getsize(statsfile) == 0):
                self.addChildTarget(
                    SimAnalysisJob(infile, opts.trueID, self.historyScores,
                                   label, outputdir, opts.binwidth))
default=histseg.Global_BINWIDTH, type=int) if __name__ == "__main__": parser = argparse.ArgumentParser( description= 'Given an .pevnts file, it will split events into edges, combine equivalent edges (segments or adjacencies), and score them by likelihood.' ) add_score_edges_options(parser) args = parser.parse_args() histseg.Global_BINWIDTH = args.binwidth allevents = pickle.load(open(args.pevnts, 'rb')) historyScores = np.loadtxt(args.historystats, dtype=int) totalp = 0 if args.totalp: totalp = args.totalp else: totalp = histseg.compute_likelihood_histories(historyScores[:, 0], historyScores) alledges = score_edges_within_pevents(allevents, historyScores, totalp, args.prevalence_error, args.ignore_cn) if args.edges: outfile = open(args.edges, 'w') for edge in alledges: outfile.write(str(edge)) if args.outpickle: pickle.dump(alledges, open(args.outpickle, 'wb'), pickle.HIGHEST_PROTOCOL)
def run(self):
    """Create every derived analysis file that is missing or re-requested.

    Working from ``self.options`` (``outputdir``, ``sampleid``, ``binwidth``,
    ``trueID`` and the ``pedges``/``sgh``/``bedfile``/``geneords``/
    ``mcmcmix``/``simulation`` switches), this:

      * builds and loads ``historystats.txt`` and computes ``self.totalp``;
      * requires ``<sampleid>.pevnts`` to exist (exits otherwise);
      * creates ``.pedgs``, ``seghists.txt``, optional labelled seghists and
        gene orders, ``.pmevnts``, ``.pmedgs`` and ``breakpoints.txt``;
      * optionally schedules MCMC set-up and simulation-analysis child
        targets.

    Link files, event annotation and gene ranking are no longer produced
    here; seghists are annotated and gene orders derived from them instead.
    """
    self.logToMaster("Setting up...")
    opts = self.options
    histseg.Global_BINWIDTH = opts.binwidth
    sampleid = opts.sampleid
    outputdir = opts.outputdir

    def load_pickle(path):
        # Close the handle promptly instead of leaking it.
        # NOTE(review): unpickling runs arbitrary code; these files are
        # presumably produced by this pipeline -- confirm they are trusted.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    historystatsfile = os.path.join(outputdir, "historystats.txt")
    if not os.path.exists(historystatsfile):
        self.logToMaster("Creating historystats.txt...%s" % historystatsfile)
        logger.info("historystatsfile: %s" % historystatsfile)
        pevntsjobtree.CombineHistoryStatsfiles(opts, historystatsfile).run()
    self.historyScores = np.loadtxt(historystatsfile, dtype=int)
    self.totalp = histseg.compute_likelihood_histories(
        self.historyScores[:, 0], self.historyScores)

    # check that the *.pevnts file exists.
    pevntsfile = os.path.join(outputdir, opts.sampleid + ".pevnts")
    if not os.path.exists(pevntsfile):
        sys.exit("The required %s file does not exist." % pevntsfile)

    pedgesfile = os.path.join(outputdir, sampleid + ".pedgs")
    if opts.pedges or not os.path.exists(pedgesfile):
        self.logToMaster("Creating pedgesfile...%s" % pedgesfile)
        logger.info("pedgesfile: %s" % pedgesfile)
        CreatePedgesFile(load_pickle(pevntsfile), pedgesfile,
                         self.historyScores, self.totalp, False).run()

    seghistfile = os.path.join(outputdir, "seghists.txt")
    if opts.sgh or not os.path.exists(seghistfile):
        self.logToMaster("Creating seghists file ... %s" % seghistfile)
        make_seghists_from_edges.main(load_pickle(pedgesfile),
                                      self.historyScores, seghistfile)

    # label the seghists if an annotation file is given
    if opts.bedfile:
        labeledfn = os.path.join(outputdir, "seghists.labeled")
        if not os.path.exists(labeledfn):
            pick_and_label_best_seghists.main(seghistfile, opts.bedfile,
                                              True, labeledfn)
        geneordfn = os.path.join(outputdir, "geneords.txt")
        if opts.geneords or not os.path.exists(geneordfn):
            seghists_to_gene_orders.main(seghistfile, opts.bedfile, geneordfn)

    mrgpeventsfile = os.path.join(outputdir, sampleid + ".pmevnts")
    if not os.path.exists(mrgpeventsfile):
        self.logToMaster("Creating mpevnts...%s" % mrgpeventsfile)
        logger.info("mrgpeventsfile: %s" % mrgpeventsfile)
        CreateMergedEventsFile(load_pickle(pevntsfile), mrgpeventsfile,
                               self.historyScores).run()

    mrgpedgesfile = os.path.join(outputdir, sampleid + ".pmedgs")
    if not os.path.exists(mrgpedgesfile):
        self.logToMaster("Creating mrgpegesfile...%s" % mrgpedgesfile)
        logger.info("mrgpedgesfile: %s" % mrgpedgesfile)
        CreateMergedEventsFile(load_pickle(pedgesfile), mrgpedgesfile,
                               self.historyScores).run()

    breaksfile = os.path.join(outputdir, "breakpoints.txt")
    if not os.path.exists(breaksfile):
        self.logToMaster("Creating breaksfile...%s" % breaksfile)
        breaklocs = histseg.get_breakpoints(load_pickle(pedgesfile),
                                            opts.trueID)
        breaklocs2 = histseg.get_breakpoints(load_pickle(mrgpedgesfile),
                                             opts.trueID)
        # The file is closed (and therefore flushed) on leaving the block.
        with open(breaksfile, 'w') as breaksfh:
            for loc in sorted(breaklocs.keys()):
                (n, t) = breaklocs[loc]
                # NOTE(review): raises KeyError if a location is missing from
                # the merged edges -- confirm both always share the same keys.
                (n2, t2) = breaklocs2[loc]
                breaksfh.write("%s\t%d\t%d\t%d\t%d\n" % (loc, n, t, n2, t2))

    if opts.mcmcmix:
        self.logToMaster("Setting up MCMC analysis")
        mcmcdir = os.path.join(outputdir, "mcmcdata")
        mcmcdat = os.path.join(mcmcdir, "edge_counts.dat")
        if not os.path.exists(mcmcdir) or not os.path.exists(mcmcdat):
            # Argument list rather than a shell string.
            subprocess.call(["mkdir", "-p", mcmcdir])
            opts.pevnts = pevntsfile
            opts.pedges = pedgesfile
            self.addChildTarget(mcmcjobtree.SetupMCMC(opts, mcmcdir))

    if opts.simulation:
        self.logToMaster("Setting up Simulation analysis")
        sim_inputs = (
            (pevntsfile, "events"),
            (pedgesfile, "edges"),
            (mrgpedgesfile, "mrgedges"),
        )
        for infile, label in sim_inputs:
            statsfile = os.path.join(outputdir, label + ".stats")
            # Re-run when the stats file is absent or was left empty.
            if (not os.path.exists(statsfile)
                    or os.path.getsize(statsfile) == 0):
                self.addChildTarget(
                    SimAnalysisJob(infile, opts.trueID, self.historyScores,
                                   label, outputdir, opts.binwidth))
parser.add_argument('pevnts', help='a .pevnts file.') parser.add_argument('historystats', help='The file with historystats') parser.add_argument('--outpickle', help='pickle the edges and write them to this file.') parser.add_argument('--edges', help='write edges to this file in text (not pickled).') parser.add_argument('--prevalence_error', help='the difference in prevalences to be considered the same.', type=float, default=0.05) parser.add_argument('--ignore_cn', help='merge together edges with different CN values.', default=False, action='store_true') parser.add_argument('--totalp', help='total probability of the histories', type=float) parser.add_argument('--binwidth', help='the multiplier between history ids of independent runs', default=histseg.Global_BINWIDTH, type=int) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Given an .pevnts file, it will split events into edges, combine equivalent edges (segments or adjacencies), and score them by likelihood.') add_score_edges_options(parser) args=parser.parse_args() histseg.Global_BINWIDTH=args.binwidth allevents=pickle.load(open(args.pevnts, 'rb')) historyScores=np.loadtxt(args.historystats, dtype=int) totalp=0 if args.totalp: totalp=args.totalp else: totalp = histseg.compute_likelihood_histories(historyScores[:,0], historyScores) alledges = score_edges_within_pevents(allevents, historyScores, totalp, args.prevalence_error, args.ignore_cn) if args.edges: outfile=open(args.edges, 'w') for edge in alledges: outfile.write(str(edge)) if args.outpickle: pickle.dump(alledges, open(args.outpickle, 'wb'), pickle.HIGHEST_PROTOCOL)
def analyze_simulation(events, refhistoryid, histScores, datout_fh, stats_fh,
                       breaks_fh, outdir):
    """Classify events against a reference history and write summary files.

    Args:
        events: iterable of event objects; each is wrapped in an
            ``EdgeSimulationData`` and asked for ``determineEventType()``.
        refhistoryid: id of the reference (simulated/true) history.
        histScores: 2-D array of history scores; column 0 is compared against
            ``refhistoryid``. **Modified in place**: the matching row(s) are
            zeroed.
        datout_fh: open file for per-event rows, or a falsy value to skip.
        stats_fh: open file for the TP/FP/FN/TN/EX table and F1 score, or a
            falsy value to skip.
        breaks_fh: open file for breakpoint counts, or a falsy value to skip.
        outdir: unused by this function; kept for interface compatibility.
    """
    do_order_correction.main(events, 0, histScores)
    # Make the cost of the reference history 0 so that it doesn't get
    # included in the likelihood calculation.
    histScores[np.where(histScores[:, 0] == refhistoryid), :] = 0
    totalp = histseg.compute_likelihood_histories(histScores[:, 0], histScores)
    sys.stderr.write("totalp is %f\n" % totalp)

    ntypes = len(histseg.Global_EVENTTYPES)
    TP = np.zeros(ntypes, dtype=int)
    FP = np.zeros(ntypes, dtype=int)
    TN = np.zeros(ntypes, dtype=int)  # never incremented below
    FN = np.zeros(ntypes, dtype=int)
    explained = np.zeros(ntypes, dtype=int)
    FNesims = []
    FPesims = []
    myEdgeSimData = []
    for event in events:
        myedgesim = EdgeSimulationData(event, histScores, totalp, refhistoryid)
        etype = event.determineEventType()
        # Index 0 holds the overall total; index etype the per-type count.
        if myedgesim.isTrue == 1:
            TP[0] += 1
            TP[etype] += 1
        elif myedgesim.isTrue == -1:
            FN[0] += 1
            FN[etype] += 1
            FNesims.append(myedgesim)
        elif myedgesim.isTrue == 0:
            FP[0] += 1
            FP[etype] += 1
            FPesims.append(myedgesim)
        myEdgeSimData.append(myedgesim)
    check_for_linear_decomp(FNesims, FPesims, explained)
    # Element-wise equivalent of the former xrange loop.
    FN -= TN

    if datout_fh:
        header = "event_id\tevent_type\tavecost\tLscore\tCNval\ttrue\tlength\tprevals\torders\tnumhists\n"
        datout_fh.write(header)
        for edgesim in myEdgeSimData:
            datout_fh.write(str(edgesim))

    if stats_fh:
        def tabbed(counts):
            return "\t".join(map(str, counts))

        stats_fh.write("type\ttotal\tAmp\tDel\tAdj\n")
        stats_fh.write("TP\t%s\nFP\t%s\nFN\t%s\nTN\t%s\nEX\t%s\n" %
                       (tabbed(TP), tabbed(FP), tabbed(FN), tabbed(TN),
                        tabbed(explained)))
        tp = TP[0] + explained[0]
        fn = FN[0] - explained[0]
        denom = 2 * tp + fn + FP[0]
        # With no events at all the denominator is 0; report 0.0 rather than
        # crashing with ZeroDivisionError.
        f1score = float(2 * tp) / float(denom) if denom else 0.0
        stats_fh.write("F1Score:\t%s\n" % (str(f1score)))

    if breaks_fh:
        # Fixed: the original passed ``edges``, a name that is not defined in
        # this function; the collection being analysed is ``events``.
        breakpoints = histseg.get_breakpoints(events, refhistoryid)
        for loc in breakpoints.keys():
            (n, t) = breakpoints[loc]
            breaks_fh.write("%s\t%d\t%d\n" % (loc, n, t))
        breaks_fh.write("Breakpoints: %d\n" % len(breakpoints))