def plotCValues(test, c0, c1, dir='/afs/cern.ch/user/j/jpavezse/systematics',
                c1_g='', model_g='mlp', true_dist=False, vars_g=None,
                workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root',
                use_log=False):
    if use_log:
        post = 'log'
    else:
        post = ''
    n_hist_c = 200
    keys = ['true', 'dec']
    c1_values = dict((key, np.zeros(n_hist_c)) for key in keys)
    c2_values = dict((key, np.zeros(n_hist_c)) for key in keys)
    c1_2 = np.loadtxt('{0}/fitting_values_c1c2{1}.txt'.format(dir, post))
    c1_values['true'] = c1_2[:, 0]
    c1_values['dec'] = c1_2[:, 1]
    c2_values['true'] = c1_2[:, 2]
    c2_values['dec'] = c1_2[:, 3]
    saveFig([], [c1_values['true'], c1_values['dec']],
            makePlotName('c1c2', 'train', type='c1_hist{0}'.format(post)), hist=True,
            axis=['signal weight'], marker=True, marker_value=c1[0],
            labels=['true', 'composed'], x_range=[0., 0.2], dir=dir,
            model_g=model_g, title='Histogram of estimated signal weight values',
            print_pdf=True)
    saveFig([], [c2_values['true'], c2_values['dec']],
            makePlotName('c1c2', 'train', type='c2_hist{0}'.format(post)), hist=True,
            axis=['bkg. weight'], marker=True, marker_value=c1[1],
            labels=['true', 'composed'], x_range=[0.1, 0.4], dir=dir,
            model_g=model_g, title='Histogram of estimated bkg. weight values',
            print_pdf=True)
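# plotCValues assumes a four-column fitting_values_c1c2{log}.txt file:
# columns 0/1 hold the true/decomposed signal-weight estimates, columns 2/3
# the true/decomposed bkg.-weight estimates. A minimal sketch of writing such
# a file after a series of fits (the helper name is hypothetical; only the
# column order is taken from the loadtxt call above):

import numpy as np

def save_fit_results(path, c1_true, c1_dec, c2_true, c2_dec):
    # Stack per-fit estimates column-wise so np.loadtxt(path)[:, k]
    # recovers them in the order plotCValues expects.
    np.savetxt(path, np.column_stack([c1_true, c1_dec, c2_true, c2_dec]))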
def drawFigure1():
    """Draws Figure 1 (teaser diagram in the introduction)."""
    fig = plt.figure(figsize=(9, 4))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    sns.barplot(ax=ax1, x="footprint [GiB]", y="cs",
                order=["Uncompr", "ActualBestBaseMem", "ActualBestMem"],
                data=dfMemMorphStore.query("query == 'avg'"))
    sns.barplot(ax=ax2, x="runtime [s]", y="cs",
                order=["Uncompr", "ActualBestBasePerf", "ActualBestPerf"],
                data=dfPerfMorphStore.query("query == 'avg'"))
    for ax in [ax1, ax2]:
        ax.set_yticklabels([
            "No\ncompression\nat all",
            "Established\nbase data\ncompression",
            "Our novel\ncontinuous\ncompression"
        ])
        ax.set_ylabel(None)
    ax2.set_yticks([])
    fig.tight_layout()
    sns.despine()
    utils.saveFig("figure01_teaser")
def drawFigure9():
    """Draws Figure 9 (comparison of MorphStore and MonetDB)."""
    dfs = []
    if useMorphStore:
        df = dfPerfMorphStore.query(
            "cs in ['ActualBestPerf', 'Uncompr', 'UncomprScalar', 'ActualBestBasePerf']"
        )[["query", "ps", "cs", "runtime [s]"]].copy()
        df["candidate"] = df.apply(
            lambda row: "MorphStore {} {}".format(row["ps"], row["cs"]),
            axis=1)
        dfs.append(df)
    if useMonetDB:
        for intType in intTypesMonetDB:
            df = dfPerfMonetDB[intType]
            df["candidate"] = "MonetDB scalar {}".format(intType)
            dfs.append(df)
    dfComp = pd.concat(dfs)
    if useMorphStore:
        colors = [colorYellow, colorOrange, colorRed]
        order = [
            "MorphStore scalar UncomprScalar",
            "MorphStore {} Uncompr".format(psNames[processingStyle]),
            "MorphStore {} ActualBestPerf".format(psNames[processingStyle]),
        ]
        labels = [
            "MorphStore\nscalar\nuncompr.",
            "MorphStore\n{}\nuncompr.".format(psNames[processingStyle]),
            "MorphStore\n{}\ncontinuous compr.".format(
                psNames[processingStyle]),
        ]
    else:
        colors = []
        order = []
        labels = []
    if useMonetDB:
        colors = [colorCyan, *colors, colorBlue]
        order = [
            "MonetDB scalar BIGINT",
            *order,
            "MonetDB scalar tight",
        ]
        labels = [
            "MonetDB\nscalar\nuncompr.",
            *labels,
            "MonetDB\nscalar\nnarrow types",
        ]
    filename = "figure09_morphstore_vs_monetdb"
    _drawDia("candidate", order, colors, None, dfComp, 3.09)
    ax = plt.gca()
    ax.set_title(ax.get_title()[4:])  # remove the "(a) " prefix from the title
    utils.saveFig(filename)
    utils.drawLegendRect(labels, colors)
    utils.saveFig(filename + "_legend")
def drawFigure7():
    """Draws Figure 7 (impact of the format combination)."""
    colors = [colorRed, colorGray, colorBlue, colorGreen]
    order = ["ActualWorst{}", "Uncompr", "StaticBP32", "ActualBest{}"]
    labels = [
        "worst combination", "uncompressed", "Static-BP-32", "best combination"
    ]
    filename = "figure07_ssb_formats"
    _drawDia("cs", order, colors, dfMemMorphStore, dfPerfMorphStore)
    utils.saveFig(filename)
    utils.drawLegendRect(labels, colors)
    utils.saveFig(filename + "_legend")
def drawFigure10():
    """Draws Figure 10 (fitness of our cost-based format selection)."""
    colors = [colorRed, colorGray, colorYellow, colorGreen]
    order = ["ActualWorst{}", "Uncompr", "CostBasedBest{}", "ActualBest{}"]
    labels = [
        "worst combination", "uncompressed", "cost-based", "best combination"
    ]
    filename = "figure10_opt"
    _drawDia("cs", order, colors, dfMemMorphStore, dfPerfMorphStore)
    utils.saveFig(filename)
    utils.drawLegendRect(labels, colors)
    utils.saveFig(filename + "_legend")
def drawFigure8():
    """Draws Figure 8 (compression of base data vs. intermediates)."""
    colors = [colorGray, colorCyan, colorYellow]
    order = ["Uncompr", "ActualBestBase{}", "ActualBest{}"]
    labels = [
        "uncompressed", "+ compressed base columns", "+ compressed intermediates"
    ]
    filename = "figure08_ssb_base_vs_interm"
    _drawDia("cs", order, colors, dfMemMorphStore, dfPerfMorphStore)
    utils.saveFig(filename)
    utils.drawLegendRect(labels, colors)
    utils.saveFig(filename + "_legend")
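# drawFigure7/8/10 (and drawFigure9) share one pattern: per-query bar charts
# keyed by the column named in the first _drawDia() argument, drawn once for
# memory and once for runtime, with the "{}" placeholders in the order list
# apparently filled per metric ("Mem"/"Perf"). _drawDia itself is defined
# elsewhere in this script; the sketch below is NOT that implementation, only
# a minimal stand-in illustrating the assumed interface.

def _drawDiaSketch(key, order, colors, dfMem, dfPerf):
    import matplotlib.pyplot as plt
    import seaborn as sns
    fig, (axMem, axPerf) = plt.subplots(1, 2, figsize=(10, 4))
    if dfMem is not None:  # drawFigure9 passes None for the memory frame
        sns.barplot(ax=axMem, x="footprint [GiB]", y=key,
                    order=[o.format("Mem") for o in order],
                    palette=colors, data=dfMem)
    sns.barplot(ax=axPerf, x="runtime [s]", y=key,
                order=[o.format("Perf") for o in order],
                palette=colors, data=dfPerf)
    sns.despine()
    fig.tight_layout()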
def drawFigure5(dfMea):
    """
    Draws Figure 5 (experiment on a single on-the-fly de/re-compression
    operator).
    """
    # Create the main figure.
    fig = plt.figure(figsize=(10, 4))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    # Plot the data.
    for diaIdx, (ax, sel) in enumerate([
        (ax1, 0.01),
        (ax2, 0.9),
    ]):
        sns.swarmplot(ax=ax, y="runtime [ms]", x="col", hue="class",
                      hue_order=["alluncompr", "outuncompr", "outcompr"],
                      palette=["red", "blue", "silver"],
                      data=dfMea.query("sel == {}".format(sel)))
        ax.set_title("({}) {:.0%} selectivity".format(chr(ord("a") + diaIdx), sel))
        ax.set_xlabel("input column")
        ax.set_ylim(bottom=0)
        ax.get_legend().remove()
    # Some post-processing.
    ax2.set_ylabel(None)
    sns.despine()
    fig.tight_layout()
    filename = "figure5_singleop"
    # Save the main figure.
    utils.saveFig(filename)
    utils.drawLegendMarker([
        "uncompressed", "only input compressed", "input and output compressed"
    ], ["red", "blue", "silver"])
    utils.saveFig(filename + "_legend")
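# drawFigure5(dfMea) expects one row per measurement with at least the columns
# used above: "runtime [ms]", "col" (input column), "class" (one of
# 'alluncompr'/'outuncompr'/'outcompr') and "sel" (selectivity). A toy frame
# with made-up numbers, just to make the expected shape concrete:

import pandas as pd

def _toyDfMea():
    return pd.DataFrame({
        "runtime [ms]": [12.0, 9.5, 8.1, 40.2, 33.3, 30.9],
        "col":          ["c1"] * 6,
        "class":        ["alluncompr", "outuncompr", "outcompr"] * 2,
        "sel":          [0.01] * 3 + [0.9] * 3,
    })
# drawFigure5(_toyDfMea())  # would draw two swarm plots, one per selectivity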
def plotCValues(c0, c1, dir='/afs/cern.ch/user/j/jpavezse/systematics',
                c1_g='', model_g='mlp', true_dist=False, vars_g=None,
                workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root',
                use_log=False, n_hist=150, c_eval=0, range_min=-1.0, range_max=0.):
    if use_log:
        post = 'log'
    else:
        post = ''
    keys = ['true', 'dec']
    c1_ = dict((key, np.zeros(n_hist)) for key in keys)
    c1_values = dict((key, np.zeros(n_hist)) for key in keys)
    c2_values = dict((key, np.zeros(n_hist)) for key in keys)
    c1_1 = np.loadtxt('{0}/fitting_values_c1.txt'.format(dir))
    c1_['true'] = c1_1[:, 0]
    c1_['dec'] = c1_1[:, 1]
    if true_dist:
        vals = [c1_['true'], c1_['dec']]
        labels = ['true', 'dec']
    else:
        # NOTE: vals1 is only defined on this branch; the code below assumes
        # true_dist == False.
        vals = c1_['dec']
        vals1 = c1_1[:, 3]
        labels = ['dec']
    # Optional outlier filtering, kept for reference:
    # vals = vals[vals != 0.5]
    # vals = vals[vals != 1.4]
    # vals1 = vals1[vals1 != 1.1]
    # vals1 = vals1[vals1 != 1.7]
    size = min(vals.shape[0], vals1.shape[0])
    # saveFig([], [vals1],
    #         makePlotName('g2', 'train', type='hist_g1g2'), hist=True,
    #         axis=['g2'], marker=True, marker_value=c1[c_eval],
    #         labels=labels, x_range=[range_min, range_max], dir=dir,
    #         model_g=model_g, title='Histogram for fitted g2', print_pdf=True)
    saveFig([], [vals, vals1],
            makePlotName('g1g2', 'train', type='hist'), hist=True, hist2D=True,
            axis=['g1', 'g2'], marker=True, marker_value=c1,
            labels=labels, dir=dir, model_g=model_g,
            title='2D Histogram for fitted g1,g2', print_pdf=True,
            x_range=[[0.5, 1.4], [1.1, 1.9]])
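# The hist2D branch of saveFig above amounts to a 2D histogram of the fitted
# (g1, g2) pairs with the true point marked. A rough matplotlib equivalent of
# that plot (saveFig itself lives in the plotting utilities; this only
# illustrates the idea, with the same axis ranges as above):

import numpy as np
import matplotlib.pyplot as plt

def plot_g1g2_hist(vals_g1, vals_g2, true_point, out='g1g2_hist.png'):
    fig, ax = plt.subplots()
    ax.hist2d(vals_g1, vals_g2, bins=30, range=[[0.5, 1.4], [1.1, 1.9]])
    ax.plot(true_point[0], true_point[1], 'rx', markersize=12)  # true (g1, g2)
    ax.set_xlabel('g1')
    ax.set_ylabel('g2')
    fig.savefig(out)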
def drawFigure6(dfMea):
    """Draws Figure 6 (experiment on a simple query)."""
    colors = ["#bfbfbf", "#7cc8ec", "#868ad1", "#f8d35e", "#f47264"]
    # Create the main figure.
    fig = plt.figure(figsize=(10, 4))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    filename = "figure6_simplequery"
    # For the memory footprints.
    _drawStackedBars(dfMea, ["inDataX", "inDataY", "midPosXC", "midDataYC"],
                     " [GiB]", "memory footprint", "column", {
                         "inDataX": "X",
                         "inDataY": "Y",
                         "midPosXC": "X'",
                         "midDataYC": "Y'",
                     }, "a", ax1, colors)
    # For the runtimes.
    _drawStackedBars(dfMea, ["select", "project", "agg_sum"],
                     " [s]", "runtime", "operator", {
                         "select": "select",
                         "project": "project",
                         "agg_sum": "sum",
                     }, "b", ax2, colors)
    fig.tight_layout()
    utils.saveFig(filename)
    # Create the stand-alone legend.
    utils.drawLegendRect([
        "uncompr.\nuncompr.", "uncompr.\nstatic BP", "static BP\nstatic BP",
        "DELTA + SIMD-BP\nstatic BP", "FOR + SIMD-BP\nstatic BP"
    ], colors)
    utils.saveFig(filename + "_legend")
def CrossSectionCheck2D(dir, c1_g, model_g, data_files, f1_dist, accept_list,
                        c_min, c_max, npoints, n_eval, feature):
    '''
    2D likelihood plots for a single feature.
    '''
    # 2D version
    csarray = np.linspace(c_min[0], c_max[0], npoints)
    csarray2 = np.linspace(c_min[1], c_max[1], npoints)
    all_indexes = np.loadtxt('3indexes_{0:.2f}_{1:.2f}_{2:.2f}_{3:.2f}_{4}.dat'.format(
        c_min[0], c_min[1], c_max[0], c_max[1], npoints))
    all_indexes = np.array([int(x) for x in all_indexes])
    all_couplings = np.loadtxt('3couplings_{0:.2f}_{1:.2f}_{2:.2f}_{3:.2f}_{4}.dat'.format(
        c_min[0], c_min[1], c_max[0], c_max[1], npoints))
    all_cross_sections = np.loadtxt('3crosssection_{0:.2f}_{1:.2f}_{2:.2f}_{3:.2f}_{4}.dat'.format(
        c_min[0], c_min[1], c_max[0], c_max[1], npoints))
    basis_files = [data_files[i] for i in all_indexes]
    samplesdata = []
    data_file = 'data'
    for i, sample in enumerate(basis_files):
        samplesdata.append(np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(
            dir, 'mlp', c1_g, data_file, sample)))
    print all_indexes
    targetdata = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(
        dir, 'mlp', c1_g, data_file, f1_dist))
    likelihoods = np.zeros((npoints, npoints))
    n_effs = np.zeros((npoints, npoints))
    n_zeros = np.zeros((npoints, npoints))
    for k, cs in enumerate(csarray):
        for j, cs2 in enumerate(csarray2):
            likelihood, n_eff, n_zero = checkCrossSection(
                all_couplings[k * npoints + j],
                all_cross_sections[k * npoints + j],
                basis_files, f1_dist, dir, c1_g, model_g, feature=feature,
                targetdata=targetdata, samplesdata=samplesdata)
            likelihoods[k, j] = likelihood
            n_effs[k, j] = n_eff
            n_zeros[k, j] = n_zero
    # print likelihoods
    saveFig(csarray, [csarray2, likelihoods],
            makePlotName('feature{0}'.format(feature), 'train', type='pixel_g1g2'),
            labels=['composed'], pixel=True, marker=True, dir=dir,
            model_g=model_g, marker_value=(1.0, 0.5), print_pdf=True,
            contour=True, title='Feature likelihood for g1,g2')
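# The three cached basis files above share one naming scheme. Factoring it out
# makes the convention explicit (a hypothetical helper; the original code
# inlines these format strings):

def basis_cache_name(kind, c_min, c_max, npoints):
    # kind is 'indexes', 'couplings' or 'crosssection'; the '3' prefix matches
    # the hardcoded one in CrossSectionCheck2D.
    return '3{0}_{1:.2f}_{2:.2f}_{3:.2f}_{4:.2f}_{5}.dat'.format(
        kind, c_min[0], c_min[1], c_max[0], c_max[1], npoints)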
def evalC1C2Likelihood(self, w, testdata, c0, c1, c_eval=0, c_min=0.01,
                       c_max=0.2, use_log=False, true_dist=False, vars_g=None,
                       npoints=50, samples_ids=None, weights_func=None):
    if true_dist:
        vars = ROOT.TList()
        for var in vars_g:
            vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
    else:
        x = None
    score = ROOT.RooArgSet(w.var('score'))
    if use_log:
        evaluateRatio = self.evaluateLogDecomposedRatio
        post = 'log'
    else:
        evaluateRatio = self.evaluateDecomposedRatio
        post = ''
    csarray = np.linspace(c_min[0], c_max[0], npoints)
    csarray2 = np.linspace(c_min[1], c_max[1], npoints)
    decomposedLikelihood = np.zeros((npoints, npoints))
    trueLikelihood = np.zeros((npoints, npoints))
    c1s = np.zeros(c0.shape[0])
    pre_pdf = [[], []]
    pre_dist = [[], []]
    # Pre-evaluate the pairwise classifier score pdfs and (optionally) the
    # true densities, so the scan over (c1, c2) does not recompute them.
    for k, c0_ in enumerate(c0):
        pre_pdf[0].append([])
        pre_pdf[1].append([])
        pre_dist[0].append([])
        pre_dist[1].append([])
        for j, c1_ in enumerate(c0):
            index_k, index_j = (self.basis_indexes[k], self.basis_indexes[j])
            if k != j:
                f0pdf = w.function('bkghistpdf_{0}_{1}'.format(index_k, index_j))
                f1pdf = w.function('sighistpdf_{0}_{1}'.format(index_k, index_j))
                data = testdata
                if self.preprocessing:
                    data = preProcessing(testdata,
                                         self.dataset_names[min(index_k, index_j)],
                                         self.dataset_names[max(index_k, index_j)],
                                         self.scaler)
                outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(
                    self.dir, self.model_g, self.c1_g, self.model_file,
                    index_k, index_j), data, model_g=self.model_g, clf=self.clf)
                f0pdfdist = np.array([self.evalDist(score, f0pdf, [xs]) for xs in outputs])
                f1pdfdist = np.array([self.evalDist(score, f1pdf, [xs]) for xs in outputs])
                pre_pdf[0][k].append(f0pdfdist)
                pre_pdf[1][k].append(f1pdfdist)
            else:
                pre_pdf[0][k].append(None)
                pre_pdf[1][k].append(None)
            if true_dist:
                f0 = w.pdf('f{0}'.format(k))
                f1 = w.pdf('f{0}'.format(j))
                if len(testdata.shape) > 1:
                    f0dist = np.array([self.evalDist(x, f0, xs) for xs in testdata])
                    f1dist = np.array([self.evalDist(x, f1, xs) for xs in testdata])
                else:
                    f0dist = np.array([self.evalDist(x, f0, [xs]) for xs in testdata])
                    f1dist = np.array([self.evalDist(x, f1, [xs]) for xs in testdata])
                pre_dist[0][k].append(f0dist)
                pre_dist[1][k].append(f1dist)
    indices = np.ones(testdata.shape[0], dtype=bool)
    ratiosList = []
    samples = []
    # This is needed for the calibration of the full ratios:
    # for i, sample in enumerate(self.dataset_names):
    #     samples.append(np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(
    #         self.dir, 'mlp', self.c1_g, 'data', sample)))
    n_eff_ratio = np.zeros((csarray.shape[0], csarray2.shape[0]))
    for i, cs in enumerate(csarray):
        ratiosList.append([])
        for j, cs2 in enumerate(csarray2):
            if weights_func is not None:
                c1s = weights_func(cs, cs2)
            else:
                c1s[:] = c1[:]
                c1s[c_eval] = cs
            if self.cross_section is not None:
                c1s = np.multiply(c1s, self.cross_section)
            n_eff = c1s.sum()
            n_tot = np.abs(c1s).sum()
            n_eff_ratio[i, j] = n_eff / n_tot
            c1s = c1s / c1s.sum()
            decomposedRatios, trueRatios = evaluateRatio(
                w, testdata, x=x, plotting=False, roc=False, c0arr=c0,
                c1arr=c1s, true_dist=true_dist, pre_dist=pre_dist,
                pre_evaluation=pre_pdf)
            decomposedRatios = 1. / decomposedRatios
            # calibratedRatios = self.calibrateFullRatios(w, decomposedRatios,
            #     c0, c1s, debug=debug, samples_data=samples, index=i)
            # saveFig(decomposedRatios2, [calibratedRatios],
            #     makePlotName('calibrated_{0}'.format(i), 'ratio', type='scat',
            #     dir=self.dir, model_g=self.model_g, c1_g=self.c1_g),
            #     scatter=True, axis=['composed ratio', 'composed calibrated'],
            #     dir=self.dir, model_g=self.model_g)
            ratiosList[i].append(decomposedRatios)
            # indices = np.logical_and(indices, decomposedRatios > 0.)
    for i, cs in enumerate(csarray):
        for j, cs2 in enumerate(csarray2):
            decomposedRatios = ratiosList[i][j]
            if not use_log:
                if samples_ids is not None:
                    ratios = decomposedRatios
                    ids = samples_ids
                    decomposedLikelihood[i, j] = (np.dot(
                        np.log(ratios), np.array([c1[x] for x in ids]))).sum()
                else:
                    # decomposedRatios[decomposedRatios < 0.] = 0.9
                    decomposedRatios[decomposedRatios < 0.] = 1.0
                    # decomposedRatios = decomposedRatios[self.findOutliers(decomposedRatios)]
                    if n_eff_ratio[i, j] <= 0.5:
                        # TODO: hardcoded threshold
                        decomposedLikelihood[i, j] = 20000
                    else:
                        decomposedLikelihood[i, j] = -np.log(decomposedRatios).sum()
                    trueLikelihood[i, j] = -np.log(trueRatios).sum()
            else:
                decomposedLikelihood[i, j] = decomposedRatios.sum()
                trueLikelihood[i, j] = trueRatios.sum()
    decomposedLikelihood = decomposedLikelihood - decomposedLikelihood.min()
    decMin = np.unravel_index(decomposedLikelihood.argmin(),
                              decomposedLikelihood.shape)
    # Pixel plots, kept for reference:
    # saveFig(csarray, [csarray2, decomposedLikelihood],
    #     makePlotName('comp', 'train', type='likelihood_g1g2'),
    #     labels=['composed'], pixel=True, marker=True, dir=self.dir,
    #     model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
    #     contour=True, title='Likelihood fit for g1,g2')
    # decMin = [np.sum(decomposedLikelihood, 1).argmin(),
    #           np.sum(decomposedLikelihood, 0).argmin()]
    X, Y = np.meshgrid(csarray, csarray2)
    saveFig(X, [Y, decomposedLikelihood],
            makePlotName('comp', 'train', type='multilikelihood'),
            labels=['composed'], contour=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            min_value=(csarray[decMin[0]], csarray2[decMin[1]]))
    print [csarray[decMin[0]], csarray2[decMin[1]]]
    pdb.set_trace()
    if true_dist:
        trueLikelihood = trueLikelihood - trueLikelihood.min()
        trueMin = np.unravel_index(trueLikelihood.argmin(), trueLikelihood.shape)
        saveFig(csarray, [decomposedLikelihood, trueLikelihood],
                makePlotName('comp', 'train', type=post + 'likelihood'),
                labels=['decomposed', 'true'], axis=['c1[0]', '-ln(L)'],
                marker=True, dir=self.dir, marker_value=c1[0],
                title='c1[0] Fitting', print_pdf=True)
        return [[csarray[trueMin[0]], csarray2[trueMin[1]]],
                [csarray[decMin[0]], csarray2[decMin[1]]]]
    else:
        return [[0., 0.], [csarray[decMin[0]], csarray2[decMin[1]]]]
def evalDoubleC1C2Likelihood(self, w, testdata, c0, c1, c_eval=0, c_min=0.01,
                             c_max=0.2, use_log=False, true_dist=False,
                             vars_g=None, npoints=50, samples_ids=None,
                             weights_func=None):
    '''
    Find the minimum of the likelihood on testdata using decomposed ratios
    and the weighted orthogonal morphing method to find the bases.
    '''
    if true_dist:
        vars = ROOT.TList()
        for var in vars_g:
            vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
    else:
        x = None
    score = ROOT.RooArgSet(w.var('score'))
    if use_log:
        evaluateRatio = self.evaluateLogDecomposedRatio
        post = 'log'
    else:
        evaluateRatio = self.evaluateDecomposedRatio
        post = ''
    # Compute the bases if they don't exist for this range
    if not os.path.isfile(
            '3doubleindexes_{0:.2f}_{1:.2f}_{2:.2f}_{3:.2f}_{4}.dat'.format(
                c_min[0], c_min[1], c_max[0], c_max[1], npoints)):
        self.pre2DDoubleBasis(c_min=c_min, c_max=c_max, npoints=npoints)
    csarray = np.linspace(c_min[0], c_max[0], npoints)
    csarray2 = np.linspace(c_min[1], c_max[1], npoints)
    decomposedLikelihood = np.zeros((npoints, npoints))
    trueLikelihood = np.zeros((npoints, npoints))
    all_indexes = np.loadtxt(
        '3doubleindexes_{0:.2f}_{1:.2f}_{2:.2f}_{3:.2f}_{4}.dat'.format(
            c_min[0], c_min[1], c_max[0], c_max[1], npoints))
    all_indexes = np.array([[int(x) for x in rows] for rows in all_indexes])
    all_couplings = np.loadtxt(
        '3doublecouplings_{0:.2f}_{1:.2f}_{2:.2f}_{3:.2f}_{4}.dat'.format(
            c_min[0], c_min[1], c_max[0], c_max[1], npoints))
    all_cross_sections = np.loadtxt(
        '3doublecrosssection_{0:.2f}_{1:.2f}_{2:.2f}_{3:.2f}_{4}.dat'.format(
            c_min[0], c_min[1], c_max[0], c_max[1], npoints))
    # Bkg used in the fit
    # TODO: hardcoded, this has to be changed
    basis_value = 1
    # Pre-evaluate the values for each distribution
    pre_pdf = [[[None] * self.nsamples for _ in range(self.nsamples)],
               [[None] * self.nsamples for _ in range(self.nsamples)]]
    pre_dist = [[[None] * self.nsamples for _ in range(self.nsamples)],
                [[None] * self.nsamples for _ in range(self.nsamples)]]
    # Only precompute distributions that will be used
    unique_indexes = set()
    for indexes in all_indexes:
        unique_indexes |= set(indexes)
    unique_indexes = list(unique_indexes)
    for k in range(len(unique_indexes)):
        for j in range(len(unique_indexes)):
            index_k, index_j = (unique_indexes[k], unique_indexes[j])
            # This saves some time by only evaluating the needed samples
            if index_k != basis_value:
                continue
            print 'Pre computing {0} {1}'.format(index_k, index_j)
            if k != j:
                f0pdf = w.function('bkghistpdf_{0}_{1}'.format(index_k, index_j))
                f1pdf = w.function('sighistpdf_{0}_{1}'.format(index_k, index_j))
                data = testdata
                if self.preprocessing:
                    data = preProcessing(testdata, self.dataset_names[min(k, j)],
                                         self.dataset_names[max(k, j)], self.scaler)
                # outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(
                #     self.dir, self.model_g, ...), ...)
                outputs = predict(
                    '/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(
                        self.model_file, index_k, index_j),
                    data, model_g=self.model_g)
                f0pdfdist = np.array([self.evalDist(score, f0pdf, [xs]) for xs in outputs])
                f1pdfdist = np.array([self.evalDist(score, f1pdf, [xs]) for xs in outputs])
                pre_pdf[0][index_k][index_j] = f0pdfdist
                pre_pdf[1][index_k][index_j] = f1pdfdist
            else:
                pre_pdf[0][index_k][index_j] = None
                pre_pdf[1][index_k][index_j] = None
            if true_dist:
                f0 = w.pdf('f{0}'.format(index_k))
                f1 = w.pdf('f{0}'.format(index_j))
                if len(testdata.shape) > 1:
                    f0dist = np.array([self.evalDist(x, f0, xs) for xs in testdata])
                    f1dist = np.array([self.evalDist(x, f1, xs) for xs in testdata])
                else:
                    f0dist = np.array([self.evalDist(x, f0, [xs]) for xs in testdata])
                    f1dist = np.array([self.evalDist(x, f1, [xs]) for xs in testdata])
                pre_dist[0][index_k][index_j] = f0dist
                pre_dist[1][index_k][index_j] = f1dist
    indices = np.ones(testdata.shape[0], dtype=bool)
    ratiosList = []
    samples = []
    # Useful values to inspect after the training
    alpha = np.zeros([csarray.shape[0], csarray2.shape[0], 2])
    n_eff_ratio = np.zeros((csarray.shape[0], csarray2.shape[0]))
    n_eff_1s = np.zeros((csarray.shape[0], csarray2.shape[0]))
    n_eff_2s = np.zeros((csarray.shape[0], csarray2.shape[0]))
    n_tot_1s = np.zeros((csarray.shape[0], csarray2.shape[0]))
    n_tot_2s = np.zeros((csarray.shape[0], csarray2.shape[0]))
    n_zeros = np.zeros((npoints, npoints))
    target = self.F1_couplings[:]

    def compute_one_alpha_part(weights, xs):
        c1s_1 = np.multiply(weights, xs)
        c1s_1 = np.multiply(weights, c1s_1)
        return c1s_1.sum()

    exp_basis_weights = True
    for i, cs in enumerate(csarray):
        ratiosList.append([])
        for j, cs2 in enumerate(csarray2):
            target[1] = cs
            target[2] = cs2
            print '{0} {1}'.format(i, j)
            print target
            # Compute F1 couplings and cross sections for both bases
            c1s_1 = all_couplings[i * npoints + j]
            cross_section_1 = all_cross_sections[i * npoints + j]
            c1s_1 = np.multiply(c1s_1, cross_section_1)
            n_eff = c1s_1.sum()
            n_tot = np.abs(c1s_1).sum()
            n_eff_1 = n_eff / n_tot
            n_eff_1s[i, j] = n_eff_1
            n_tot_1s[i, j] = n_tot
            print 'n_eff 1: {0}'.format(n_eff / n_tot)
            c1s_1 = c1s_1 / c1s_1.sum()
            c1s_2 = all_couplings[npoints * npoints + i * npoints + j]
            cross_section_2 = all_cross_sections[npoints * npoints + i * npoints + j]
            c1s_2 = np.multiply(c1s_2, cross_section_2)
            n_eff = c1s_2.sum()
            n_tot = np.abs(c1s_2).sum()
            n_eff_2 = n_eff / n_tot
            n_eff_2s[i, j] = n_eff_2
            n_tot_2s[i, j] = n_tot
            print 'n_eff 2: {0}'.format(n_eff / n_tot)
            c1s_2 = c1s_2 / c1s_2.sum()
            if exp_basis_weights:
                neff1 = 1. / n_eff_1
                neff2 = 1. / n_eff_2
                # alpha1 = np.exp(-np.sqrt(neff1))
                # alpha2 = np.exp(-np.sqrt(neff2))
                alpha1 = np.exp(-neff1 ** (1. / 3.))
                alpha2 = np.exp(-neff2 ** (1. / 3.))
                alpha[i, j, 0] = alpha1 / (alpha1 + alpha2)
                alpha[i, j, 1] = alpha2 / (alpha1 + alpha2)
            else:
                alpha1 = compute_one_alpha_part(
                    all_couplings[i * npoints + j],
                    all_cross_sections[i * npoints + j])
                alpha2 = compute_one_alpha_part(
                    all_couplings[npoints * npoints + i * npoints + j],
                    all_cross_sections[npoints * npoints + i * npoints + j])
                alpha[i, j, 0] = (1 / 2.) * (alpha2 / (alpha1 + alpha2))
                alpha[i, j, 1] = (1 / 2.) * (alpha1 / (alpha1 + alpha2))
            # Compute bkg weights
            c0_arr_1 = np.zeros(15)
            c0_arr_2 = np.zeros(15)
            c0_arr_1[np.where(all_indexes[0] == basis_value)[0][0]] = 1.
            c0_arr_2[np.where(all_indexes[1] == basis_value)[0][0]] = 1.
            c0_arr_1 = c0_arr_1 / c0_arr_1.sum()
            c0_arr_2 = c0_arr_2 / c0_arr_2.sum()
            c1s = np.append(alpha[i, j, 0] * c1s_1, alpha[i, j, 1] * c1s_2)
            c0_arr = np.append(0.5 * c0_arr_1, 0.5 * c0_arr_2)
            print c0_arr
            cross_section = np.append(cross_section_1, cross_section_2)
            indexes = np.append(all_indexes[0], all_indexes[1])
            completeRatios, trueRatios = evaluateRatio(
                w, testdata, x=x, plotting=False, roc=False, c0arr=c0_arr,
                c1arr=c1s, true_dist=true_dist, pre_dist=pre_dist,
                pre_evaluation=pre_pdf, cross_section=cross_section,
                indexes=indexes)
            completeRatios = 1. / completeRatios
            print completeRatios[completeRatios < 0.].shape
            n_zeros[i, j] = completeRatios[completeRatios < 0.].shape[0]
            ratiosList[i].append(completeRatios)
            n_eff_ratio[i, j] = (alpha[i, j, 0] * n_eff_1 +
                                 alpha[i, j, 1] * n_eff_2)
            print 'total eff: {0}'.format(n_eff_ratio[i, j])
            if n_eff_ratio[i, j] > 0.05:
                indices = np.logical_and(indices, completeRatios > 0.)
                print indices[indices].shape[0]
    for i, cs in enumerate(csarray):
        for j, cs2 in enumerate(csarray2):
            completeRatios = ratiosList[i][j]
            completeRatios = completeRatios[indices]
            if not use_log:
                norm = completeRatios[completeRatios != 0.].shape[0]
                if n_eff_ratio[i, j] < 0.05:
                    # TODO: hardcoded threshold
                    decomposedLikelihood[i, j] = 20000
                else:
                    decomposedLikelihood[i, j] = -2. * np.log(completeRatios).sum()
            else:
                decomposedLikelihood[i, j] = completeRatios.sum()
                trueLikelihood[i, j] = trueRatios.sum()
    decomposedLikelihood[decomposedLikelihood == 20000] = decomposedLikelihood[
        decomposedLikelihood != 20000].max()
    decomposedLikelihood = decomposedLikelihood - decomposedLikelihood.min()
    decMin = np.unravel_index(decomposedLikelihood.argmin(),
                              decomposedLikelihood.shape)
    # Plotting: pixel plots of the diagnostic quantities
    saveFig(csarray, [csarray2, n_eff_1s / n_eff_2s],
            makePlotName('comp', 'train', type='n_eff_ratio'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='n_rat_1/n_rat_2 values for g1,g2')
    saveFig(csarray, [csarray2, n_eff_ratio],
            makePlotName('comp', 'train', type='n_eff'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='n_eff/n_tot sum values for g1,g2')
    saveFig(csarray, [csarray2, n_eff_1s],
            makePlotName('comp', 'train', type='n_eff1'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='n_eff_1 ratio values for g1,g2')
    saveFig(csarray, [csarray2, n_eff_2s],
            makePlotName('comp', 'train', type='n_eff2'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='n_eff_2 ratio values for g1,g2')
    saveFig(csarray, [csarray2, alpha[:, :, 0]],
            makePlotName('comp', 'train', type='alpha1'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='weights_1 ratio values for g1,g2')
    saveFig(csarray, [csarray2, alpha[:, :, 1]],
            makePlotName('comp', 'train', type='alpha2'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='weights_2 ratio values for g1,g2')
    saveFig(csarray, [csarray2, n_tot_1s],
            makePlotName('comp', 'train', type='n_tot1'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='n_tot_1 values for g1,g2')
    saveFig(csarray, [csarray2, n_tot_2s],
            makePlotName('comp', 'train', type='n_tot2'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='n_tot_2 values for g1,g2')
    saveFig(csarray, [csarray2, n_zeros],
            makePlotName('comp', 'train', type='n_zeros'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='n_zeros values for g1,g2')
    saveFig(csarray, [csarray2, decomposedLikelihood],
            makePlotName('comp', 'train', type='pixel_g1g2'),
            labels=['composed'], pixel=True, marker=True, dir=self.dir,
            model_g=self.model_g, marker_value=(c1[0], c1[1]), print_pdf=True,
            contour=True, title='Likelihood fit for g1,g2')
    # decMin = [np.sum(decomposedLikelihood, 1).argmin(),
    #           np.sum(decomposedLikelihood, 0).argmin()]
    X, Y = np.meshgrid(csarray, csarray2)
    saveFig(X, [Y, decomposedLikelihood],
            makePlotName('comp', 'train',
                         type='multilikelihood_{0:.2f}_{1:.2f}'.format(
                             self.F1_couplings[1], self.F1_couplings[2])),
            labels=['composed'], contour=True, marker=True, dir=self.dir,
            model_g=self.model_g,
            marker_value=(self.F1_couplings[1], self.F1_couplings[2]),
            print_pdf=True,
            min_value=(csarray[decMin[0]], csarray2[decMin[1]]))
    # print decMin
    print [csarray[decMin[0]], csarray2[decMin[1]]]
    if true_dist:
        trueLikelihood = trueLikelihood - trueLikelihood.min()
        trueMin = np.unravel_index(trueLikelihood.argmin(), trueLikelihood.shape)
        saveFig(csarray, [decomposedLikelihood, trueLikelihood],
                makePlotName('comp', 'train', type=post + 'likelihood'),
                labels=['decomposed', 'true'], axis=['c1[0]', '-ln(L)'],
                marker=True, dir=self.dir, marker_value=c1[0],
                title='c1[0] Fitting', print_pdf=True)
        return [[csarray[trueMin[0]], csarray2[trueMin[1]]],
                [csarray[decMin[0]], csarray2[decMin[1]]]]
    else:
        return [[0., 0.], [csarray[decMin[0]], csarray2[decMin[1]]]]
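# The exp_basis_weights branch above combines the two morphing bases with
# weights alpha_i proportional to exp(-(1/n_eff_i)**(1/3)), where n_eff_i is
# the n_eff/n_tot ratio of basis i, so the basis with the healthier effective
# sample size dominates. The same computation, isolated for clarity:

import numpy as np

def basis_weights(n_eff_1, n_eff_2):
    a1 = np.exp(-(1. / n_eff_1) ** (1. / 3.))
    a2 = np.exp(-(1. / n_eff_2) ** (1. / 3.))
    s = a1 + a2
    return a1 / s, a2 / s  # normalized so the two weights sum to one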
def drawFigure4(dfMea, selectivity):
    """Draws Figure 4 (experiment on operator classes)."""
    fig = plt.figure(figsize=(7.5, 3.5))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    dfUse = dfMea.query("sel == {}".format(selectivity))
    rtUncompr = dfUse.query(
        "operator_class == 'uncompressed'")["runtime [ms]"].mean()
    rtOtfDrc = dfUse.query(
        "operator_class == 'otf de/re-compression'")["runtime [ms]"].mean()
    rtSpecOp = dfUse.query(
        "operator_class == 'specialized'")["runtime [ms]"].mean()
    rtOtfMor = dfUse.query(
        "operator_class == 'otf morphing'")["runtime [ms]"].mean()
    # Numbers for the text.
    if False:
        print("speedup OtfDrc vs. Uncompr: {}".format(rtUncompr / rtOtfDrc))
        print("speedup SpecOp vs. OtfDrc: {}".format(rtOtfDrc / rtSpecOp))
        print("slowdown OtfMor vs. SpecOp: {}".format(rtOtfMor / rtSpecOp))
    sns.barplot(
        ax=ax1,
        x="runtime [ms]",
        y="operator_class_long",
        order=[VAR_UU, VAR_OTFDRC, VAR_SPEC, VAR_OTFM],
        data=dfUse,
        ci=None,
    )
    ax1.set_ylabel(None)
    runtimeCap = 75
    ax1.set_xlim(right=runtimeCap)
    ax1.text(runtimeCap, 0, "{:.0f} ms → ".format(rtUncompr),
             horizontalalignment="right", verticalalignment="center",
             color="white", fontsize=20)
    sns.barplot(
        ax=ax2,
        x="input size [MiB]",
        y="operator_class_long",
        order=[VAR_UU, VAR_OTFDRC, VAR_SPEC, VAR_OTFM],
        data=dfUse,
        ci=None,
    )
    ax2.set_ylabel(None)
    ax2.set_yticklabels([])
    footprintCap = 512
    ax2.set_xlim(right=footprintCap)
    ax2.set_xticks([0, 128, 256, 384, 512])
    ax2.text(footprintCap, 0,
             "{:.0f} MiB → ".format(
                 dfUse.query("in_data_f == 'uncompr_f'")
                 ["input size [MiB]"].mean()),
             horizontalalignment="right", verticalalignment="center",
             color="white", fontsize=20)
    sns.despine()
    utils.saveFig("figure4_example")
def evalC1Likelihood(test, c0, c1, dir='/afs/cern.ch/user/j/jpavezse/systematics',
                     workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root',
                     c1_g='', model_g='mlp', use_log=False, true_dist=False,
                     vars_g=None):
    f = ROOT.TFile('{0}/{1}'.format(dir, workspace))
    w = f.Get('w')
    f.Close()
    if true_dist:
        vars = ROOT.TList()
        for var in vars_g:
            vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
    else:
        x = None
    score = ROOT.RooArgSet(w.var('score'))
    if use_log:
        evaluateRatio = test.evaluateLogDecomposedRatio
        post = 'log'
    else:
        evaluateRatio = test.evaluateDecomposedRatio
        post = ''
    npoints = 25
    csarray = np.linspace(0.01, 0.10, npoints)
    testdata = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(
        dir, 'mlp', c1_g, 'test', 'F1'))
    decomposedLikelihood = np.zeros(npoints)
    trueLikelihood = np.zeros(npoints)
    c1s = np.zeros(c1.shape[0])
    pre_pdfratios = []
    pre_ratios = []
    # Pre-evaluate the pairwise pdf ratios so the scan below does not repeat them.
    for k, c0_ in enumerate(c0):
        pre_pdfratios.append([])
        pre_ratios.append([])
        for j, c1_ in enumerate(c1):
            if k != j:
                f0pdf = w.pdf('bkghistpdf_{0}_{1}'.format(k, j))
                f1pdf = w.pdf('sighistpdf_{0}_{1}'.format(k, j))
                outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(
                    dir, model_g, c1_g, 'adaptive', k, j), testdata, model_g=model_g)
                pdfratios = [test.singleRatio(score, f0pdf, f1pdf, [xs])
                             for xs in outputs]
                pre_pdfratios[k].append(np.array(pdfratios))
            else:
                pre_pdfratios[k].append(None)
            if true_dist:
                f0 = w.pdf('f{0}'.format(k))
                f1 = w.pdf('f{0}'.format(j))
                if len(testdata.shape) > 1:
                    ratios = np.array([test.singleRatio(x, f0, f1, xs)
                                       for xs in testdata])
                else:
                    ratios = np.array([test.singleRatio(x, f0, f1, [xs])
                                       for xs in testdata])
                pre_ratios[k].append(ratios)
    for i, cs in enumerate(csarray):
        c1s[:] = c1[:]
        c1s[0] = cs
        c1s = c1s / c1s.sum()
        decomposedRatios, trueRatios = evaluateRatio(
            w, testdata, x=x, plotting=False, roc=False, c0arr=c0, c1arr=c1s,
            true_dist=true_dist, pre_ratios=pre_ratios,
            pre_pdfratios=pre_pdfratios)
        if not use_log:
            decomposedLikelihood[i] = np.log(decomposedRatios).sum()
            trueLikelihood[i] = np.log(trueRatios).sum()
        else:
            decomposedLikelihood[i] = decomposedRatios.sum()
            trueLikelihood[i] = trueRatios.sum()
    decomposedLikelihood = decomposedLikelihood - decomposedLikelihood.min()
    if true_dist:
        trueLikelihood = trueLikelihood - trueLikelihood.min()
        saveFig(csarray, [decomposedLikelihood, trueLikelihood],
                makePlotName('comp', 'train', type=post + 'likelihood'),
                labels=['decomposed', 'true'], axis=['c1[0]', '-ln(L)'],
                marker=True, dir=dir, marker_value=c1[0],
                title='c1[0] Fitting', print_pdf=True)
        return (csarray[trueLikelihood.argmin()],
                csarray[decomposedLikelihood.argmin()])
    else:
        return (0., csarray[decomposedLikelihood.argmin()])
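# At its core, the scan above is a grid search of the log-likelihood built
# from per-event density ratios: for each candidate coupling the mixture
# weights are renormalized and sum_x log r(x; c) is accumulated. A
# stripped-down version of that loop (evaluate_ratios is a stand-in callable
# for evaluateDecomposedRatio, not part of the original code):

import numpy as np

def scan_likelihood(csarray, c1, evaluate_ratios):
    nll = np.zeros(len(csarray))
    for i, cs in enumerate(csarray):
        c1s = c1.copy()
        c1s[0] = cs
        c1s = c1s / c1s.sum()          # mixture weights must sum to one
        ratios = evaluate_ratios(c1s)  # r(x; c) per test event
        nll[i] = -np.log(ratios).sum()
    return csarray[np.argmin(nll)]     # best-fit coupling on the grid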
def computeRatios(workspace, data_file, model_file, dir, model_g, c1_g,
                  true_dist=False, vars_g=None):
    '''
    Use the computed score densities to compute the ratio test.
    '''
    f = ROOT.TFile('{0}/{1}'.format(dir, workspace))
    w = f.Get('w')
    f.Close()
    print 'Calculating ratios'
    npoints = 50
    score = ROOT.RooArgSet(w.var('score'))
    getRatio = singleRatio
    if true_dist:
        vars = ROOT.TList()
        for var in vars_g:
            vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
    # NN trained on the complete model
    F0pdf = w.function('bkghistpdf_F0_F1')
    F1pdf = w.function('sighistpdf_F0_F1')
    data = np.loadtxt('{0}/train_{1}.dat'.format(dir, data_file))
    testdata = data[:, :-1]
    testtarget = data[:, -1]
    '''
    # Make ratio considering tumor size unknown
    ts_idx = 2
    target = testdata[0]
    testdata_size = np.array([x for x in testdata
        if (np.delete(x, ts_idx) == np.delete(target, ts_idx)).all()])
    '''
    if true_dist and len(vars_g) == 1:
        xarray = np.linspace(1, 10, npoints)
        # TODO: hardcoded dist names
        F1dist = np.array([evalDist(x, w.pdf('f1'), [xs]) for xs in xarray])
        F0dist = np.array([evalDist(x, w.pdf('f0'), [xs]) for xs in xarray])
        trueRatio = getRatio(F1dist, F0dist)
        outputs = predict('{0}/{1}_F0_F1.pkl'.format(dir, model_file),
                          xarray, model_g=model_g)
        F1fulldist = np.array([evalDist(score, F1pdf, [xs]) for xs in outputs])
        F0fulldist = np.array([evalDist(score, F0pdf, [xs]) for xs in outputs])
        completeRatio = getRatio(F0fulldist, F1fulldist)
        saveFig(xarray, [completeRatio, trueRatio],
                makePlotName('all', 'train', type='ratio'),
                title='Density Ratios', labels=['Trained', 'Truth'],
                print_pdf=True, dir=dir)
    outputs = predict('{0}/{1}_F0_F1.pkl'.format(dir, model_file),
                      testdata, model_g=model_g)
    F1fulldist = np.array([evalDist(score, F1pdf, [xs]) for xs in outputs])
    F0fulldist = np.array([evalDist(score, F0pdf, [xs]) for xs in outputs])
    completeRatio = getRatio(F1fulldist, F0fulldist)
    complete_target = testtarget
    # Histogram F0-f0 for composed, full and true.
    # Removing outliers first:
    numtest = completeRatio.shape[0]
    # decomposedRatio[decomposedRatio < 0.] = completeRatio[decomposedRatio < 0.]
    complete_outliers = findOutliers(completeRatio)
    complete_target = testtarget[complete_outliers]
    completeRatio = completeRatio[complete_outliers]
    bins = 70
    for l, name in enumerate(['sig', 'bkg']):
        minimum = completeRatio[complete_target == 1 - l].min()
        maximum = completeRatio[complete_target == 1 - l].max()
        low = minimum - ((maximum - minimum) / bins) * 10
        high = maximum + ((maximum - minimum) / bins) * 10
        w.factory('ratio{0}[{1},{2}]'.format(name, low, high))
        ratio_var = w.var('ratio{0}'.format(name))
        numtest = completeRatio.shape[0]
        hist = ROOT.TH1F('{0}hist_F0_f0'.format(name), 'hist', bins, low, high)
        for val in completeRatio[complete_target == 1 - l]:
            hist.Fill(val)
        datahist = ROOT.RooDataHist('{0}datahist_F0_f0'.format(name), 'hist',
                                    ROOT.RooArgList(ratio_var), hist)
        ratio_var.setBins(bins)
        histpdf = ROOT.RooHistFunc('{0}histpdf_F0_f0'.format(name), 'hist',
                                   ROOT.RooArgSet(ratio_var), datahist, 0)
        histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel(
            'RooBinIntegrator')
        getattr(w, 'import')(hist)
        getattr(w, 'import')(datahist)  # workaround: 'import' is a Python keyword
        getattr(w, 'import')(histpdf)
        # print '{0} {1} {2}'.format(curr, name, hist.Integral())
        if name == 'bkg':
            all_ratios_plots = [w.function('sighistpdf_F0_f0'),
                                w.function('bkghistpdf_F0_f0')]
            all_names_plots = ['sig', 'bkg']
    printFrame(w, ['ratiosig', 'ratiobkg'], all_ratios_plots,
               makePlotName('ratio', 'comparison', type='hist', dir=dir,
                            model_g=model_g, c1_g=c1_g),
               all_names_plots, dir=dir, model_g=model_g, y_text='Count',
               title='Histograms for ratios', x_text='ratio value',
               print_pdf=True)
    # completeRatio = np.log(completeRatio)
    completeRatio = completeRatio + np.abs(completeRatio.min())
    ratios_list = completeRatio / completeRatio.max()
    legends_list = ['composed', 'full']
    makeSigBkg([ratios_list], [complete_target],
               makePlotName('comp', 'all', type='sigbkg', dir=dir,
                            model_g=model_g, c1_g=c1_g),
               dir=dir, model_g=model_g, print_pdf=True, legends=legends_list,
               title='Signal-Background rejection curves')
    # Transfer learning: transform f1 into f0
    data = np.loadtxt('{0}/train_{1}.dat'.format(dir, data_file))
    data_f1 = data[data[:, -1] == 0.]
    data_f0 = data[data[:, -1] == 1.]
    testdata = data_f1[:, :-1]
    testtarget = data_f1[:, -1]
    '''
    # Make ratio considering tumor size unknown
    ts_idx = 2
    target = testdata[0]
    testdata_size = np.array([x for x in testdata
        if (np.delete(x, ts_idx) == np.delete(target, ts_idx)).all()])
    pdb.set_trace()
    '''
    xarray = testdata
    outputs = predict('{0}/{1}_F0_F1.pkl'.format(dir, model_file),
                      xarray, model_g=model_g)
    F1fulldist = np.array([evalDist(score, F1pdf, [xs]) for xs in outputs])
    F0fulldist = np.array([evalDist(score, F0pdf, [xs]) for xs in outputs])
    completeRatio = getRatio(F0fulldist, F1fulldist)
    if len(vars_g) == 1:
        F1dist = np.array([evalDist(x, w.pdf('f1'), [xs]) for xs in xarray])
        F0dist = np.array([evalDist(x, w.pdf('f0'), [xs]) for xs in xarray])
    else:
        F1dist = np.array([evalDist(x, w.pdf('f1'), xs) for xs in xarray])
        F0dist = np.array([evalDist(x, w.pdf('f0'), xs) for xs in xarray])
    trueRatio = getRatio(F1dist, F0dist)
    trueIndexes = findOutliers(trueRatio)
    completeIndexes = findOutliers(completeRatio)
    # indexes = np.logical_and(trueIndexes, completeIndexes)
    indexes = completeIndexes
    data_f1_red = data_f1
    # trueRatio = trueRatio[indexes]
    # completeRatio = completeRatio[indexes]
    # data_f1_red = data_f1[indexes]
    for feature in range(10):
        # Transfer the distributions by histogram manipulation
        fig, ax = plt.subplots()
        colors = ['b-', 'r-', 'k-']
        colors_rgb = ['blue', 'red', 'black']
        hist, bins = np.histogram(data_f1[:, feature], bins=20,
                                  range=(0., 10.), density=True)
        hist_transfered, bins_1 = np.histogram(data_f1_red[:, feature],
                                               weights=trueRatio, bins=20,
                                               range=(0., 10.), density=True)
        hist_transfered_clf, bins_2 = np.histogram(data_f1_red[:, feature],
                                                   bins=20,
                                                   weights=completeRatio,
                                                   range=(0., 10.),
                                                   density=True)
        hist0, bins0 = np.histogram(data_f0[:, feature], bins=20,
                                    range=(0., 10.), density=True)
        # hist, bins = ax.hist(data_f0[:, 0], color=colors_rgb[0], label='true',
        #                      bins=50, histtype='stepfilled', normed=1,
        #                      alpha=0.5, range=[0, 100])
        widths = np.diff(bins)
        # hist_transfered = hist * trueRatio
        # hist_transfered_clf = hist * completeRatio
        ax.bar(bins[:-1], hist0, widths, label='f0', alpha=0.5, color='red')
        # ax.bar(bins[:-1], hist_transfered, widths,
        #        label='f1 transfered (true)', alpha=0.5, color='blue')
        ax.bar(bins[:-1], hist_transfered_clf, widths,
               label='f1 transfered (trained)', alpha=0.5, color='green')
        ax.legend(frameon=False, fontsize=11)
        ax.set_xlabel('x')
        ax.set_ylabel('p(x)')
        if len(vars_g) > 1:
            ax.set_title('Transfered distributions, feature {0}'.format(feature))
        else:
            ax.set_title('Transfered distributions')
        file_plot = makePlotName('all', 'transf',
                                 type='hist_v{0}'.format(feature),
                                 model_g=model_g)
        fig.savefig('{0}/plots/{1}/{2}.png'.format(dir, model_g, file_plot))
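# The transfer loop above is importance reweighting: histogramming samples
# drawn from f1 with per-event weights r(x) = f0(x)/f1(x) reproduces the f0
# distribution. In miniature, with exact 1D Gaussian densities standing in
# for the trained ratio (self-contained illustration, not original code):

import numpy as np
from scipy.stats import norm

def _reweight_demo():
    x_f1 = norm(loc=2.0).rvs(size=100000)                        # samples from f1
    weights = norm(loc=0.0).pdf(x_f1) / norm(loc=2.0).pdf(x_f1)  # r(x) = f0/f1
    hist, bins = np.histogram(x_f1, bins=50, weights=weights, density=True)
    return hist, bins  # hist approximates the f0 = N(0, 1) density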
def evalC1C2Likelihood(test, c0, c1, dir='/afs/cern.ch/user/j/jpavezse/systematics',
                       workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root',
                       c1_g='', model_g='mlp', use_log=False, true_dist=False,
                       vars_g=None, clf=None, verbose_printing=False):
    f = ROOT.TFile('{0}/{1}'.format(dir, workspace))
    w = f.Get('w')
    f.Close()
    if true_dist:
        vars = ROOT.TList()
        for var in vars_g:
            vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
    else:
        x = None
    score = ROOT.RooArgSet(w.var('score'))
    if use_log:
        evaluateRatio = test.evaluateLogDecomposedRatio
        post = 'log'
    else:
        evaluateRatio = test.evaluateDecomposedRatio
        post = ''
    npoints = 25
    csarray = np.linspace(0.01, 0.2, npoints)
    cs2array = np.linspace(0.1, 0.4, npoints)
    testdata = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(
        dir, model_g, c1_g, 'test', 'F1'))
    decomposedLikelihood = np.zeros((npoints, npoints))
    trueLikelihood = np.zeros((npoints, npoints))
    c1s = np.zeros(c1.shape[0])
    c0s = np.zeros(c1.shape[0])
    pre_pdf = [[], []]
    pre_dist = [[], []]
    # Pre-evaluate the pairwise score pdfs and (optionally) the true densities.
    for k, c0_ in enumerate(c0):
        pre_pdf[0].append([])
        pre_pdf[1].append([])
        pre_dist[0].append([])
        pre_dist[1].append([])
        for j, c1_ in enumerate(c1):
            if k != j:
                f0pdf = w.function('bkghistpdf_{0}_{1}'.format(k, j))
                f1pdf = w.function('sighistpdf_{0}_{1}'.format(k, j))
                outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(
                    dir, model_g, c1_g, 'adaptive', k, j), testdata,
                    model_g=model_g, clf=clf)
                f0pdfdist = np.array([test.evalDist(score, f0pdf, [xs])
                                      for xs in outputs])
                f1pdfdist = np.array([test.evalDist(score, f1pdf, [xs])
                                      for xs in outputs])
                pre_pdf[0][k].append(f0pdfdist)
                pre_pdf[1][k].append(f1pdfdist)
            else:
                pre_pdf[0][k].append(None)
                pre_pdf[1][k].append(None)
            if true_dist:
                f0 = w.pdf('f{0}'.format(k))
                f1 = w.pdf('f{0}'.format(j))
                if len(testdata.shape) > 1:
                    f0dist = np.array([test.evalDist(x, f0, xs) for xs in testdata])
                    f1dist = np.array([test.evalDist(x, f1, xs) for xs in testdata])
                else:
                    f0dist = np.array([test.evalDist(x, f0, [xs]) for xs in testdata])
                    f1dist = np.array([test.evalDist(x, f1, [xs]) for xs in testdata])
                pre_dist[0][k].append(f0dist)
                pre_dist[1][k].append(f1dist)
    # Evaluate the likelihood on a grid of c1[0] and c1[1] values
    for i, cs in enumerate(csarray):
        for j, cs2 in enumerate(cs2array):
            c1s[:] = c1[:]
            c1s[0] = cs
            c1s[1] = cs2
            c1s[2] = 1. - cs - cs2
            decomposedRatios, trueRatios = evaluateRatio(
                w, testdata, x=x, plotting=False, roc=False, c0arr=c0,
                c1arr=c1s, true_dist=true_dist, pre_evaluation=pre_pdf,
                pre_dist=pre_dist)
            if not use_log:
                decomposedLikelihood[i, j] = np.log(decomposedRatios).sum()
                trueLikelihood[i, j] = np.log(trueRatios).sum()
            else:
                decomposedLikelihood[i, j] = decomposedRatios.sum()
                trueLikelihood[i, j] = trueRatios.sum()
    decomposedLikelihood = decomposedLikelihood - decomposedLikelihood.min()
    X, Y = np.meshgrid(csarray, cs2array)
    decMin = np.unravel_index(decomposedLikelihood.argmin(),
                              decomposedLikelihood.shape)
    min_value = [csarray[decMin[0]], cs2array[decMin[1]]]
    if verbose_printing:
        saveFig(X, [Y, decomposedLikelihood, trueLikelihood],
                makePlotName('comp', 'train', type='multilikelihood'),
                labels=['composed', 'true'], contour=True, marker=True,
                dir=dir, marker_value=(c1[0], c1[1]), print_pdf=True,
                min_value=min_value)
    if true_dist:
        trueLikelihood = trueLikelihood - trueLikelihood.min()
        trueMin = np.unravel_index(trueLikelihood.argmin(), trueLikelihood.shape)
        return [[csarray[trueMin[0]], cs2array[trueMin[1]]],
                [csarray[decMin[0]], cs2array[decMin[1]]]]
    else:
        return [[0., 0.], [csarray[decMin[0]], cs2array[decMin[1]]]]
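# In the 2D scan above only the first two mixture weights are free; the third
# is fixed by the closure condition c1[2] = 1 - c1[0] - c1[1] before the
# vector is renormalized. A tiny helper making that constraint explicit
# (hypothetical; the original code inlines it):

def constrained_weights(c1, cs, cs2):
    c1s = c1.copy()
    c1s[0], c1s[1] = cs, cs2
    c1s[2] = 1. - cs - cs2   # the three weights close to one before scaling
    return c1s / c1s.sum()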
def computeRatios(self, true_dist=False, vars_g=None, data_file='test',
                  use_log=False):
    '''
    Use the computed score densities to compute the decomposed ratio test.
    Set true_dist to True if the workspace has the true distributions (to
    make plots); in that case vars_g must also be provided. The final results
    are the histograms of the ratios and the signal-background rejection
    curves.
    '''
    f = ROOT.TFile('{0}/{1}'.format(self.dir, self.workspace))
    w = f.Get('w')
    f.Close()
    # TODO: these are hardcoded for now
    c1 = self.c1
    c0 = self.c0
    # c1 = np.multiply(c1, self.cross_section)
    c1 = c1 / c1.sum()
    c0 = c0 / c0.sum()
    print 'Calculating ratios'
    npoints = 50
    if true_dist:
        vars = ROOT.TList()
        for var in vars_g:
            vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
    if use_log:
        evaluateRatio = self.evaluateLogDecomposedRatio
        post = 'log'
        getRatio = self.singleLogRatio
    else:
        evaluateRatio = self.evaluateDecomposedRatio
        post = ''
        getRatio = self.singleRatio
    score = ROOT.RooArgSet(w.var('score'))
    scoref = ROOT.RooArgSet(w.var('scoref'))
    if self.preprocessing:
        if self.scaler is None:
            self.scaler = {}
            for k in range(self.nsamples):
                for j in range(self.nsamples):
                    if k < j:
                        self.scaler[(k, j)] = joblib.load(
                            '{0}/model/{1}/{2}/{3}_{4}_{5}.dat'.format(
                                self.dir, 'mlp', self.c1_g, 'scaler',
                                self.dataset_names[k], self.dataset_names[j]))
    # NN trained on the complete model
    F0pdf = w.function('bkghistpdf_F0_F1')
    F1pdf = w.function('sighistpdf_F0_F1')
    # TODO: here assuming that signal is the first dataset
    testdata, testtarget = loadData(data_file, self.F0_dist, 0, dir=self.dir,
                                    c1_g=self.c1_g, preprocessing=False)
    if len(vars_g) == 1:
        xarray = np.linspace(0, 5, npoints)
        fullRatios, _ = evaluateRatio(w, xarray, x=x, plotting=True,
                                      roc=False, true_dist=True)
        F1dist = np.array([self.evalDist(x, w.pdf('F1'), [xs]) for xs in xarray])
        F0dist = np.array([self.evalDist(x, w.pdf('F0'), [xs]) for xs in xarray])
        y2 = getRatio(F1dist, F0dist)
        # NN trained on the complete model
        outputs = predict('{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format(
            self.dir, self.model_g, self.c1_g),
            xarray.reshape(xarray.shape[0], 1), model_g=self.model_g,
            clf=self.clf)
        F1fulldist = np.array([self.evalDist(scoref, F1pdf, [xs]) for xs in outputs])
        F0fulldist = np.array([self.evalDist(scoref, F0pdf, [xs]) for xs in outputs])
        pdfratios = getRatio(F1fulldist, F0fulldist)
        saveFig(xarray, [fullRatios, y2, pdfratios],
                makePlotName('all', 'train', type='ratio' + post),
                title='Likelihood Ratios',
                labels=['Composed trained', 'True', 'Full trained'],
                print_pdf=True, dir=self.dir)
    if true_dist:
        decomposedRatio, _ = evaluateRatio(w, testdata, x=x, plotting=False,
                                           roc=self.verbose_printing,
                                           true_dist=True)
    else:
        decomposedRatio, _ = evaluateRatio(w, testdata, c0arr=c0, c1arr=c1,
                                           plotting=True, roc=True,
                                           data_type=data_file)
    if len(testdata.shape) > 1:
        outputs = predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(
            self.dir, self.model_g, self.c1_g, self.model_file),
            testdata, model_g=self.model_g, clf=self.clf)
        # outputs = predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(
        #     self.model_file), testdata, model_g=self.model_g)
    else:
        outputs = predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(
            self.dir, self.model_g, self.c1_g, self.model_file),
            testdata.reshape(testdata.shape[0], 1), model_g=self.model_g,
            clf=self.clf)
    F1fulldist = np.array([self.evalDist(scoref, F1pdf, [xs]) for xs in outputs])
    F0fulldist = np.array([self.evalDist(scoref, F0pdf, [xs]) for xs in outputs])
    completeRatio = getRatio(F1fulldist, F0fulldist)
    if true_dist:
        if len(testdata.shape) > 1:
            F1dist = np.array([self.evalDist(x, w.pdf('F1'), xs) for xs in testdata])
            F0dist = np.array([self.evalDist(x, w.pdf('F0'), xs) for xs in testdata])
        else:
            F1dist = np.array([self.evalDist(x, w.pdf('F1'), [xs]) for xs in testdata])
            F0dist = np.array([self.evalDist(x, w.pdf('F0'), [xs]) for xs in testdata])
        realRatio = getRatio(F1dist, F0dist)
    decomposed_target = testtarget
    complete_target = testtarget
    real_target = testtarget
    # Histogram F0-f0 for composed, full and true.
    # Outlier removal, kept for reference:
    numtest = decomposedRatio.shape[0]
    # decomposedRatio[decomposedRatio < 0.] = completeRatio[decomposedRatio < 0.]
    # decomposed_outliers = self.findOutliers(decomposedRatio)
    # complete_outliers = self.findOutliers(completeRatio)
    # decomposed_target = testtarget[decomposed_outliers]
    # complete_target = testtarget[complete_outliers]
    # decomposedRatio = decomposedRatio[decomposed_outliers]
    # completeRatio = completeRatio[complete_outliers]
    if true_dist:
        real_outliers = np.zeros(numtest, dtype=bool)
        real_outliers = self.findOutliers(realRatio)
        # real_target = testtarget[real_outliers]
        # realRatio = realRatio[real_outliers]
    all_ratios_plots = []
    all_names_plots = []
    bins = 70
    low = []
    high = []
    ratios_vars = []
    for l, name in enumerate(['sig', 'bkg']):
        if true_dist:
            ratios_names = ['truth', 'full', 'composed']
            ratios_vec = [realRatio, completeRatio, decomposedRatio]
            target_vec = [real_target, complete_target, decomposed_target]
            minimum = min([realRatio[real_target == 1 - l].min(),
                           completeRatio[complete_target == 1 - l].min(),
                           decomposedRatio[decomposed_target == 1 - l].min()])
            maximum = max([realRatio[real_target == 1 - l].max(),
                           completeRatio[complete_target == 1 - l].max(),
                           decomposedRatio[decomposed_target == 1 - l].max()])
        else:
            ratios_names = ['full', 'composed']
            ratios_vec = [completeRatio, decomposedRatio]
            target_vec = [complete_target, decomposed_target]
            minimum = min([completeRatio[complete_target == 1 - l].min(),
                           decomposedRatio[decomposed_target == 1 - l].min()])
            maximum = max([completeRatio[complete_target == 1 - l].max(),
                           decomposedRatio[decomposed_target == 1 - l].max()])
        low.append(minimum - ((maximum - minimum) / bins) * 10)
        high.append(maximum + ((maximum - minimum) / bins) * 10)
        w.factory('ratio{0}[{1},{2}]'.format(name, low[l], high[l]))
        ratios_vars.append(w.var('ratio{0}'.format(name)))
    for curr, curr_ratios, curr_targets in zip(ratios_names, ratios_vec, target_vec):
        numtest = curr_ratios.shape[0]
        for l, name in enumerate(['sig', 'bkg']):
            hist = ROOT.TH1F('{0}_{1}hist_F0_f0'.format(curr, name), 'hist',
                             bins, low[l], high[l])
            for val in curr_ratios[curr_targets == 1 - l]:
                hist.Fill(val)
            datahist = ROOT.RooDataHist('{0}_{1}datahist_F0_f0'.format(curr, name),
                                        'hist', ROOT.RooArgList(ratios_vars[l]),
                                        hist)
            ratios_vars[l].setBins(bins)
            histpdf = ROOT.RooHistFunc('{0}_{1}histpdf_F0_f0'.format(curr, name),
                                       'hist', ROOT.RooArgSet(ratios_vars[l]),
                                       datahist, 0)
            histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel(
                'RooBinIntegrator')
            getattr(w, 'import')(hist)
            getattr(w, 'import')(datahist)  # workaround: 'import' is a Python keyword
            getattr(w, 'import')(histpdf)
            # print '{0} {1} {2}'.format(curr, name, hist.Integral())
            if name == 'bkg':
                all_ratios_plots.append(
                    [w.function('{0}_sighistpdf_F0_f0'.format(curr)),
                     w.function('{0}_bkghistpdf_F0_f0'.format(curr))])
                all_names_plots.append(
                    ['sig_{0}'.format(curr), 'bkg_{0}'.format(curr)])
    all_ratios_plots = [[all_ratios_plots[j][i]
                         for j, _ in enumerate(all_ratios_plots)]
                        for i, _ in enumerate(all_ratios_plots[0])]
    all_names_plots = [[all_names_plots[j][i]
                        for j, _ in enumerate(all_names_plots)]
                       for i, _ in enumerate(all_names_plots[0])]
    printMultiFrame(w, ['ratiosig', 'ratiobkg'], all_ratios_plots,
                    makePlotName('ratio', 'comparison', type='hist' + post,
                                 dir=self.dir, model_g=self.model_g,
                                 c1_g=self.c1_g),
                    all_names_plots, setLog=True, dir=self.dir,
                    model_g=self.model_g, y_text='Count',
                    title='Histograms for ratios', x_text='ratio value',
                    print_pdf=True)
    # Scatter plots true ratio vs. composed/full ratio, kept for reference:
    # if self.verbose_printing and true_dist:
    #     saveFig(completeRatio, [realRatio],
    #         makePlotName('full', 'train', type='scat' + post, dir=self.dir,
    #         model_g=self.model_g, c1_g=self.c1_g), scatter=True,
    #         axis=['full trained ratio', 'true ratio'],
    #         dir=self.dir, model_g=self.model_g)
    #     saveFig(decomposedRatio, [realRatio],
    #         makePlotName('comp', 'train', type='scat' + post, dir=self.dir,
    #         model_g=self.model_g, c1_g=self.c1_g), scatter=True,
    #         axis=['composed trained ratio', 'true ratio'],
    #         dir=self.dir, model_g=self.model_g)
    # Signal-background rejection plots
    if use_log:
        decomposedRatio = np.exp(decomposedRatio)
        completeRatio = np.exp(completeRatio)
        if true_dist:
            realRatio = np.exp(realRatio)
    if true_dist:
        ratios_list = [decomposedRatio / decomposedRatio.max(),
                       completeRatio / completeRatio.max(),
                       realRatio / realRatio.max()]
        targets_list = [decomposed_target, complete_target, real_target]
        legends_list = ['composed', 'full', 'true']
    else:
        indices = (decomposedRatio > 0.)
        decomposedRatio = decomposedRatio[indices]
        decomposed_target = decomposed_target[indices]
        indices = (completeRatio > 0.)
        completeRatio = completeRatio[indices]
        complete_target = complete_target[indices]
        completeRatio = np.log(completeRatio)
        decomposedRatio = np.log(decomposedRatio)
        decomposedRatio = decomposedRatio + np.abs(decomposedRatio.min())
        completeRatio = completeRatio + np.abs(completeRatio.min())
        ratios_list = [decomposedRatio / decomposedRatio.max(),
                       completeRatio / completeRatio.max()]
        targets_list = [decomposed_target, complete_target]
        legends_list = ['composed', 'full']
    makeSigBkg(ratios_list, targets_list,
               makePlotName('comp', 'all', type='sigbkg' + post, dir=self.dir,
                            model_g=self.model_g, c1_g=self.c1_g),
               dir=self.dir, model_g=self.model_g, print_pdf=True,
               legends=legends_list,
               title='Signal-Background rejection curves')
    # Scatter plot to compare the regression function and the classifier score
    if self.verbose_printing and true_dist:
        testdata, testtarget = loadData('test', self.F0_dist, self.F1_dist,
                                        dir=self.dir, c1_g=self.c1_g)
        if len(testdata.shape) > 1:
            reg = np.array([self.__regFunc(x, w.pdf('F0'), w.pdf('F1'), xs)
                            for xs in testdata])
        else:
            reg = np.array([self.__regFunc(x, w.pdf('F0'), w.pdf('F1'), [xs])
                            for xs in testdata])
        if len(testdata.shape) > 1:
            outputs = predict('{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format(
                self.dir, self.model_g, self.c1_g),
                testdata.reshape(testdata.shape[0], testdata.shape[1]),
                model_g=self.model_g, clf=self.clf)
        else:
            outputs = predict('{0}/model/{1}/{2}/adaptive_F0_F1.pkl'.format(
                self.dir, self.model_g, self.c1_g),
                testdata.reshape(testdata.shape[0], 1),
                model_g=self.model_g, clf=self.clf)
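# The getattr(w, 'import')(obj) calls above are the standard PyROOT workaround
# for RooWorkspace::import(): 'import' is a reserved word in Python, so the
# method cannot be called as w.import(obj). The pattern in isolation:
#
#     getattr(w, 'import')(histpdf)  # equivalent to w.import(histpdf) in C++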
def evalC1Likelihood(self, w, testdata, c0, c1, c_eval=0, c_min=0.01,
                     c_max=0.2, use_log=False, true_dist=False, vars_g=None,
                     npoints=50, samples_ids=None, weights_func=None,
                     coef_index=0):
    if true_dist:
        vars = ROOT.TList()
        for var in vars_g:
            vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
    else:
        x = None
    score = ROOT.RooArgSet(w.var('score'))
    if use_log:
        evaluateRatio = self.evaluateLogDecomposedRatio
        post = 'log'
    else:
        evaluateRatio = self.evaluateDecomposedRatio
        post = ''
    csarray = np.linspace(c_min, c_max, npoints)
    decomposedLikelihood = np.zeros(npoints)
    trueLikelihood = np.zeros(npoints)
    c1s = np.zeros(c0.shape[0])
    pre_pdf = [[], []]
    pre_dist = [[], []]
    for k in range(self.nsamples):
        pre_pdf[0].append([])
        pre_pdf[1].append([])
        pre_dist[0].append([])
        pre_dist[1].append([])
        for j in range(self.nsamples):
            index_k, index_j = (self.basis_indexes[k], self.basis_indexes[j])
            if k != j:
                f0pdf = w.function('bkghistpdf_{0}_{1}'.format(index_k, index_j))
                f1pdf = w.function('sighistpdf_{0}_{1}'.format(index_k, index_j))
                data = testdata
                if self.preprocessing:
                    data = preProcessing(testdata,
                                         self.dataset_names[min(index_k, index_j)],
                                         self.dataset_names[max(index_k, index_j)],
                                         self.scaler)
                outputs = predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(
                    self.dir, self.model_g, self.c1_g, self.model_file,
                    index_k, index_j), data, model_g=self.model_g, clf=self.clf)
                f0pdfdist = np.array([self.evalDist(score, f0pdf, [xs])
                                      for xs in outputs])
                f1pdfdist = np.array([self.evalDist(score, f1pdf, [xs])
                                      for xs in outputs])
                pre_pdf[0][k].append(f0pdfdist)
                pre_pdf[1][k].append(f1pdfdist)
            else:
                pre_pdf[0][k].append(None)
                pre_pdf[1][k].append(None)
            if true_dist:
                f0 = w.pdf('f{0}'.format(index_k))
                f1 = w.pdf('f{0}'.format(index_j))
                if len(testdata.shape) > 1:
                    f0dist = np.array([self.evalDist(x, f0, xs) for xs in testdata])
                    f1dist = np.array([self.evalDist(x, f1, xs) for xs in testdata])
                else:
                    f0dist = np.array([self.evalDist(x, f0, [xs]) for xs in testdata])
                    f1dist = np.array([self.evalDist(x, f1, [xs]) for xs in testdata])
                pre_dist[0][k].append(f0dist)
                pre_dist[1][k].append(f1dist)
    indices = np.ones(testdata.shape[0], dtype=bool)
    ratiosList = []
    samples = []
    # This is needed for the calibration of the full ratios:
    # for i, sample in enumerate(self.dataset_names):
    #     samples.append(np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(
    #         self.dir, 'mlp', self.c1_g, 'data', sample)))
    # cross_section = self.cross_section / np.sum(self.cross_section)
    n_eff_ratio = np.zeros(csarray.shape[0])
    n_zeros = np.zeros(csarray.shape[0])
    cross_section = None
    for i, cs in enumerate(csarray):
        if weights_func is not None:
            c1s = weights_func(cs, c1[1]) if coef_index == 0 else weights_func(c1[0], cs)
            print '{0} {1}'.format(cs, c1[1]) if coef_index == 0 else '{0} {1}'.format(c1[0], cs)
            print c1s
        else:
            c1s[:] = c1[:]
            c1s[c_eval] = cs
        if self.cross_section is not None:
            c1s = np.multiply(c1s, self.cross_section)
        # c1s = np.abs(c1s)
        n_eff = c1s.sum()
        n_tot = np.abs(c1s).sum()
        print 'n_eff: {0}, n_tot: {1}, n_eff/n_tot: {2}'.format(
            n_eff, n_tot, n_eff / n_tot)
        c1s = c1s / c1s.sum()
        decomposedRatios, trueRatios = evaluateRatio(
            w, testdata, x=x, plotting=False, roc=False, c0arr=c0, c1arr=c1s,
            true_dist=true_dist, pre_dist=pre_dist, pre_evaluation=pre_pdf,
            cross_section=cross_section)
        decomposedRatios = 1. / decomposedRatios
        n_eff_ratio[i] = n_eff / n_tot
        n_zeros[i] = decomposedRatios[decomposedRatios < 0.].shape[0]
        print decomposedRatios[decomposedRatios < 0.].shape
        # calibratedRatios = self.calibrateFullRatios(w, decomposedRatios,
        #     c0, c1s, debug=debug, samples_data=samples, index=i)
        # saveFig(decomposedRatios2, [calibratedRatios],
        #     makePlotName('calibrated_{0}'.format(i), 'ratio', type='scat',
        #     dir=self.dir, model_g=self.model_g, c1_g=self.c1_g),
        #     scatter=True, axis=['composed ratio', 'composed calibrated'],
        #     dir=self.dir, model_g=self.model_g)
        ratiosList.append(decomposedRatios)
        # indices = np.logical_and(indices, decomposedRatios > 0.)
    for i, cs in enumerate(csarray):
        decomposedRatios = ratiosList[i]
        if not use_log:
            if samples_ids is not None:
                ratios = decomposedRatios
                ids = samples_ids
                decomposedLikelihood[i] = (np.dot(
                    np.log(ratios), np.array([c1[x] for x in ids]))).sum()
            else:
                decomposedRatios[decomposedRatios < 0.] = 1.0
                decomposedLikelihood[i] = -np.log(decomposedRatios).sum()
                print decomposedLikelihood[i]
                trueLikelihood[i] = -np.log(trueRatios).sum()
        else:
            decomposedLikelihood[i] = decomposedRatios.sum()
            trueLikelihood[i] = trueRatios.sum()
    decomposedLikelihood = decomposedLikelihood - decomposedLikelihood.min()
    # n_eff/n_zeros diagnostics, kept for reference:
    # saveFig(csarray, [n_eff_ratio, n_zeros / n_zeros.max()],
    #     makePlotName('eff_ratio', 'zeros', type=post + 'plot_g2'),
    #     labels=['n_eff/n_tot', 'zeros/{0}'.format(n_zeros.max())],
    #     axis=['g2', 'values'], marker=True, dir=self.dir,
    #     marker_value=c1[0], title='#zeros and n_eff/n_tot given g2',
    #     print_pdf=True, model_g=self.model_g)
    # saveFig(n_eff_ratio, [n_zeros / n_zeros.max()],
    #     makePlotName('eff_ratio', 'zeros', type='scat', dir=self.dir,
    #     model_g=self.model_g, c1_g=self.c1_g), scatter=True,
    #     axis=['n_eff/n_tot', '#zeros/{0}'.format(n_zeros.max())],
    #     dir=self.dir, model_g=self.model_g,
    #     title='# zeros given n_eff/n_tot ratio')
    if true_dist:
        trueLikelihood = trueLikelihood - trueLikelihood.min()
        saveFig(csarray, [decomposedLikelihood, trueLikelihood],
                makePlotName('comp', 'train', type=post + 'likelihood'),
                labels=['decomposed', 'true'], axis=['c1[0]', '-ln(L)'],
                marker=True, dir=self.dir, marker_value=c1[0],
                title='c1[0] Fitting', print_pdf=True)
        return (csarray[trueLikelihood.argmin()],
                csarray[decomposedLikelihood.argmin()])
    else:
        saveFig(csarray, [decomposedLikelihood],
                makePlotName('comp', 'train', type='likelihood_g2'),
                labels=['decomposed'], axis=['g2', '-ln(L)'], marker=True,
                dir=self.dir, marker_value=c1[c_eval], title='g2 Fitting',
                print_pdf=True, model_g=self.model_g)
        pdb.set_trace()
        return (0., csarray[decomposedLikelihood.argmin()])
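# The n_eff/n_tot diagnostic used throughout measures how much the morphing
# weights cancel: n_eff = sum(w_i) and n_tot = sum(|w_i|), so the ratio is 1
# for all-positive weights and approaches 0 when large positive and negative
# weights nearly cancel (which is when the decomposed ratio becomes
# unreliable, hence the hardcoded penalties above). A numeric illustration:

import numpy as np

def _n_eff_demo():
    w_ok = np.array([0.5, 0.3, 0.2])
    w_bad = np.array([5.0, -4.9, 0.1])
    return (w_ok.sum() / np.abs(w_ok).sum(),    # 1.0  -> well-behaved point
            w_bad.sum() / np.abs(w_bad).sum())  # 0.02 -> heavy cancellation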