def makeModelND(vars_g,cov_l=cov_g,mu_l=mu_g, workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', verbose_printing=False): ''' RooFit statistical model for the data ''' # Statistical model w = ROOT.RooWorkspace('w') print 'Generating initial distributions' cov_m = [] mu_m = [] mu_str = [] cov_root = [] vec = [] argus = ROOT.RooArgList() #features for i,var in enumerate(vars_g): w.factory('{0}[{1},{2}]'.format(var,-25,30)) argus.add(w.var(var)) for glob in range(2): # generate covariance matrix cov_m.append(np.matrix(cov_l[glob])) cov_root.append(ROOT.TMatrixDSym(len(vars_g))) for i,var1 in enumerate(vars_g): for j,var2 in enumerate(vars_g): cov_root[-1][i][j] = cov_m[-1][i,j] getattr(w,'import')(cov_root[-1],'cov{0}'.format(glob)) # generate mu vector mu_m.append(np.array(mu_l[glob])) vec.append(ROOT.TVectorD(len(vars_g))) for i, mu in enumerate(mu_m[-1]): vec[-1][i] = mu mu_str.append(','.join([str(mu) for mu in mu_m[-1]])) # multivariate gaussian gaussian = ROOT.RooMultiVarGaussian('f{0}'.format(glob), 'f{0}'.format(glob),argus,vec[-1],cov_root[-1]) getattr(w,'import')(gaussian) # Check Model w.Print() w.writeToFile('{0}/{1}'.format(dir,workspace)) if verbose_printing == True: printFrame(w,['x0','x1','x7','x8'],[w.pdf('f0'),w.pdf('f1')],'distributions',['f0','f1'] ,dir=dir,model_g=model_g,range=[-15,20],title='Distributions',x_text='x0',y_text='p(x)',print_pdf=True) return w
def checkCrossSection(c1,cross_section,samples,target,dir,c1_g,model_g,feature=0):
    '''
    Sanity check of the cross-section weighting: compare the unit-normalized
    histogram of one feature of the target dataset against the weighted sum
    of the unit-normalized histograms of the sample datasets, where sample i
    is weighted by c1[i]*cross_section[i] / sum(c1*cross_section).  Both
    curves are drawn on a single frame via printFrame.

    c1, cross_section -- per-sample coupling values and cross sections
    samples           -- list of sample dataset name suffixes
    target            -- target dataset name suffix
    feature           -- column of the data files to histogram
    '''
    w = ROOT.RooWorkspace('w')
    # Signed normalizer: weights below may be negative for negative couplings.
    # (Fix: removed a dead np.abs(...) assignment that was immediately
    # overwritten by this signed version; behavior is unchanged.)
    normalizer = (np.multiply(c1,cross_section)).sum()
    #normalizer = cross_section.sum()
    # load S(1,1.5) data
    data_file = 'data'
    testdata = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir,'mlp',c1_g,data_file,target))
    testdata = testdata[:,feature]
    bins = 300
    low = 0.
    high = 250.
    w.factory('score[{0},{1}]'.format(low,high))
    s = w.var('score')
    # unit-normalized histogram of the target data
    target_hist = ROOT.TH1F('targethist','targethist',bins,low,high)
    for val in testdata:
        target_hist.Fill(val)
    norm = 1./target_hist.Integral()
    target_hist.Scale(norm)
    samples_hists = []
    sum_hist = ROOT.TH1F('sampleshistsum','sampleshistsum',bins,low,high)
    for i,sample in enumerate(samples):
        samples_hist = ROOT.TH1F('sampleshist{0}'.format(i),'sampleshist',bins,low,high)
        testdata = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir,'mlp',c1_g,data_file,sample))
        testdata = testdata[:,feature]
        # Signed weight (same dead-np.abs removal as for normalizer above).
        weight = (c1[i] * cross_section[i])/normalizer
        for val in testdata:
            samples_hist.Fill(val)
            #samples_hist.Fill(val,weight)
        # normalize each sample histogram, then accumulate it with its weight
        norm = 1./samples_hist.Integral()
        samples_hist.Scale(norm)
        samples_hists.append(samples_hist)
        sum_hist.Add(samples_hist,weight)
    # wrap both histograms as RooHistFunc objects so printFrame can draw them
    target_datahist = ROOT.RooDataHist('{0}datahist'.format('target'),'histtarget', ROOT.RooArgList(s),target_hist)
    target_histpdf = ROOT.RooHistFunc('{0}histpdf'.format('target'),'histtarget', ROOT.RooArgSet(s), target_datahist, 0)
    #xarray = np.linspace(low, high, bins)
    #score = ROOT.RooArgSet(s)
    #test_values = np.array([evalDist(score,target_histpdf,[xs]) for xs in xarray])
    samples_datahist = ROOT.RooDataHist('{0}datahist'.format('samples'),'histsamples', ROOT.RooArgList(s),sum_hist)
    samples_histpdf = ROOT.RooHistFunc('{0}histpdf'.format('samples'),'histsamples', ROOT.RooArgSet(s), samples_datahist, 0)
    printFrame(w,['score'],[target_histpdf,samples_histpdf],'check_cross_section_{0}'.format(feature),['real','weighted'], dir=dir, model_g=model_g,title='cross section check',x_text='x',y_text='dN')
def makeModel(c0,c1,cov_l=cov_g,mu_l=mu_g, workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', c1_g='',verbose_printing=False):
    '''
    RooFit statistical model for the data: three 1-D component pdfs
    f0/f1/f2 over x and two mixture models
        F0 = c0[0]*f0 + c0[1]*f1 + remainder*f2
        F1 = c1[0]*f0 + c1[1]*f1 + remainder*f2
    written to {dir}/{workspace}.

    c0, c1 -- first two mixture coefficients of the background-only and
              background+signal models (the f2 coefficient is implicit)
    '''
    # Statistical model
    w = ROOT.RooWorkspace('w')
    #w.factory("EXPR::f1('cos(x)**2 + .01',x)")
    w.factory("EXPR::f2('exp(x*-1)',x[0,5])")
    w.factory("EXPR::f1('0.3 + exp(-(x-5)**2/5.)',x)")
    w.factory("EXPR::f0('exp(-(x-2.5)**2/1.)',x)")
    #w.factory("EXPR::f2('exp(-(x-2)**2/2)',x)")
    w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0],c0[1]))
    w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0],c1[1]))
    # Check Model
    w.Print()
    # Fix: honor the workspace parameter instead of a hard-coded file name
    # (the hard-coded name equaled the default, so default callers are
    # unaffected; non-default workspace names now work as in makeModelND).
    w.writeToFile('{0}/{1}'.format(dir,workspace))
    if verbose_printing == True:
        printFrame(w,['x'],[w.pdf('f0'),w.pdf('f1'),w.pdf('f2')],'decomposed_model',['f0','f1','f2'] ,dir=dir,model_g=model_g,range=[-15,20],title='Single distributions',x_text='x0',y_text='p(x)', print_pdf=True)
        printFrame(w,['x'],[w.pdf('F0'),w.pdf('F1')],'full_model',['Bkg','Bkg+Signal'], dir=dir,model_g=model_g,range=[-15,20],title='Composed model',x_text='x0',y_text='p(x)',print_pdf=True)
        # NOTE(review): 'f0' is passed as a plain string here, unlike the
        # w.pdf(...) objects above -- confirm printFrame resolves names
        printFrame(w,['x'],[w.pdf('F1'),'f0'],'full_signal', ['Bkg','Signal'], dir=dir,model_g=model_g,range=[-15,20],title='Background and signal',x_text='x0',y_text='p(x)', print_pdf=True)
def makeModel( workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', verbose_printing=False): ''' RooFit statistical model for the data ''' # Statistical model w = ROOT.RooWorkspace('w') #w.factory("EXPR::f1('cos(x)**2 + .01',x)") w.factory("EXPR::f0('exp(-(x-2.5)**2/1.)',x[0,10])") w.factory("EXPR::f1('exp(-(x-5.5)**2/5.)',x)") #w.factory("SUM::f2(c1[0.5]*f0,c2[0.5]*f1)") # Check Model w.Print() w.writeToFile('{0}/{1}'.format(dir,workspace)) if verbose_printing == True: printFrame(w,['x'],[w.pdf('f0'),w.pdf('f1')],'transfered',['gaussian','transfered'] ,dir=dir,model_g=model_g,range=[-15,20],title='Single distributions',x_text='x0',y_text='p(x)', print_pdf=True)
def fit(input_workspace,dir,model_g='mlp',c1_g='breast',data_file='data', model_file='train',verbose_printing=True):
    '''
    Build (or reuse) a RooWorkspace and fill it with normalized histograms
    and RooHistFunc densities of the trained classifier's score on the
    training data, split into signal (target==1) and background (target==0).
    The workspace is written back to {dir}/{input_workspace}.

    input_workspace -- workspace file name to load/save; if None a fresh
                       workspace is created (NOTE(review): the final
                       writeToFile then targets '{dir}/None' -- confirm
                       callers always pass a name)
    data_file       -- suffix of the training file {dir}/train_{data_file}.dat
    model_file      -- prefix of the pickled classifier used by predict()
    '''
    bins = 80
    low = 0.
    high = 1.
    if input_workspace <> None:
        f = ROOT.TFile('{0}/{1}'.format(dir,input_workspace))
        w = f.Get('w')
        # TODO test this when workspace is present
        w = ROOT.RooWorkspace('w') if w == None else w
        f.Close()
    else:
        w = ROOT.RooWorkspace('w')
    w.Print()
    print 'Generating Score Histograms'
    # score variable over which the classifier-output histograms are built
    w.factory('score[{0},{1}]'.format(low,high))
    s = w.var('score')
    def saveHisto(w,outputs,s,bins,low,high,k='F0',j='F1'):
        # Fill one normalized histogram + dataset per class ('sig'/'bkg')
        # from the classifier outputs and import them into the workspace.
        print 'Estimating {0} {1}'.format(k,j)
        for l,name in enumerate(['sig','bkg']):
            data = ROOT.RooDataSet('{0}data_{1}_{2}'.format(name,k,j),"data", ROOT.RooArgSet(s))
            hist = ROOT.TH1F('{0}hist_{1}_{2}'.format(name,k,j),'hist',bins,low,high)
            values = outputs[l]
            #values = values[self.findOutliers(values)]
            for val in values:
                hist.Fill(val)
                s.setVal(val)
                data.add(ROOT.RooArgSet(s))
            # normalize to unit integral before building the density
            norm = 1./hist.Integral()
            hist.Scale(norm)
            s.setBins(bins)
            datahist = ROOT.RooDataHist('{0}datahist_{1}_{2}'.format(name,k,j),'hist', ROOT.RooArgList(s),hist)
            histpdf = ROOT.RooHistFunc('{0}histpdf_{1}_{2}'.format(name,k,j),'hist', ROOT.RooArgSet(s), datahist, 1)
            getattr(w,'import')(hist)
            getattr(w,'import')(data)
            getattr(w,'import')(datahist) # work around for morph = w.import(morph)
            getattr(w,'import')(histpdf) # work around for morph = w.import(morph)
            score_str = 'score'
            # Calculate the density of the classifier output using kernel density
            #w.factory('KeysPdf::{0}dist_{1}_{2}({3},{0}data_{1}_{2},RooKeysPdf::NoMirror,2)'.format(name,k,j,score_str))
    # Full model
    data = np.loadtxt('{0}/train_{1}.dat'.format(dir,data_file))
    traindata = data[:,:-1]
    targetdata = data[:,-1]
    numtrain = traindata.shape[0]
    size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1
    # classifier scores for the signal rows (target==1) and background rows
    outputs = [predict('/afs/cern.ch/work/j/jpavezse/private/transfer_learning/{0}_F0_F1.pkl'.format(model_file),traindata[targetdata==1],model_g=model_g),
               predict('/afs/cern.ch/work/j/jpavezse/private/transfer_learning/{0}_F0_F1.pkl'.format(model_file),traindata[targetdata==0],model_g=model_g)]
    saveHisto(w,outputs,s, bins, low, high)
    if verbose_printing == True:
        printFrame(w,['score'],[w.function('sighistpdf_F0_F1'),w.function('bkghistpdf_F0_F1')], makePlotName('full','all',type='hist',dir=dir,c1_g=c1_g,model_g=model_g),['signal','bkg'], dir=dir,model_g=model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions')
    w.writeToFile('{0}/{1}'.format(dir,input_workspace))
    w.Print()
def computeRatios(workspace,data_file,model_file,dir,model_g,c1_g,true_dist=False, vars_g=None):
    '''
    Use the computed score densities to compute the ratio test.

    Loads the workspace produced by fit(), evaluates the trained score
    densities (sighistpdf_F0_F1 / bkghistpdf_F0_F1) on the training data to
    form density ratios, compares them with the true pdf ratio when
    true_dist is True, histograms the ratios per class, draws ROC-style
    rejection curves, and finally uses the ratio as per-event weights to
    "transfer" the f1 sample into f0 feature by feature.

    true_dist -- if True, true pdfs f0/f1 are available in the workspace
    vars_g    -- list of observable names (required when true_dist is True)
    '''
    f = ROOT.TFile('{0}/{1}'.format(dir,workspace))
    w = f.Get('w')
    f.Close()
    print 'Calculating ratios'
    npoints = 50
    score = ROOT.RooArgSet(w.var('score'))
    getRatio = singleRatio
    if true_dist == True:
        vars = ROOT.TList()
        for var in vars_g:
            vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
    # NN trained on complete model
    F0pdf = w.function('bkghistpdf_F0_F1')
    F1pdf = w.function('sighistpdf_F0_F1')
    data = np.loadtxt('{0}/train_{1}.dat'.format(dir,data_file))
    testdata = data[:,:-1]
    testtarget = data[:,-1]
    '''
    # Make ratio considering tumor size unknown
    ts_idx = 2
    target = testdata[0]
    testdata_size = np.array([x for x in testdata if (np.delete(x,ts_idx) == np.delete(target,ts_idx)).all()])
    '''
    # 1-D case: compare trained ratio with the true pdf ratio on a grid
    if true_dist == True and len(vars_g) == 1:
        xarray = np.linspace(1,10,npoints)
        # TODO: Harcoded dist names
        F1dist = np.array([evalDist(x,w.pdf('f1'),[xs]) for xs in xarray])
        F0dist = np.array([evalDist(x,w.pdf('f0'),[xs]) for xs in xarray])
        trueRatio = getRatio(F1dist, F0dist)
        outputs = predict('{0}/{1}_F0_F1.pkl'.format(dir,model_file),xarray,model_g=model_g)
        F1fulldist = np.array([evalDist(score,F1pdf,[xs]) for xs in outputs])
        F0fulldist = np.array([evalDist(score,F0pdf,[xs]) for xs in outputs])
        completeRatio = getRatio(F0fulldist,F1fulldist)
        saveFig(xarray, [completeRatio, trueRatio], makePlotName('all','train',type='ratio'),title='Density Ratios',labels=['Trained', 'Truth'], print_pdf=True,dir=dir)
    # trained ratio on the full training set
    outputs = predict('{0}/{1}_F0_F1.pkl'.format(dir,model_file),testdata,model_g=model_g)
    F1fulldist = np.array([evalDist(score,F1pdf,[xs]) for xs in outputs])
    F0fulldist = np.array([evalDist(score,F0pdf,[xs]) for xs in outputs])
    # NOTE(review): argument order here (F1,F0) differs from the grid case
    # above (F0,F1) -- confirm which orientation is intended
    completeRatio = getRatio(F1fulldist,F0fulldist)
    complete_target = testtarget
    #Histogram F0-f0 for composed, full and true
    # Removing outliers
    numtest = completeRatio.shape[0]
    #decomposedRatio[decomposedRatio < 0.] = completeRatio[decomposedRatio < 0.]
    complete_outliers = np.zeros(numtest,dtype=bool)
    # findOutliers returns the boolean mask of entries to KEEP
    complete_outliers = findOutliers(completeRatio)
    complete_target = testtarget[complete_outliers]
    completeRatio = completeRatio[complete_outliers]
    bins = 70
    low = 0.6
    high = 1.2
    # per-class ratio histograms (range padded by 10 bins on each side)
    for l,name in enumerate(['sig','bkg']):
        minimum = completeRatio[complete_target == 1-l].min()
        maximum = completeRatio[complete_target == 1-l].max()
        low = minimum - ((maximum - minimum) / bins)*10
        high = maximum + ((maximum - minimum) / bins)*10
        w.factory('ratio{0}[{1},{2}]'.format(name, low, high))
        ratio_var = w.var('ratio{0}'.format(name))
        numtest = completeRatio.shape[0]
        hist = ROOT.TH1F('{0}hist_F0_f0'.format(name),'hist',bins,low,high)
        for val in completeRatio[complete_target == 1-l]:
            hist.Fill(val)
        datahist = ROOT.RooDataHist('{0}datahist_F0_f0'.format(name),'hist', ROOT.RooArgList(ratio_var),hist)
        ratio_var.setBins(bins)
        histpdf = ROOT.RooHistFunc('{0}histpdf_F0_f0'.format(name),'hist', ROOT.RooArgSet(ratio_var), datahist, 0)
        histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooBinIntegrator')
        getattr(w,'import')(hist)
        getattr(w,'import')(datahist) # work around for morph = w.import(morph)
        getattr(w,'import')(histpdf) # work around for morph = w.import(morph)
        #print '{0} {1} {2}'.format(curr,name,hist.Integral())
        # once both classes are imported, print them on one frame
        if name == 'bkg':
            all_ratios_plots = [w.function('sighistpdf_F0_f0'), w.function('bkghistpdf_F0_f0')]
            all_names_plots = ['sig','bkg']
            printFrame(w,['ratiosig','ratiobkg'],all_ratios_plots, makePlotName('ratio','comparison',type='hist',dir=dir,model_g=model_g,c1_g=c1_g),all_names_plots,dir=dir,model_g=model_g,y_text='Count',title='Histograms for ratios',x_text='ratio value',print_pdf=True)
    #completeRatio = np.log(completeRatio)
    # shift/scale ratios into [0,1] for the rejection-curve plot
    completeRatio = completeRatio + np.abs(completeRatio.min())
    ratios_list = completeRatio / completeRatio.max()
    legends_list = ['composed','full']
    makeSigBkg([ratios_list],[complete_target],makePlotName('comp','all',type='sigbkg',dir=dir,model_g=model_g,c1_g=c1_g),dir=dir,model_g=model_g,print_pdf=True,legends=legends_list,title='Signal-Background rejection curves')
    # Make transfer learning
    data = np.loadtxt('{0}/train_{1}.dat'.format(dir,data_file))
    # Transforming f1 into f0
    data_f1 = data[data[:,-1] == 0.]
    data_f0 = data[data[:,-1] == 1.]
    testdata = data_f1[:,:-1]
    testtarget = data_f1[:,-1]
    '''
    # Make ratio considering tumor size unknown
    ts_idx = 2
    target = testdata[0]
    testdata_size = np.array([x for x in testdata if (np.delete(x,ts_idx) == np.delete(target,ts_idx)).all()])
    pdb.set_trace()
    '''
    xarray = testdata
    # trained and true ratios evaluated on the f1 sample (used as weights)
    outputs = predict('{0}/{1}_F0_F1.pkl'.format(dir,model_file),xarray,model_g=model_g)
    F1fulldist = np.array([evalDist(score,F1pdf,[xs]) for xs in outputs])
    F0fulldist = np.array([evalDist(score,F0pdf,[xs]) for xs in outputs])
    completeRatio = getRatio(F0fulldist,F1fulldist)
    if len(vars_g) == 1:
        F1dist = np.array([evalDist(x,w.pdf('f1'),[xs]) for xs in xarray])
        F0dist = np.array([evalDist(x,w.pdf('f0'),[xs]) for xs in xarray])
    else:
        F1dist = np.array([evalDist(x,w.pdf('f1'),xs) for xs in xarray])
        F0dist = np.array([evalDist(x,w.pdf('f0'),xs) for xs in xarray])
    trueRatio = getRatio(F1dist, F0dist)
    trueIndexes = findOutliers(trueRatio)
    completeIndexes = findOutliers(completeRatio)
    #indexes = np.logical_and(trueIndexes,completeIndexes)
    indexes = completeIndexes
    data_f1_red = data_f1
    #trueRatio = trueRatio[indexes]
    #completeRatio = completeRatio[indexes]
    #data_f1_red = data_f1[indexes]
    # one transfer plot per feature (first 10 features)
    for f in range(10):
        feature = f
        # Transfering distributions
        # Doing histogram manipulation
        fig,ax = plt.subplots()
        colors = ['b-','r-','k-']
        colors_rgb = ['blue','red','black']
        hist,bins = np.histogram(data_f1[:,feature],bins=20, range=(0.,10.),density=True)
        hist_transfered,bins_1 = np.histogram(data_f1_red[:,feature],weights=trueRatio,bins=20, range=(0.,10.),density=True)
        hist_transfered_clf,bins_2 = np.histogram(data_f1_red[:,feature],bins=20,weights=completeRatio, range=(0.,10.),density=True)
        hist0,bins0 = np.histogram(data_f0[:,feature], bins=20, range=(0.,10.),density=True)
        #hist, bins = ax.hist(data_f0[:,0],color=colors_rgb[0],label='true',bins=50,histtype='stepfilled',normed=1, alpha=0.5,range=[0,100])
        widths = np.diff(bins)
        #hist_transfered = hist*trueRatio
        #hist_transfered_clf = hist*completeRatio
        ax.bar(bins[:-1], hist0,widths,label='f0',alpha=0.5,color='red')
        #ax.bar(bins[:-1], hist_transfered,widths,label='f1 transfered (true)',
        #  alpha=0.5,color='blue')
        ax.bar(bins[:-1], hist_transfered_clf,widths,label='f1 transfered (trained)', alpha=0.5,color='green')
        ax.legend(frameon=False,fontsize=11)
        ax.set_xlabel('x')
        ax.set_ylabel('p(x)')
        if len(vars_g) > 1:
            ax.set_title('Transfered distributions feature {0}'.format(feature))
        else:
            ax.set_title('Transfered distributions')
        file_plot = makePlotName('all','transf',type='hist_v{0}'.format(feature),model_g=model_g)
        fig.savefig('{0}/plots/{1}/{2}.png'.format(dir,model_g,file_plot))
def fit(self, data_file='test',importance_sampling=False, true_dist=True,vars_g=None):
    '''
    Create pdfs for the classifier score to be used later on the ratio test.

    The input workspace is only needed in case there exist true pdfs for the
    distributions; the models being used are
    ./model/{model_g}/{c1_g}/{model_file}_i_j.pkl and the data files are
    ./data/{model_g}/{c1_g}/{data_file}_i_j.dat.

    Builds normalized score histograms and RooHistFunc densities for every
    pairwise classifier (k,j) over 'score' and for the full F0-vs-F1
    classifier over 'scoref', imports them into the workspace and writes it
    to {self.dir}/{self.workspace}.
    '''
    bins = 40
    low = 0.
    high = 1.
    if self.input_workspace <> None:
        #f = ROOT.TFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace))
        f = ROOT.TFile('{0}/{1}'.format(self.dir,self.workspace))
        w = f.Get('w')
        # TODO test this when workspace is present
        w = ROOT.RooWorkspace('w') if w == None else w
        f.Close()
    else:
        w = ROOT.RooWorkspace('w')
    w.Print()
    print 'Generating Score Histograms'
    w.factory('score[{0},{1}]'.format(low,high))
    s = w.var('score')
    if importance_sampling == True:
        if true_dist == True:
            vars = ROOT.TList()
            for var in vars_g:
                vars.Add(w.var(var))
            x = ROOT.RooArgSet(vars)
        else:
            x = None
    #This is because most of the data of the full model concentrate around 0
    bins_full = 40
    low_full = 0.
    high_full = 1.
    w.factory('scoref[{0},{1}]'.format(low_full, high_full))
    s_full = w.var('scoref')
    # accumulators filled by saveHistos for later verbose plotting
    histos = []
    histos_names = []
    inv_histos = []
    inv_histos_names = []
    sums_histos = []
    def saveHistos(w,outputs,s,bins,low,high,pos=None,importance_sampling=False,importance_data=None, importance_outputs=None):
        # Fill one normalized histogram + dataset + RooHistFunc per class
        # ('sig'/'bkg') for the pair pos=(k,j), or for the full model F0/F1
        # when pos is None, and import everything into the workspace.
        if pos <> None:
            k,j = pos
        else:
            k,j = ('F0','F1')
        print 'Estimating {0} {1}'.format(k,j)
        for l,name in enumerate(['sig','bkg']):
            data = ROOT.RooDataSet('{0}data_{1}_{2}'.format(name,k,j),"data", ROOT.RooArgSet(s))
            hist = ROOT.TH1F('{0}hist_{1}_{2}'.format(name,k,j),'hist',bins,low,high)
            values = outputs[l]
            #values = values[self.findOutliers(values)]
            for val in values:
                hist.Fill(val)
                s.setVal(val)
                data.add(ROOT.RooArgSet(s))
            # normalize to unit integral before building the density
            norm = 1./hist.Integral()
            hist.Scale(norm)
            s.setBins(bins)
            datahist = ROOT.RooDataHist('{0}datahist_{1}_{2}'.format(name,k,j),'hist', ROOT.RooArgList(s),hist)
            #histpdf = ROOT.RooHistPdf('{0}histpdf_{1}_{2}'.format(name,k,j),'hist',
            #  ROOT.RooArgSet(s), datahist, 1)
            histpdf = ROOT.RooHistFunc('{0}histpdf_{1}_{2}'.format(name,k,j),'hist', ROOT.RooArgSet(s), datahist, 1)
            #histpdf.setUnitNorm(True)
            #testvalues = np.array([self.evalDist(ROOT.RooArgSet(s), histpdf, [xs]) for xs in values])
            #histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooBinIntegrator')
            #print 'INTEGRAL'
            #print histpdf.createIntegral(ROOT.RooArgSet(s)).getVal()
            #print histpdf.Integral()
            #histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooAdaptiveGaussKronrodIntegrator1D')
            getattr(w,'import')(hist)
            getattr(w,'import')(data)
            getattr(w,'import')(datahist) # work around for morph = w.import(morph)
            getattr(w,'import')(histpdf) # work around for morph = w.import(morph)
            score_str = 'scoref' if pos == None else 'score'
            # Calculate the density of the classifier output using kernel density
            #w.factory('KeysPdf::{0}dist_{1}_{2}({3},{0}data_{1}_{2},RooKeysPdf::NoMirror,2)'.format(name,k,j,score_str))
            # Print histograms pdfs and estimated densities
            if self.verbose_printing == True and name == 'bkg' and k <> j:
                full = 'full' if pos == None else 'dec'
                if k < j and k <> 'F0':
                    histos.append([w.function('sighistpdf_{0}_{1}'.format(k,j)), w.function('bkghistpdf_{0}_{1}'.format(k,j))])
                    histos_names.append(['f{0}-f{1}_f{1}(signal)'.format(k,j), 'f{0}-f{1}_f{0}(background)'.format(k,j)])
                if j < k and k <> 'F0':
                    inv_histos.append([w.function('sighistpdf_{0}_{1}'.format(k,j)), w.function('bkghistpdf_{0}_{1}'.format(k,j))])
                    inv_histos_names.append(['f{0}-f{1}_f{1}(signal)'.format(k,j), 'f{0}-f{1}_f{0}(background)'.format(k,j)])
    if self.scaler == None:
        self.scaler = {}
    # change this
    # one classifier per ordered pair (k,j) of samples
    for k in range(self.nsamples):
        for j in range(self.nsamples):
            if k == j:
                continue
            #if k <> 2 and j <> 2:
            #  continue
            if self.dataset_names <> None:
                name_k, name_j = (self.dataset_names[k], self.dataset_names[j])
            else:
                name_k, name_j = (k,j)
            print 'Loading {0}:{1} {2}:{3}'.format(k,name_k, j,name_j)
            traindata, targetdata = loadData(data_file,name_k,name_j,dir=self.dir,c1_g=self.c1_g, preprocessing=self.preprocessing,scaler=self.scaler,persist=True)
            numtrain = traindata.shape[0]
            size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1
            #output = [predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,k,j),traindata[targetdata == 1],model_g=self.model_g),
            #  predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,k,j),traindata[targetdata == 0],model_g=self.model_g)]
            output = [predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata[targetdata==1],model_g=self.model_g,clf=self.clf),
                      predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata[targetdata==0],model_g=self.model_g,clf=self.clf)]
            saveHistos(w,output,s,bins,low,high,(k,j))
    #w.writeToFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace))
    w.writeToFile('{0}/{1}'.format(self.dir,self.workspace))
    if self.verbose_printing==True:
        # plot the pairwise densities three per frame
        for ind in range(1,(len(histos)/3+1)):
            print_histos = histos[(ind-1)*3:(ind-1)*3+3]
            print_histos_names = histos_names[(ind-1)*3:(ind-1)*3+3]
            printMultiFrame(w,['score']*len(print_histos),print_histos, makePlotName('dec{0}'.format(ind-1),'all',type='hist',dir=self.dir,c1_g=self.c1_g,model_g=self.model_g),print_histos_names, dir=self.dir,model_g=self.model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions')
    # Full model
    traindata, targetdata = loadData(data_file,self.F0_dist,self.F1_dist,dir=self.dir,c1_g=self.c1_g, preprocessing=self.preprocessing, scaler=self.scaler)
    numtrain = traindata.shape[0]
    size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1
    outputs = [predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),traindata[targetdata==1],model_g=self.model_g,clf=self.clf),
               predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),traindata[targetdata==0],model_g=self.model_g,clf=self.clf)]
    #outputs = [predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),traindata[targetdata==1],model_g=self.model_g),
    #  predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),traindata[targetdata==0],model_g=self.model_g)]
    saveHistos(w,outputs,s_full, bins_full, low_full, high_full,importance_sampling=False)
    if self.verbose_printing == True:
        printFrame(w,['scoref'],[w.function('sighistpdf_F0_F1'),w.function('bkghistpdf_F0_F1')], makePlotName('full','all',type='hist',dir=self.dir,c1_g=self.c1_g,model_g=self.model_g),['signal','bkg'], dir=self.dir,model_g=self.model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions')
    #w.writeToFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace))
    w.writeToFile('{0}/{1}'.format(self.dir,self.workspace))
    w.Print()
def makeModelPrivateND(vars_g,c0, c1, n_private=3, coeffs=coeffs_g,cov_l=cov_g, mu_l=mu_g, workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp', c1_g='',verbose_printing=False,load_cov=False): ''' RooFit statistical model for the data ''' # Statistical model w = ROOT.RooWorkspace('w') print 'Generating initial distributions' cov_m = [] mu_m = [] mu_str = [] cov_root = [] vec = [] argus = ROOT.RooArgList() # features for i,var in enumerate(vars_g): w.factory('{0}[{1},{2}]'.format(var,-25,30)) argus.add(w.var(var)) n = len(cov_l[0][0]) for glob in range(3): for priv in range(n_private): if load_cov == False: cov_i = np.random.random((n,n)) cov_i = cov_i + cov_i.transpose() cov_i = cov_i + n*np.eye(n) np.savetxt('{0}/covariance_{1}_{2}.txt'.format(dir,glob,priv), cov_i,fmt='%f') else: cov_i = np.matrix(np.loadtxt('{0}/data/covariance_{1}_{2}.txt'.format( dir,glob,priv))) print cov_i # generate covriance matrix cov_m.append(cov_i) cov_root.append(ROOT.TMatrixDSym(len(vars_g))) for i,var1 in enumerate(vars_g): for j,var2 in enumerate(vars_g): if i <= j: cov_root[-1][i][j] = cov_m[-1][i,j] else: cov_root[-1][i][j] = cov_m[-1][j,i] getattr(w,'import')(cov_root[-1],'cov{0}'.format(glob*3 + priv)) # generate mu vectors mu_m.append(np.array(mu_l[glob]) + meansum[glob][priv]) vec.append(ROOT.TVectorD(len(vars_g))) for i, mu in enumerate(mu_m[-1]): vec[-1][i] = mu mu_str.append(','.join([str(mu) for mu in mu_m[-1]])) # create multivariate gaussian gaussian = ROOT.RooMultiVarGaussian('f{0}_{1}'.format(glob,priv), 'f{0}_{1}'.format(glob,priv),argus,vec[-1],cov_root[-1]) getattr(w,'import')(gaussian) # create private mixture model priv_coeffs = np.array(coeffs[glob]) #print 'priv coef {0} {1}'.format(priv_coeffs, priv_coeffs.sum()) sum_str = ','.join(['c_{0}_{1}[{2}]*f{0}_{1}'.format(glob,j,priv_coeffs[j]) for j in range(n_private)]) w.factory('SUM::f{0}({1})'.format(glob,sum_str)) #mixture model 
w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0],c0[1])) w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0],c1[1])) # Check Model w.Print() w.writeToFile('{0}/{1}'.format(dir,workspace)) if verbose_printing == True: printFrame(w,['x0','x1','x2'],[w.pdf('f0'),w.pdf('f1'),w.pdf('f2')],'decomposed_model',['f0','f1','f2'] ,dir=dir,model_g=model_g,range=[-15,20],title='Single distributions',x_text='x0',y_text='p(x)') printFrame(w,['x0','x1','x2'],[w.pdf('F0'),w.pdf('F1')],'full_model',['Bkg','Bkg+Signal'], dir=dir,model_g=model_g,range=[-15,20],title='Composed model',x_text='x0',y_text='p(x)') printFrame(w,['x0','x1','x2'],[w.pdf('F1'),'f0'],'full_signal', ['Bkg','Signal'], dir=dir,model_g=model_g,range=[-15,20],title='Background and signal',x_text='x0',y_text='p(x)') return w