def makeModelND(vars_g,cov_l=cov_g,mu_l=mu_g,
    workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', 
    dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp',
    verbose_printing=False):
  '''
  RooFit statistical model for the data
  
  '''  
  # Statistical model
  w = ROOT.RooWorkspace('w')

  print 'Generating initial distributions'
  cov_m = []
  mu_m = []
  mu_str = []
  cov_root = []
  vec = []
  argus = ROOT.RooArgList() 
  #features
  for i,var in enumerate(vars_g):
    w.factory('{0}[{1},{2}]'.format(var,-25,30))
    argus.add(w.var(var))

  for glob in range(2):
    # generate covariance matrix
    cov_m.append(np.matrix(cov_l[glob]))
    cov_root.append(ROOT.TMatrixDSym(len(vars_g)))
    for i,var1 in enumerate(vars_g):
      for j,var2 in enumerate(vars_g):
        cov_root[-1][i][j] = cov_m[-1][i,j]
    getattr(w,'import')(cov_root[-1],'cov{0}'.format(glob))
    # generate mu vector
    mu_m.append(np.array(mu_l[glob]))
    vec.append(ROOT.TVectorD(len(vars_g)))
    for i, mu in enumerate(mu_m[-1]):
      vec[-1][i] = mu
    mu_str.append(','.join([str(mu) for mu in mu_m[-1]]))
    # multivariate gaussian
    gaussian = ROOT.RooMultiVarGaussian('f{0}'.format(glob),
          'f{0}'.format(glob),argus,vec[-1],cov_root[-1])
    getattr(w,'import')(gaussian)
  # Check Model
  w.Print()

  w.writeToFile('{0}/{1}'.format(dir,workspace))
  if verbose_printing == True:
    printFrame(w,['x0','x1','x7','x8'],[w.pdf('f0'),w.pdf('f1')],'distributions',['f0','f1']
    ,dir=dir,model_g=model_g,range=[-15,20],title='Distributions',x_text='x0',y_text='p(x)',print_pdf=True)


  return w
# Ejemplo n.º 2 (example separator left over from extraction; commented out so the file parses)
# 0
def checkCrossSection(c1,cross_section,samples,target,dir,c1_g,model_g,feature=0):
  '''
  Check the cross-section weighting: compare the distribution of one
  feature in the target dataset against the cross-section weighted sum
  of the same feature in the component samples, where sample i gets
  weight c1[i]*cross_section[i] normalized over all samples.

  c1            -- coupling coefficients, one entry per sample
  cross_section -- cross sections, one entry per sample
  samples       -- dataset name suffixes of the component samples
  target        -- dataset name suffix of the reference dataset
  dir, c1_g     -- path components of '{dir}/data/mlp/{c1_g}/data_*.dat'
  model_g       -- model identifier forwarded to printFrame
  feature       -- column index of the data files to histogram
  '''
  w = ROOT.RooWorkspace('w')
  # Signed normalization of the per-sample weights. NOTE: an earlier
  # variant used np.abs() here (and for each weight below) and was left
  # as dead code; the signed version was the one in effect, so it is kept.
  normalizer = (np.multiply(c1,cross_section)).sum()

  # load S(1,1.5) data
  data_file = 'data'
  testdata = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir,'mlp',c1_g,data_file,target))
  testdata = testdata[:,feature]
  bins = 300
  low = 0.
  high = 250.
  w.factory('score[{0},{1}]'.format(low,high))
  s = w.var('score')
  # histogram of the target dataset, normalized to unit area
  target_hist = ROOT.TH1F('targethist','targethist',bins,low,high)
  for val in testdata:
    target_hist.Fill(val)
  target_hist.Scale(1./target_hist.Integral())

  # weighted sum of the per-sample (unit-normalized) histograms
  samples_hists = []
  sum_hist = ROOT.TH1F('sampleshistsum','sampleshistsum',bins,low,high)
  for i,sample in enumerate(samples):
    samples_hist = ROOT.TH1F('sampleshist{0}'.format(i),'sampleshist',bins,low,high)
    testdata = np.loadtxt('{0}/data/{1}/{2}/{3}_{4}.dat'.format(dir,'mlp',c1_g,data_file,sample))
    testdata = testdata[:,feature]
    weight = (c1[i] * cross_section[i])/normalizer
    for val in testdata:
      samples_hist.Fill(val)
    samples_hist.Scale(1./samples_hist.Integral())
    # keep a reference so ROOT does not lose the histogram
    samples_hists.append(samples_hist)
    sum_hist.Add(samples_hist,weight)

  # wrap both histograms as RooFit functions over 'score' for plotting
  target_datahist = ROOT.RooDataHist('{0}datahist'.format('target'),'histtarget',
        ROOT.RooArgList(s),target_hist)
  target_histpdf = ROOT.RooHistFunc('{0}histpdf'.format('target'),'histtarget',
        ROOT.RooArgSet(s), target_datahist, 0)
  samples_datahist = ROOT.RooDataHist('{0}datahist'.format('samples'),'histsamples',
        ROOT.RooArgList(s),sum_hist)
  samples_histpdf = ROOT.RooHistFunc('{0}histpdf'.format('samples'),'histsamples',
        ROOT.RooArgSet(s), samples_datahist, 0)

  printFrame(w,['score'],[target_histpdf,samples_histpdf],'check_cross_section_{0}'.format(feature),['real','weighted'],
    dir=dir, model_g=model_g,title='cross section check',x_text='x',y_text='dN')
# Ejemplo n.º 3 (example separator left over from extraction; commented out so the file parses)
# 0
def makeModel(c0,c1,cov_l=cov_g,mu_l=mu_g,
    workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', 
    dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp',
    c1_g='',verbose_printing=False):
  '''
  Build a 1-D RooFit mixture model: three component pdfs f0, f1, f2 and
  two mixtures F0 = c0[0]*f0 + c0[1]*f1 + (rest)*f2 and
  F1 = c1[0]*f0 + c1[1]*f1 + (rest)*f2 (last coefficient implicit in SUM).

  c0, c1       -- first two mixture coefficients of F0 and F1
  cov_l, mu_l  -- unused here; kept for signature compatibility with the
                  N-dimensional model builders
  workspace    -- output file name, relative to dir
  Returns the RooWorkspace.
  '''
  # Statistical model
  w = ROOT.RooWorkspace('w')
  # f2 also declares the observable x with range [0,5]; f0/f1 reuse it
  w.factory("EXPR::f2('exp(x*-1)',x[0,5])")
  w.factory("EXPR::f1('0.3 + exp(-(x-5)**2/5.)',x)")
  w.factory("EXPR::f0('exp(-(x-2.5)**2/1.)',x)")
  w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0],c0[1]))
  w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0],c1[1]))
  
  # Check Model
  w.Print()
  # BUG FIX: honor the 'workspace' argument instead of a hard-coded file
  # name; the default value equals the old literal, so default behavior
  # is unchanged.
  w.writeToFile('{0}/{1}'.format(dir,workspace))
  if verbose_printing == True:
    printFrame(w,['x'],[w.pdf('f0'),w.pdf('f1'),w.pdf('f2')],'decomposed_model',['f0','f1','f2']
    ,dir=dir,model_g=model_g,range=[-15,20],title='Single distributions',x_text='x0',y_text='p(x)',
    print_pdf=True)
    printFrame(w,['x'],[w.pdf('F0'),w.pdf('F1')],'full_model',['Bkg','Bkg+Signal'],
    dir=dir,model_g=model_g,range=[-15,20],title='Composed model',x_text='x0',y_text='p(x)',print_pdf=True)
    printFrame(w,['x'],[w.pdf('F1'),'f0'],'full_signal', ['Bkg','Signal'],
    dir=dir,model_g=model_g,range=[-15,20],title='Background and signal',x_text='x0',y_text='p(x)',
    print_pdf=True)
  # return the workspace, consistent with the other model builders
  return w
def makeModel(
    workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', 
    dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp',
    verbose_printing=False):
  '''
  Build a RooFit workspace with two 1-D toy pdfs (f0, f1) over the
  observable x and save it to '{dir}/{workspace}'.
  '''
  workspace_obj = ROOT.RooWorkspace('w')
  # f0 also declares the observable x with range [0,10]; f1 reuses it,
  # so declaration order matters here.
  expressions = [
      "EXPR::f0('exp(-(x-2.5)**2/1.)',x[0,10])",
      "EXPR::f1('exp(-(x-5.5)**2/5.)',x)",
  ]
  for expr in expressions:
    workspace_obj.factory(expr)

  # Inspect and persist the model
  workspace_obj.Print()
  workspace_obj.writeToFile('{0}/{1}'.format(dir,workspace))
  if verbose_printing == True:
    printFrame(workspace_obj,['x'],
      [workspace_obj.pdf('f0'),workspace_obj.pdf('f1')],
      'transfered',['gaussian','transfered'],
      dir=dir,model_g=model_g,range=[-15,20],title='Single distributions',
      x_text='x0',y_text='p(x)',print_pdf=True)
def fit(input_workspace,dir,model_g='mlp',c1_g='breast',data_file='data',
      model_file='train',verbose_printing=True):

  bins = 80
  low = 0.
  high = 1.  
  
  if input_workspace <> None:
    f = ROOT.TFile('{0}/{1}'.format(dir,input_workspace))
    w = f.Get('w')
    # TODO test this when workspace is present
    w = ROOT.RooWorkspace('w') if w == None else w
    f.Close()
  else: 
    w = ROOT.RooWorkspace('w')
  w.Print()

  print 'Generating Score Histograms'

  w.factory('score[{0},{1}]'.format(low,high))
  s = w.var('score')
  
  def saveHisto(w,outputs,s,bins,low,high,k='F0',j='F1'):
    
    print 'Estimating {0} {1}'.format(k,j)
    for l,name in enumerate(['sig','bkg']):
      data = ROOT.RooDataSet('{0}data_{1}_{2}'.format(name,k,j),"data",
          ROOT.RooArgSet(s))
      hist = ROOT.TH1F('{0}hist_{1}_{2}'.format(name,k,j),'hist',bins,low,high)
      values = outputs[l]
      #values = values[self.findOutliers(values)]
      for val in values:
        hist.Fill(val)
        s.setVal(val)
        data.add(ROOT.RooArgSet(s))
      norm = 1./hist.Integral()
      hist.Scale(norm) 
        
      s.setBins(bins)
      datahist = ROOT.RooDataHist('{0}datahist_{1}_{2}'.format(name,k,j),'hist',
            ROOT.RooArgList(s),hist)
      histpdf = ROOT.RooHistFunc('{0}histpdf_{1}_{2}'.format(name,k,j),'hist',
            ROOT.RooArgSet(s), datahist, 1)

      getattr(w,'import')(hist)
      getattr(w,'import')(data)
      getattr(w,'import')(datahist) # work around for morph = w.import(morph)
      getattr(w,'import')(histpdf) # work around for morph = w.import(morph)
      score_str = 'score'
      # Calculate the density of the classifier output using kernel density 
      #w.factory('KeysPdf::{0}dist_{1}_{2}({3},{0}data_{1}_{2},RooKeysPdf::NoMirror,2)'.format(name,k,j,score_str))

  # Full model
  data = np.loadtxt('{0}/train_{1}.dat'.format(dir,data_file)) 
  traindata = data[:,:-1]
  targetdata = data[:,-1]

  numtrain = traindata.shape[0]       
  size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1

  outputs = [predict('/afs/cern.ch/work/j/jpavezse/private/transfer_learning/{0}_F0_F1.pkl'.format(model_file),traindata[targetdata==1],model_g=model_g),
            predict('/afs/cern.ch/work/j/jpavezse/private/transfer_learning/{0}_F0_F1.pkl'.format(model_file),traindata[targetdata==0],model_g=model_g)]

  saveHisto(w,outputs,s, bins, low, high)

  if verbose_printing == True:
    printFrame(w,['score'],[w.function('sighistpdf_F0_F1'),w.function('bkghistpdf_F0_F1')], makePlotName('full','all',type='hist',dir=dir,c1_g=c1_g,model_g=model_g),['signal','bkg'],
  dir=dir,model_g=model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions')
 
  w.writeToFile('{0}/{1}'.format(dir,input_workspace))
  w.Print()
def computeRatios(workspace,data_file,model_file,dir,model_g,c1_g,true_dist=False,
      vars_g=None):
  '''
    Use the computed score densities to compute the likelihood-ratio
    test, compare it to the true ratio when the true pdfs are available,
    and finally use the trained ratio as per-event weights to
    "transfer" the f1 sample into f0 (per-feature histograms are saved
    as png files under {dir}/plots/{model_g}/).

    workspace  -- ROOT file (relative to dir) containing RooWorkspace 'w'
    data_file  -- suffix of the train_{data_file}.dat input file
    model_file -- prefix of the {model_file}_F0_F1.pkl classifier file
    true_dist  -- if True, also evaluate the true pdfs f0/f1 stored in w
    vars_g     -- list of observable names; NOTE(review): it is
                  dereferenced unconditionally further below, so callers
                  apparently must always provide it -- confirm.
  '''

  f = ROOT.TFile('{0}/{1}'.format(dir,workspace))
  w = f.Get('w')
  f.Close()
  

  print 'Calculating ratios'

  npoints = 50

  score = ROOT.RooArgSet(w.var('score'))
  getRatio = singleRatio

  if true_dist == True:
    # NOTE(review): 'x' is only bound when true_dist is True, but it is
    # used unconditionally in the transfer-learning section below --
    # calling with true_dist=False would raise NameError; confirm.
    vars = ROOT.TList()
    for var in vars_g:
      vars.Add(w.var(var))
    x = ROOT.RooArgSet(vars)

  # NN trained on complete model
  F0pdf = w.function('bkghistpdf_F0_F1')
  F1pdf = w.function('sighistpdf_F0_F1')
  # last column of the data file is the class label
  data = np.loadtxt('{0}/train_{1}.dat'.format(dir,data_file)) 
  testdata = data[:,:-1]
  testtarget = data[:,-1]

  '''
  # Make ratio considering tumor size unknown
  ts_idx = 2
  target = testdata[0]
  testdata_size = np.array([x for x in testdata if (np.delete(x,ts_idx) == np.delete(target,ts_idx)).all()])
  '''

  # 1-D case: compare the trained ratio with the true pdf ratio on a grid
  if true_dist == True and len(vars_g) == 1:
      xarray = np.linspace(1,10,npoints)
      # TODO: Harcoded dist names
      F1dist = np.array([evalDist(x,w.pdf('f1'),[xs]) for xs in xarray])
      F0dist = np.array([evalDist(x,w.pdf('f0'),[xs]) for xs in xarray])
      trueRatio = getRatio(F1dist, F0dist)

      outputs = predict('{0}/{1}_F0_F1.pkl'.format(dir,model_file),xarray,model_g=model_g)

      F1fulldist = np.array([evalDist(score,F1pdf,[xs]) for xs in outputs])
      F0fulldist = np.array([evalDist(score,F0pdf,[xs]) for xs in outputs])

      # NOTE(review): argument order (F0,F1) here is the opposite of the
      # (F1,F0) call used for the test data below -- verify intended.
      completeRatio = getRatio(F0fulldist,F1fulldist)

      saveFig(xarray, [completeRatio, trueRatio], makePlotName('all','train',type='ratio'),title='Density Ratios',labels=['Trained', 'Truth'], print_pdf=True,dir=dir)
  
  # ratio evaluated on the full training sample
  outputs = predict('{0}/{1}_F0_F1.pkl'.format(dir,model_file),testdata,model_g=model_g)

  F1fulldist = np.array([evalDist(score,F1pdf,[xs]) for xs in outputs])
  F0fulldist = np.array([evalDist(score,F0pdf,[xs]) for xs in outputs])

  completeRatio = getRatio(F1fulldist,F0fulldist)
  complete_target = testtarget
  #Histogram F0-f0 for composed, full and true

  # Removing outliers
  numtest = completeRatio.shape[0]
  #decomposedRatio[decomposedRatio < 0.] = completeRatio[decomposedRatio < 0.]

  complete_outliers = np.zeros(numtest,dtype=bool)
  complete_outliers = findOutliers(completeRatio)
  complete_target = testtarget[complete_outliers] 
  completeRatio = completeRatio[complete_outliers]

  bins = 70
  # placeholders: low/high are recomputed per histogram inside the loop
  low = 0.6
  high = 1.2

  for l,name in enumerate(['sig','bkg']):
    # data-driven histogram range with a 10-bin margin on each side
    minimum = completeRatio[complete_target == 1-l].min() 
    maximum = completeRatio[complete_target == 1-l].max()

    low = minimum - ((maximum - minimum) / bins)*10
    high = maximum + ((maximum - minimum) / bins)*10
    w.factory('ratio{0}[{1},{2}]'.format(name, low, high))
    ratio_var = w.var('ratio{0}'.format(name))

    numtest = completeRatio.shape[0] 
    hist = ROOT.TH1F('{0}hist_F0_f0'.format(name),'hist',bins,low,high)
    for val in completeRatio[complete_target == 1-l]:
      hist.Fill(val)
    datahist = ROOT.RooDataHist('{0}datahist_F0_f0'.format(name),'hist',
          ROOT.RooArgList(ratio_var),hist)
    ratio_var.setBins(bins)
    histpdf = ROOT.RooHistFunc('{0}histpdf_F0_f0'.format(name),'hist',
          ROOT.RooArgSet(ratio_var), datahist, 0)

    histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooBinIntegrator')
    getattr(w,'import')(hist)
    getattr(w,'import')(datahist) # work around for morph = w.import(morph)
    getattr(w,'import')(histpdf) # work around for morph = w.import(morph)
    #print '{0} {1} {2}'.format(curr,name,hist.Integral())

    if name == 'bkg':
      # 'bkg' is the last iteration, so both pdfs exist at this point
      all_ratios_plots = [w.function('sighistpdf_F0_f0'),
            w.function('bkghistpdf_F0_f0')]
      all_names_plots = ['sig','bkg']
    
  printFrame(w,['ratiosig','ratiobkg'],all_ratios_plots, makePlotName('ratio','comparison',type='hist',dir=dir,model_g=model_g,c1_g=c1_g),all_names_plots,dir=dir,model_g=model_g,y_text='Count',title='Histograms for ratios',x_text='ratio value',print_pdf=True)

  #completeRatio = np.log(completeRatio)
  # shift and rescale the ratio into [0,1] for the rejection curves
  completeRatio = completeRatio + np.abs(completeRatio.min())
  ratios_list = completeRatio / completeRatio.max()
  legends_list = ['composed','full']
  makeSigBkg([ratios_list],[complete_target],makePlotName('comp','all',type='sigbkg',dir=dir,model_g=model_g,c1_g=c1_g),dir=dir,model_g=model_g,print_pdf=True,legends=legends_list,title='Signal-Background rejection curves')

  # Make transfer learning

  data = np.loadtxt('{0}/train_{1}.dat'.format(dir,data_file)) 
  # Transforming f1 into f0
  data_f1 = data[data[:,-1] == 0.]
  data_f0 = data[data[:,-1] == 1.]
  testdata = data_f1[:,:-1]
  testtarget = data_f1[:,-1]

  '''
  # Make ratio considering tumor size unknown
  ts_idx = 2
  target = testdata[0]
  testdata_size = np.array([x for x in testdata if (np.delete(x,ts_idx) == np.delete(target,ts_idx)).all()])
  pdb.set_trace()
  '''

  xarray = testdata

  outputs = predict('{0}/{1}_F0_F1.pkl'.format(dir,model_file),xarray,model_g=model_g)

  F1fulldist = np.array([evalDist(score,F1pdf,[xs]) for xs in outputs])
  F0fulldist = np.array([evalDist(score,F0pdf,[xs]) for xs in outputs])

  completeRatio = getRatio(F0fulldist,F1fulldist)

  # evaluate the true pdfs; 1-D inputs are wrapped in a list per event
  if len(vars_g) == 1:
    F1dist = np.array([evalDist(x,w.pdf('f1'),[xs]) for xs in xarray])
    F0dist = np.array([evalDist(x,w.pdf('f0'),[xs]) for xs in xarray])
  else:
    F1dist = np.array([evalDist(x,w.pdf('f1'),xs) for xs in xarray])
    F0dist = np.array([evalDist(x,w.pdf('f0'),xs) for xs in xarray])

  trueRatio = getRatio(F1dist, F0dist)

  trueIndexes = findOutliers(trueRatio)
  completeIndexes = findOutliers(completeRatio)
  #indexes = np.logical_and(trueIndexes,completeIndexes)
  indexes = completeIndexes
  data_f1_red = data_f1
  #trueRatio = trueRatio[indexes]
  #completeRatio = completeRatio[indexes]
  #data_f1_red = data_f1[indexes]


  # NOTE(review): assumes the data has at least 10 feature columns and
  # values in [0,10] -- confirm against the datasets used.
  for f in range(10):
    feature = f
    # Transfering distributions
    # Doing histogram manipulation
    fig,ax = plt.subplots()
    colors = ['b-','r-','k-']
    colors_rgb = ['blue','red','black']
    
    hist,bins = np.histogram(data_f1[:,feature],bins=20, range=(0.,10.),density=True)


    # f1 reweighted by the true and by the trained ratio
    hist_transfered,bins_1 = np.histogram(data_f1_red[:,feature],weights=trueRatio,bins=20, range=(0.,10.),density=True)
    hist_transfered_clf,bins_2 = np.histogram(data_f1_red[:,feature],bins=20,weights=completeRatio, range=(0.,10.),density=True)
    hist0,bins0 = np.histogram(data_f0[:,feature], bins=20, range=(0.,10.),density=True)

    #hist, bins =  ax.hist(data_f0[:,0],color=colors_rgb[0],label='true',bins=50,histtype='stepfilled',normed=1, alpha=0.5,range=[0,100]) 

    widths = np.diff(bins)
    #hist_transfered = hist*trueRatio
    #hist_transfered_clf = hist*completeRatio

    ax.bar(bins[:-1], hist0,widths,label='f0',alpha=0.5,color='red')
    #ax.bar(bins[:-1], hist_transfered,widths,label='f1 transfered (true)',
    #    alpha=0.5,color='blue')
    ax.bar(bins[:-1], hist_transfered_clf,widths,label='f1 transfered (trained)',
        alpha=0.5,color='green')

    ax.legend(frameon=False,fontsize=11)
    ax.set_xlabel('x') 
    ax.set_ylabel('p(x)') 
    if len(vars_g) > 1:
      ax.set_title('Transfered distributions feature {0}'.format(feature))
    else:
      ax.set_title('Transfered distributions')
    file_plot =  makePlotName('all','transf',type='hist_v{0}'.format(feature),model_g=model_g) 
    fig.savefig('{0}/plots/{1}/{2}.png'.format(dir,model_g,file_plot))
# Ejemplo n.º 7 (example separator left over from extraction; commented out so the file parses)
# 0
  def fit(self, data_file='test',importance_sampling=False, true_dist=True,vars_g=None):
    '''
      Create histogram pdfs of the classifier score, for every pairwise
      classifier (i,j) and for the full F0-F1 classifier, and store them
      in the workspace file '{self.dir}/{self.workspace}'.
      The models being used are ./model/{model_g}/{c1_g}/{model_file}_i_j.pkl
      and the data files are ./data/{model_g}/{c1_g}/{data_file}_i_j.dat.

      data_file           -- suffix of the per-pair data files
      importance_sampling -- if True, build the true-pdf arg set below;
        NOTE(review): the resulting 'x' is not used further in this
        method -- confirm whether this path is still needed.
      true_dist           -- whether true pdfs exist in the workspace
      vars_g              -- observable names (needed with true_dist)
    '''

    bins = 40
    low = 0.
    high = 1.  
    
    if self.input_workspace <> None:
      #f = ROOT.TFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace))
      f = ROOT.TFile('{0}/{1}'.format(self.dir,self.workspace))
     
      w = f.Get('w')
      # TODO test this when workspace is present
      # '== None' (not 'is None') on purpose: PyROOT returns a null
      # proxy for a missing object that only compares equal to None.
      w = ROOT.RooWorkspace('w') if w == None else w
      f.Close()
    else: 
      w = ROOT.RooWorkspace('w')
    w.Print()

    print 'Generating Score Histograms'

    w.factory('score[{0},{1}]'.format(low,high))
    s = w.var('score')
    
    if importance_sampling == True:
      if true_dist == True:
        vars = ROOT.TList()
        for var in vars_g:
          vars.Add(w.var(var))
        x = ROOT.RooArgSet(vars)
      else:
        x = None

    #This is because most of the data of the full model concentrate around 0 
    bins_full = 40
    low_full = 0.
    high_full = 1.
    w.factory('scoref[{0},{1}]'.format(low_full, high_full))
    s_full = w.var('scoref')
    # accumulators filled by the saveHistos closure below for later plotting
    histos = []
    histos_names = []
    inv_histos = []
    inv_histos_names = []
    sums_histos = []
    def saveHistos(w,outputs,s,bins,low,high,pos=None,importance_sampling=False,importance_data=None,
          importance_outputs=None):
      # Fill score histograms/datasets for the classifier pair 'pos'
      # (or the full F0-F1 classifier when pos is None) and import the
      # unit-normalized histogram functions into w. Also appends the
      # created functions to the enclosing histos/inv_histos lists.
      if pos <> None:
        k,j = pos
      else:
        k,j = ('F0','F1')
      print 'Estimating {0} {1}'.format(k,j)
      for l,name in enumerate(['sig','bkg']):
        data = ROOT.RooDataSet('{0}data_{1}_{2}'.format(name,k,j),"data",
            ROOT.RooArgSet(s))
        hist = ROOT.TH1F('{0}hist_{1}_{2}'.format(name,k,j),'hist',bins,low,high)
        values = outputs[l]
        #values = values[self.findOutliers(values)]
        for val in values:
          hist.Fill(val)
          s.setVal(val)
          data.add(ROOT.RooArgSet(s))
        # normalize to unit area so the histogram acts as a density
        norm = 1./hist.Integral()
        hist.Scale(norm) 
          
        s.setBins(bins)
        datahist = ROOT.RooDataHist('{0}datahist_{1}_{2}'.format(name,k,j),'hist',
              ROOT.RooArgList(s),hist)
        #histpdf = ROOT.RooHistPdf('{0}histpdf_{1}_{2}'.format(name,k,j),'hist',
        #      ROOT.RooArgSet(s), datahist, 1)
        histpdf = ROOT.RooHistFunc('{0}histpdf_{1}_{2}'.format(name,k,j),'hist',
              ROOT.RooArgSet(s), datahist, 1)
        #histpdf.setUnitNorm(True)
        #testvalues = np.array([self.evalDist(ROOT.RooArgSet(s), histpdf, [xs]) for xs in values])

        #histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooBinIntegrator')

        #print 'INTEGRAL'
        #print histpdf.createIntegral(ROOT.RooArgSet(s)).getVal()
        #print histpdf.Integral()
      
        #histpdf.specialIntegratorConfig(ROOT.kTRUE).method1D().setLabel('RooAdaptiveGaussKronrodIntegrator1D')

        getattr(w,'import')(hist)
        getattr(w,'import')(data)
        getattr(w,'import')(datahist) # work around for morph = w.import(morph)
        getattr(w,'import')(histpdf) # work around for morph = w.import(morph)
        score_str = 'scoref' if pos == None else 'score'
        # Calculate the density of the classifier output using kernel density 
        #w.factory('KeysPdf::{0}dist_{1}_{2}({3},{0}data_{1}_{2},RooKeysPdf::NoMirror,2)'.format(name,k,j,score_str))

        # Print histograms pdfs and estimated densities
        if self.verbose_printing == True and name == 'bkg' and k <> j:
          full = 'full' if pos == None else 'dec'
          if k < j and k <> 'F0':
            histos.append([w.function('sighistpdf_{0}_{1}'.format(k,j)), w.function('bkghistpdf_{0}_{1}'.format(k,j))])
            histos_names.append(['f{0}-f{1}_f{1}(signal)'.format(k,j), 'f{0}-f{1}_f{0}(background)'.format(k,j)])
          if j < k and k <> 'F0':
            inv_histos.append([w.function('sighistpdf_{0}_{1}'.format(k,j)), w.function('bkghistpdf_{0}_{1}'.format(k,j))])
            inv_histos_names.append(['f{0}-f{1}_f{1}(signal)'.format(k,j), 'f{0}-f{1}_f{0}(background)'.format(k,j)])

    if self.scaler == None:
      self.scaler = {}

    # change this
    # loop over all ordered classifier pairs (k,j), k != j
    for k in range(self.nsamples):
      for j in range(self.nsamples):
        if k == j:
          continue
        #if k <> 2 and j <> 2:
        #  continue
        if self.dataset_names <> None:
          name_k, name_j = (self.dataset_names[k], self.dataset_names[j])
        else:
          name_k, name_j = (k,j)
        print 'Loading {0}:{1} {2}:{3}'.format(k,name_k, j,name_j)
        traindata, targetdata = loadData(data_file,name_k,name_j,dir=self.dir,c1_g=self.c1_g,
            preprocessing=self.preprocessing,scaler=self.scaler,persist=True)
       
        numtrain = traindata.shape[0]       
        size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1
        #output = [predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,k,j),traindata[targetdata == 1],model_g=self.model_g),
        #  predict('/afs/cern.ch/work/j/jpavezse/private/{0}_{1}_{2}.pkl'.format(self.model_file,k,j),traindata[targetdata == 0],model_g=self.model_g)]
        output = [predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata[targetdata==1],model_g=self.model_g,clf=self.clf),
              predict('{0}/model/{1}/{2}/{3}_{4}_{5}.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file,k,j),traindata[targetdata==0],model_g=self.model_g,clf=self.clf)]
        saveHistos(w,output,s,bins,low,high,(k,j))
        #w.writeToFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace))
        # checkpoint the workspace after every pair
        w.writeToFile('{0}/{1}'.format(self.dir,self.workspace))

    if self.verbose_printing==True:
      # plot the pairwise score pdfs in groups of three pairs per canvas
      for ind in range(1,(len(histos)/3+1)):
        print_histos = histos[(ind-1)*3:(ind-1)*3+3]
        print_histos_names = histos_names[(ind-1)*3:(ind-1)*3+3]
        printMultiFrame(w,['score']*len(print_histos),print_histos, makePlotName('dec{0}'.format(ind-1),'all',type='hist',dir=self.dir,c1_g=self.c1_g,model_g=self.model_g),print_histos_names,
          dir=self.dir,model_g=self.model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions')
    # Full model
    traindata, targetdata = loadData(data_file,self.F0_dist,self.F1_dist,dir=self.dir,c1_g=self.c1_g,
      preprocessing=self.preprocessing, scaler=self.scaler)
    numtrain = traindata.shape[0]       
    size2 = traindata.shape[1] if len(traindata.shape) > 1 else 1
    outputs = [predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),traindata[targetdata==1],model_g=self.model_g,clf=self.clf),
              predict('{0}/model/{1}/{2}/{3}_F0_F1.pkl'.format(self.dir,self.model_g,self.c1_g,self.model_file),traindata[targetdata==0],model_g=self.model_g,clf=self.clf)]
    #outputs = [predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),traindata[targetdata==1],model_g=self.model_g),
    #          predict('/afs/cern.ch/work/j/jpavezse/private/{0}_F0_F1.pkl'.format(self.model_file),traindata[targetdata==0],model_g=self.model_g)]

    saveHistos(w,outputs,s_full, bins_full, low_full, high_full,importance_sampling=False)
    if self.verbose_printing == True:
      printFrame(w,['scoref'],[w.function('sighistpdf_F0_F1'),w.function('bkghistpdf_F0_F1')], makePlotName('full','all',type='hist',dir=self.dir,c1_g=self.c1_g,model_g=self.model_g),['signal','bkg'],
    dir=self.dir,model_g=self.model_g,y_text='score(x)',print_pdf=True,title='Pairwise score distributions')
   
    #w.writeToFile('{0}/{1}'.format('/afs/cern.ch/work/j/jpavezse/private',self.workspace))
    w.writeToFile('{0}/{1}'.format(self.dir,self.workspace))
    
    w.Print()
# Ejemplo n.º 8 (example separator left over from extraction; commented out so the file parses)
# 0
def makeModelPrivateND(vars_g,c0, c1, n_private=3, coeffs=coeffs_g,cov_l=cov_g, mu_l=mu_g, 
    workspace='workspace_DecomposingTestOfMixtureModelsClassifiers.root', 
    dir='/afs/cern.ch/user/j/jpavezse/systematics',model_g='mlp',
    c1_g='',verbose_printing=False,load_cov=False):
  '''
  RooFit statistical model for the data: each of the three hypotheses
  f0,f1,f2 is itself a mixture of n_private multivariate Gaussians
  ("private" components), and F0/F1 are mixtures of f0,f1,f2.

  vars_g    -- observable names (each declared with range [-25,30])
  c0, c1    -- first two mixture coefficients of F0 and F1
  n_private -- number of private Gaussian components per hypothesis
  coeffs    -- per-hypothesis private mixture coefficients
  cov_l, mu_l -- base covariances and mean vectors per hypothesis
  load_cov  -- if True, load covariances from disk instead of sampling
  Returns the RooWorkspace (also written to '{dir}/{workspace}').
  '''  
  # Statistical model
  w = ROOT.RooWorkspace('w')

  print 'Generating initial distributions'
  # lists keep the ROOT objects alive while the pdfs reference them
  cov_m = []
  mu_m = []
  mu_str = []
  cov_root = []
  vec = []
  argus = ROOT.RooArgList()     

  # features
  for i,var in enumerate(vars_g):
    w.factory('{0}[{1},{2}]'.format(var,-25,30))
    argus.add(w.var(var))
  n = len(cov_l[0][0])
  for glob in range(3):
    for priv in range(n_private):
      if load_cov == False: 
        # random symmetric matrix made diagonally dominant (hence
        # positive definite) by adding n on the diagonal
        cov_i = np.random.random((n,n))
        cov_i = cov_i + cov_i.transpose() 
        cov_i = cov_i + n*np.eye(n)
        # NOTE(review): saved under '{dir}/...' but loaded (else branch)
        # from '{dir}/data/...' -- confirm the files are moved externally.
        np.savetxt('{0}/covariance_{1}_{2}.txt'.format(dir,glob,priv),
            cov_i,fmt='%f')
      else:
        cov_i = np.matrix(np.loadtxt('{0}/data/covariance_{1}_{2}.txt'.format(
                    dir,glob,priv)))
      print cov_i
      # generate covariance matrix (symmetrized into the ROOT object)
      cov_m.append(cov_i)
      cov_root.append(ROOT.TMatrixDSym(len(vars_g)))
      for i,var1 in enumerate(vars_g):
        for j,var2 in enumerate(vars_g):
          if i <= j:
            cov_root[-1][i][j] = cov_m[-1][i,j]
          else:
            cov_root[-1][i][j] = cov_m[-1][j,i]
      # NOTE(review): the flat index 'glob*3 + priv' assumes
      # n_private == 3; other values would collide/misorder -- confirm.
      getattr(w,'import')(cov_root[-1],'cov{0}'.format(glob*3 + priv))
      # generate mu vectors, shifted by the module-level 'meansum' offsets
      mu_m.append(np.array(mu_l[glob]) + meansum[glob][priv])
      vec.append(ROOT.TVectorD(len(vars_g)))
      for i, mu in enumerate(mu_m[-1]):
        vec[-1][i] = mu
      mu_str.append(','.join([str(mu) for mu in mu_m[-1]]))
      # create multivariate gaussian
      gaussian = ROOT.RooMultiVarGaussian('f{0}_{1}'.format(glob,priv),
            'f{0}_{1}'.format(glob,priv),argus,vec[-1],cov_root[-1])
      getattr(w,'import')(gaussian)
    # create private mixture model
    priv_coeffs = np.array(coeffs[glob])
    #print 'priv coef {0} {1}'.format(priv_coeffs, priv_coeffs.sum())
    sum_str = ','.join(['c_{0}_{1}[{2}]*f{0}_{1}'.format(glob,j,priv_coeffs[j]) for j in range(n_private)])
    w.factory('SUM::f{0}({1})'.format(glob,sum_str))
  #mixture model  
  w.factory("SUM::F0(c00[{0}]*f0,c01[{1}]*f1,f2)".format(c0[0],c0[1]))
  w.factory("SUM::F1(c10[{0}]*f0,c11[{1}]*f1,f2)".format(c1[0],c1[1]))
  
  # Check Model
  w.Print()

  w.writeToFile('{0}/{1}'.format(dir,workspace))
  if verbose_printing == True:
    printFrame(w,['x0','x1','x2'],[w.pdf('f0'),w.pdf('f1'),w.pdf('f2')],'decomposed_model',['f0','f1','f2']
    ,dir=dir,model_g=model_g,range=[-15,20],title='Single distributions',x_text='x0',y_text='p(x)')
    printFrame(w,['x0','x1','x2'],[w.pdf('F0'),w.pdf('F1')],'full_model',['Bkg','Bkg+Signal'],
    dir=dir,model_g=model_g,range=[-15,20],title='Composed model',x_text='x0',y_text='p(x)')
    printFrame(w,['x0','x1','x2'],[w.pdf('F1'),'f0'],'full_signal', ['Bkg','Signal'],
    dir=dir,model_g=model_g,range=[-15,20],title='Background and signal',x_text='x0',y_text='p(x)')

  return w