Beispiel #1
0
  def train(self, n_epochs, n_batch):
    """Fit the model for ``n_epochs`` epochs, evaluating after each one.

    On the first call (while ``self.prepped`` is false) the training weights
    of label-1 events are rescaled in place so that their sum equals the sum
    of the label-0 weights, and ``self.prepped`` is set.

    After every epoch the method refreshes the predictions through
    ``self.predict()``, computes weighted ROC curves and AUC values (with
    uncertainties from ``utils.auc_and_unc``) for the training and validation
    sets, appends them to ``self.tpr`` / ``self.fpr`` / ``self.auc`` /
    ``self.auc_unc``, and writes the model weights and architecture under
    ``dnn_weights/``.

    Args:
      n_epochs: number of epochs to run; must be at least 1.
      n_batch: accepted for interface compatibility but not used here; the
        batch size handed to ``fit`` is ``self.batch_size_train``.

    Returns:
      ``(auc_train, auc, rocs)`` for the last epoch, where ``rocs`` is a dict
      holding the fpr/tpr/threshold arrays of the training and validation
      ROC curves.

    Raises:
      ValueError: if ``n_epochs`` is smaller than 1.
    """
    # Without at least one epoch none of the values returned below would ever
    # be bound; fail early (and before touching the weights) with a clear error.
    if n_epochs < 1:
      raise ValueError("n_epochs must be at least 1, got %r" % (n_epochs,))

    if not self.prepped:
      sum_neg_weights = utils.sum_of_weights_v2(self.features_train.weights, self.features_train.label, 0)
      sum_pos_weights = utils.sum_of_weights_v2(self.features_train.weights, self.features_train.label, 1)
      print(("Sum of weights before scaling: ", sum_pos_weights, sum_neg_weights))

      # Scale the label-1 weights in place so both classes carry the same total weight.
      positive_events = numpy.where(self.features_train.label == 1)
      self.features_train.weights[positive_events] *= sum_neg_weights / sum_pos_weights
      self.prepped = True

      sum_neg_weights = utils.sum_of_weights_v2(self.features_train.weights, self.features_train.label, 0)
      sum_pos_weights = utils.sum_of_weights_v2(self.features_train.weights, self.features_train.label, 1)
      print(("Sum of weights after scaling: ", sum_pos_weights, sum_neg_weights))
      print(("Sum of weights in validation set ", utils.sum_of_weights_v2(self.features_validation.weights, self.features_validation.label, 1), utils.sum_of_weights_v2(self.features_validation.weights, self.features_validation.label, 0)))

    for i in range(n_epochs):
      # One epoch per fit() call so that metrics can be recorded after each epoch.
      self.model.fit(self.features_train.features, self.features_train.label, epochs = 1, batch_size = self.batch_size_train, sample_weight = self.features_train.weights, callbacks = self.callbacks)
      self.n_epochs += 1
      self.predict()

      fpr_train, tpr_train, thresh_train = metrics.roc_curve(self.features_train.label, self.predictions["train"], pos_label = 1, sample_weight = self.features_train.weights)
      fpr_validation, tpr_validation, thresh_validation = metrics.roc_curve(self.features_validation.label, self.predictions["validation"], pos_label = 1, sample_weight = self.features_validation.weights)

      # Only the first two of the five values returned by auc_and_unc are used here.
      auc, auc_unc, _, _, _ = utils.auc_and_unc(self.features_validation.label, self.predictions["validation"], self.features_validation.weights, self.n_bootstrap)
      auc_train, auc_unc_train, _, _, _ = utils.auc_and_unc(self.features_train.label, self.predictions["train"], self.features_train.weights, self.n_bootstrap)

      print(("Test   AUC: %.4f +/- %.4f" % (auc, auc_unc)))
      print(("Train  AUC: %.4f +/- %.4f" % (auc_train, auc_unc_train)))

      self.tpr["validation"].append(tpr_validation)
      self.tpr["train"].append(tpr_train)
      self.fpr["validation"].append(fpr_validation)
      self.fpr["train"].append(fpr_train)

      self.auc["validation"].append(auc)
      self.auc_unc["validation"].append(auc_unc)
      self.auc["train"].append(auc_train)
      self.auc_unc["train"].append(auc_unc_train)

      # NOTE(review): the file index is the loop counter, which restarts at 0
      # on every call, so repeated calls overwrite the files of earlier calls.
      self.model.save_weights("dnn_weights/" + self.tag + "_weights_%d.hdf5" % i)
      with open("dnn_weights/" + self.tag + "_model_architecture_%d.json" % i, "w") as f_out:
        f_out.write(self.model.to_json())

    rocs = {
      "fpr_train" : fpr_train,
      "tpr_train" : tpr_train,
      "thresh_train" : thresh_train,
      "fpr_validation" : fpr_validation,
      "tpr_validation" : tpr_validation,
      "thresh_validation" : thresh_validation,
    }
    return auc_train, auc, rocs
Beispiel #2
0
    selection=
    "train_id == 1 && ((process_id == 0 && signal_mass_label == 0) || (process_id == 11 || process_id == 12) || (process_id >= 22))"
)
# Read the events selected by process_id == 10 with the same feature branches
# (presumably the data events, going by the variable name -- TODO confirm).
data_events = root_numpy.tree2array(tree,
                                    branches=feature_names,
                                    selection="process_id == 10")

# Columns of the validation events used for the comparison below.
y_test = validation_events["sample_id"]
pred_bdt = validation_events["mva_score"]
pred_dnn = validation_events["fcnc_vs_smhiggs_dnn"]
weights_test = validation_events["weight"]
tth_mva = validation_events["tth_runII_mva"]

# Total weight of the sample_id == 0 and sample_id == 1 events.  The call form
# of print with a single argument behaves the same on Python 2 and Python 3.
print(numpy.sum(weights_test[numpy.where(y_test == 0)]))
print(numpy.sum(weights_test[numpy.where(y_test == 1)]))

# AUC (plus uncertainty, fpr, tpr, thresholds) of the two scores on the same events.
auc_bdt, unc_bdt, fpr_bdt, tpr_bdt, thresh_bdt = utils.auc_and_unc(
    y_test, pred_bdt, weights_test, 25)
auc_dnn, unc_dnn, fpr_dnn, tpr_dnn, thresh_dnn = utils.auc_and_unc(
    y_test, pred_dnn, weights_test, 25)

print(auc_bdt)
print(auc_dnn)

#reference = numpy.load(args.reference)

#for i in range(len(data_events["mass"])):
#    for j in range(len(reference["mass_data"])):
#        if data_events["mass"][i] == reference["mass_data"][j]:
#            print "DNN score from tree: %.4f, from ref: %.4f" % (data_events["fcnc_vs_smhiggs_dnn"][i], reference["scores_data"][j])
Beispiel #3
0
def train_bdt(config):
    """Train an XGBoost BDT for one set of hyperparameters and score it.

    The input HDF5 file is ``config["input_file_2"]`` when
    ``config["invert_test_and_train"]`` is true and ``config["input_file_1"]``
    otherwise.  The chosen path is written back to ``config["input_file"]``
    (the caller's dict is modified).  The file is kept open for the duration
    of the training and always closed afterwards.

    Keys read from ``config``: ``invert_test_and_train``, ``input_file_1``,
    ``input_file_2``, ``sideband``, ``multiclassifier``, ``n_class`` (only in
    multiclassifier mode), ``max_depth``, ``eta``, ``subsample``,
    ``colsample_bytree``, ``min_child_weight``, ``gamma``, ``reg_alpha``,
    ``reg_lambda``, ``n_round``, ``tag`` and ``optimization_vars``
    (comma-separated names, may be empty).

    Side effects: prints progress information and saves the trained model to
    ``config["tag"] + "_bdt.xgb"``.

    Returns:
        An 8-tuple ``(max_za_mc, max_za_mc_unc, max_za_data, max_za_data_unc,
        auc_train, auc_train_unc, auc_test, auc_test_unc)``.  The maximum Z_A
        is taken over working points with at least 4 signal events.  When no
        usable working point exists, the four Z_A entries are ``0.0`` and the
        AUC entries are still filled.
    """
    if config["invert_test_and_train"]:
        config["input_file"] = config["input_file_2"]
    else:
        config["input_file"] = config["input_file_1"]

    # The context manager guarantees the file is closed even if training fails.
    with h5py.File(config["input_file"], "r") as f:
        return _train_bdt_from_file(config, f)


def _train_bdt_from_file(config, f):
    """Run the training and evaluation of ``train_bdt`` on the open HDF5 file ``f``."""
    training_feature_names = utils.load_array(f, 'training_feature_names')
    print(("Training with the following features: ", training_feature_names))

    # In sideband mode the training arrays come from the "*_data_sideband" datasets.
    train_suffix = '_data_sideband' if config["sideband"] else ''
    x_train = utils.load_array(f, 'global' + train_suffix)
    y_train = utils.load_array(f, 'label' + train_suffix)
    y_train_multi = utils.load_array(f, 'multi_label' + train_suffix)
    weights_train = utils.load_array(f, 'weights' + train_suffix)

    x_test = utils.load_array(f, 'global_validation')
    y_test = utils.load_array(f, 'label_validation')
    y_test_multi = utils.load_array(f, 'multi_label_validation')
    weights_test = utils.load_array(f, 'weights_validation')
    mass_validation = utils.load_array(f, 'mass_validation')

    x_data = utils.load_array(f, 'global_data')
    label_data = utils.load_array(f, 'label_data')
    weights_data = utils.load_array(f, 'weights_data')
    mass_data = utils.load_array(f, 'mass_data')

    for array in (x_train, y_train, weights_train,
                  x_test, y_test, weights_test,
                  x_data, label_data, weights_data):
        print(array.shape)

    X_train = pandas.DataFrame(data=x_train, columns=training_feature_names)
    X_test = pandas.DataFrame(data=x_test, columns=training_feature_names)
    X_data = pandas.DataFrame(data=x_data, columns=training_feature_names)

    if config["multiclassifier"]:
        Y_train = y_train_multi
        Y_test = y_test_multi
    else:
        Y_train = y_train
        Y_test = y_test

    sum_neg_weights = utils.sum_of_weights_v2(weights_train, y_train, 0)
    sum_pos_weights = utils.sum_of_weights_v2(weights_train, y_train, 1)

    print((sum_pos_weights, sum_neg_weights))

    d_train = xgboost.DMatrix(X_train, label=Y_train, weight=weights_train)
    d_test = xgboost.DMatrix(X_test, label=Y_test)
    d_data = xgboost.DMatrix(X_data)

    param = {
        'max_depth': config["max_depth"],
        'eta': config["eta"],
        'subsample': config["subsample"],
        'colsample_bytree': config["colsample_bytree"],
        'min_child_weight': config["min_child_weight"],
        'gamma': config["gamma"],
        'reg_alpha': config["reg_alpha"],
        'reg_lambda': config["reg_lambda"],
        # Balance the two classes by the ratio of their summed weights.
        'scale_pos_weight': sum_neg_weights / sum_pos_weights,
        'objective': 'binary:logistic',
        'nthread': 16,
    }

    if config["multiclassifier"]:
        param["num_class"] = config["n_class"]
        param["objective"] = "multi:softprob"
        param["scale_pos_weight"] = 1

    evallist = [(d_train, 'train'), (d_test, 'test')]
    progress = {}

    n_round = config["n_round"]
    print((param, n_round))

    # train
    bdt = xgboost.train(param,
                        d_train,
                        n_round,
                        evallist,
                        evals_result=progress)

    bdt.save_model(config["tag"] + "_bdt.xgb")

    # predict
    output_margin = config["multiclassifier"]
    pred_train = bdt.predict(d_train, output_margin=output_margin)
    pred_test = bdt.predict(d_test, output_margin=output_margin)
    pred_data = bdt.predict(d_data, output_margin=output_margin)

    # The other call sites of utils.auc_and_unc in this file unpack five
    # return values (auc, unc, fpr, tpr, thresholds); only the first two are
    # needed here, so slice instead of unpacking a fixed number of values.
    auc_train, auc_train_unc = utils.auc_and_unc(y_train, pred_train,
                                                 weights_train, 100)[:2]
    auc_test, auc_test_unc = utils.auc_and_unc(y_test, pred_test,
                                               weights_test, 100)[:2]

    print(("Training AUC: %.3f" % auc_train))
    print(("Testing  AUC: %.3f" % auc_test))

    # estimate z_a w/at least 4 signal events
    n_quantiles = 25
    signal_mva_scores = {
        "bdt_score": ks_test.logical_vector(pred_test, y_test, 1)
    }
    bkg_mva_scores = {
        "bdt_score": ks_test.logical_vector(pred_test, y_test, 0)
    }
    data_mva_scores = {"bdt_score": pred_data}

    optimization_vars = config["optimization_vars"].split(
        ",") if config["optimization_vars"] else []
    for var in optimization_vars:
        var_validation = utils.load_array(f, var + '_validation')
        signal_mva_scores[var] = ks_test.logical_vector(var_validation, y_test, 1)
        bkg_mva_scores[var] = ks_test.logical_vector(var_validation, y_test, 0)
        data_mva_scores[var] = utils.load_array(f, var + '_data')

    signal_events = {
        "mass": ks_test.logical_vector(mass_validation, y_test, 1),
        "weights": ks_test.logical_vector(weights_test, y_test, 1),
        "mva_score": signal_mva_scores
    }
    bkg_events = {
        "mass": ks_test.logical_vector(mass_validation, y_test, 0),
        "weights": ks_test.logical_vector(weights_test, y_test, 0),
        "mva_score": bkg_mva_scores
    }
    data_events = {
        "mass": mass_data,
        "weights": weights_data,
        "mva_score": data_mva_scores
    }

    za, za_unc, s, b, sigma_eff = significance_utils.za_scores(
        n_quantiles, signal_events, bkg_events, False)
    za_data, za_unc_data, s_data, b_data, sigma_eff_data = significance_utils.za_scores(
        n_quantiles, signal_events, data_events, True)
    za = numpy.asarray(za)
    za_data = numpy.asarray(za_data)

    # Working points that keep at least 4 signal events.
    enough_signal_mc = numpy.asarray(s) >= 4.
    enough_signal_data = numpy.asarray(s_data) >= 4.

    # Degenerate outcomes: every Z_A is zero, or no working point has enough
    # signal (numpy.max would raise on the empty selection).  Return zeros for
    # the significances but keep the same 8-value layout as the normal return.
    if (numpy.all(za == 0) or numpy.all(za_data == 0)
            or not enough_signal_mc.any() or not enough_signal_data.any()):
        return 0.0, 0.0, 0.0, 0.0, auc_train, auc_train_unc, auc_test, auc_test_unc

    max_za_mc = numpy.max(za[enough_signal_mc])
    max_za_data = numpy.max(za_data[enough_signal_data])

    # Recover the index of each maximum to look up its uncertainty.
    max_za_mc, max_za_mc_idx = utils.find_nearest(za, max_za_mc)
    max_za_data, max_za_data_idx = utils.find_nearest(za_data, max_za_data)

    max_za_mc_unc = za_unc[max_za_mc_idx]
    max_za_data_unc = za_unc_data[max_za_data_idx]

    print(("Max Z_A on MC:   %.4f +/- %.4f" % (max_za_mc, max_za_mc_unc)))
    print(("Max Z_A on data: %.4f +/- %.4f" % (max_za_data, max_za_data_unc)))

    return max_za_mc, max_za_mc_unc, max_za_data, max_za_data_unc, auc_train, auc_train_unc, auc_test, auc_test_unc
Beispiel #4
0
  def train_with_early_stopping(self):
    """Train one epoch at a time until the validation AUC stops improving.

    After each epoch the relative improvement in (1 - AUC) with respect to the
    best validation AUC so far decides what happens next:

    * more than 1% improvement: keep the batch size and continue;
    * otherwise, while the batch size is below 10000: multiply it by 4
      (capped at 10000) and continue;
    * otherwise, any positive improvement: continue;
    * otherwise: stop.

    Training also stops once ``self.n_epochs`` reaches ``self.max_epochs``.
    When ``self.curriculum_learn`` is set, after every epoch the training set
    is reduced to the events whose prediction exceeds the threshold found
    nearest to a training TPR of 0.90, and ``self.prepped`` is reset.

    When training ends, ``self.auc_unc["validation"]`` and
    ``self.auc_unc["train"]`` are replaced by single values computed with
    ``utils.auc_and_unc`` (last argument 50), and the final model weights and
    architecture are written under ``dnn_weights/``.
    """
    best_auc = 0.5
    max_batch_size = 10000
    epochs = 1

    keep_training = True
    while keep_training:
      auc_train, auc, rocs = self.train(epochs, self.batch_size_train)
      improvement = ((1-best_auc)-(1-auc))/(1-best_auc)

      if improvement > 0.01:
        print(("Improvement in (1-AUC) of %.3f percent! Keeping batch size the same" % (improvement*100.)))
        best_auc = auc
      elif self.batch_size_train < max_batch_size:
        print(("Improvement in (1-AUC) of %.3f percent. Increasing batch size" % (improvement*100.)))
        # Grow by a factor of 4, but never beyond the maximum batch size.
        self.batch_size_train = min(self.batch_size_train * 4, max_batch_size)
        if auc > best_auc:
          best_auc = auc
      elif improvement > 0:
        print(("Improvement in (1-AUC) of %.3f percent. Can't increase batch size anymore" % (improvement*100.)))
        best_auc = auc
      else:
        print("AUC did not improve and we can't increase batch size anymore. Stopping training.")
        keep_training = False

      if self.n_epochs >= self.max_epochs:
        # Report the actual epoch count rather than a hard-coded number.
        print("Have already trained for %d epochs. Stopping training." % self.n_epochs)
        keep_training = False

      if self.curriculum_learn:
        # Keep only the training events scoring above the threshold closest to TPR = 0.90.
        _, idx = utils.find_nearest(rocs["tpr_train"], 0.90)
        cut = rocs["thresh_train"][idx]
        good_indices = numpy.where(self.predictions["train"] > cut)
        self.features_train.features[0] = self.features_train.features[0][good_indices]
        self.features_train.features[1] = self.features_train.features[1][good_indices]
        self.features_train.global_features = self.features_train.global_features[good_indices]
        self.features_train.objects = self.features_train.objects[good_indices]
        self.features_train.label = self.features_train.label[good_indices]
        self.features_train.weights = self.features_train.weights[good_indices]
        # Force train() to rescale the weights of the reduced training set.
        self.prepped = False

    # Final AUC uncertainties; these replace the per-epoch lists with single values.
    _, auc_unc, _, _, _ = utils.auc_and_unc(self.features_validation.label, self.predictions["validation"], self.features_validation.weights, 50)
    _, auc_unc_train, _, _, _ = utils.auc_and_unc(self.features_train.label, self.predictions["train"], self.features_train.weights, 50)
    self.auc_unc["validation"] = auc_unc
    self.auc_unc["train"] = auc_unc_train

    self.model.save_weights("dnn_weights/" + self.tag + "_weights.hdf5")
    with open("dnn_weights/" + self.tag + "_model_architecture.json", "w") as f_out:
      f_out.write(self.model.to_json())

    return
Beispiel #5
0
def train_bdt(config, invert=False):
    results = {}

    args = config["args"]
    ### Read features ###
    if not invert:
        f = h5py.File(args.input.replace(".hdf5", "") + ".hdf5", "r")
    else:
        f = h5py.File(args.input_invert.replace(".hdf5", "") + ".hdf5", "r")

    feature_names = utils.load_array(f, 'feature_names')
    training_feature_names = utils.load_array(f, 'training_feature_names')

    print(training_feature_names)

    global_features = utils.load_array(f, 'global')
    global_dnn_features = utils.load_array(f, 'global_dnn')
    label = utils.load_array(f, 'label')
    multi_label = utils.load_array(f, 'multi_label')
    weights = utils.load_array(f, 'weights')
    mass = utils.load_array(f, 'mass')
    njets = utils.load_array(f, 'njets')
    lead_sigmaEtoE = utils.load_array(f, 'lead_sigmaEtoE')
    sublead_sigmaEtoE = utils.load_array(f, 'sublead_sigmaEtoE')
    signal_mass_label = utils.load_array(f, 'signal_mass_label')
    signal_mass_category = utils.load_array(f, 'signal_mass_category')
    tth_2017_reference_mva = utils.load_array(f, 'tth_2017_reference_mva')
    evt = utils.load_array(f, 'evt')
    run = utils.load_array(f, 'run')
    lumi = utils.load_array(f, 'lumi')
    process_id = utils.load_array(f, 'process_id')
    year = utils.load_array(f, 'year')
    #objects = utils.load_array(f, 'objects')
    tth_runII_mva = utils.load_array(f, 'tth_runII_mva')

    if args.sideband:
        global_features = utils.load_array(f, 'global_data_sideband')
        label = utils.load_array(f, 'label_data_sideband')
        multi_label = utils.load_array(f, 'multi_label_data_sideband')
        weights = utils.load_array(f, 'weights_data_sideband')
        mass = utils.load_array(f, 'mass_data_sideband')
        #lead_sigmaEtoE = utils.load_array(f, 'lead_sigmaEtoE_data_sideband')
        #sublead_sigmaEtoE = utils.load_array(f, 'sublead_sigmaEtoE_data_sideband')

    global_features_validation = utils.load_array(f, 'global_validation')
    global_dnn_features_validation = utils.load_array(f,
                                                      'global_dnn_validation')
    label_validation = utils.load_array(f, 'label_validation')
    multi_label_validation = utils.load_array(f, 'multi_label_validation')
    weights_validation = utils.load_array(f, 'weights_validation')
    mass_validation = utils.load_array(f, 'mass_validation')
    njets_validation = utils.load_array(f, 'njets_validation')
    signal_mass_label_validation = utils.load_array(
        f, 'signal_mass_label_validation')
    signal_mass_category_validation = utils.load_array(
        f, 'signal_mass_category_validation')
    tth_2017_reference_mva_validation = utils.load_array(
        f, 'tth_2017_reference_mva_validation')
    evt_validation = utils.load_array(f, 'evt_validation')
    run_validation = utils.load_array(f, 'run_validation')
    lumi_validation = utils.load_array(f, 'lumi_validation')
    process_id_validation = utils.load_array(f, 'process_id_validation')
    year_validation = utils.load_array(f, 'year_validation')
    #objects_validation = utils.load_array(f, 'objects_validation')
    tth_runII_mva_validation = utils.load_array(f, 'tth_runII_mva_validation')

    global_features_data = utils.load_array(f, 'global_data')
    global_dnn_features_data = utils.load_array(f, 'global_dnn_data')
    label_data = utils.load_array(f, 'label_data')
    multi_label_data = utils.load_array(f, 'multi_label_data')
    weights_data = utils.load_array(f, 'weights_data')
    mass_data = utils.load_array(f, 'mass_data')
    njets_data = utils.load_array(f, 'njets_data')
    signal_mass_label_data = utils.load_array(f, 'signal_mass_label_data')
    signal_mass_category_data = utils.load_array(f,
                                                 'signal_mass_category_data')
    tth_2017_reference_mva_data = utils.load_array(
        f, 'tth_2017_reference_mva_data')
    evt_data = utils.load_array(f, 'evt_data')
    run_data = utils.load_array(f, 'run_data')
    lumi_data = utils.load_array(f, 'lumi_data')
    process_id_data = utils.load_array(f, 'process_id_data')
    year_data = utils.load_array(f, 'year_data')
    #objects_data = utils.load_array(f, 'objects_data')
    tth_runII_mva_data = utils.load_array(f, 'tth_runII_mva_data')

    global_features_final_fit = utils.load_array(f, 'global_final_fit')
    global_dnn_features_final_fit = utils.load_array(f, 'global_dnn_final_fit')
    label_final_fit = utils.load_array(f, 'label_final_fit')
    multi_label_final_fit = utils.load_array(f, 'multi_label_final_fit')
    weights_final_fit = utils.load_array(f, 'weights_final_fit')
    mass_final_fit = utils.load_array(f, 'mass_final_fit')
    njets_final_fit = utils.load_array(f, 'njets_final_fit')
    signal_mass_label_final_fit = utils.load_array(
        f, 'signal_mass_label_final_fit')
    signal_mass_category_final_fit = utils.load_array(
        f, 'signal_mass_category_final_fit')
    tth_2017_reference_mva_final_fit = utils.load_array(
        f, 'tth_2017_reference_mva_final_fit')
    evt_final_fit = utils.load_array(f, 'evt_final_fit')
    run_final_fit = utils.load_array(f, 'run_final_fit')
    lumi_final_fit = utils.load_array(f, 'lumi_final_fit')
    process_id_final_fit = utils.load_array(f, 'process_id_final_fit')
    year_final_fit = utils.load_array(f, 'year_final_fit')
    #objects_final_fit = utils.load_array(f, 'objects_final_fit')
    tth_runII_mva_final_fit = utils.load_array(f, 'tth_runII_mva_final_fit')

    print global_dnn_features.shape, global_dnn_features_validation.shape, global_dnn_features_data.shape, global_dnn_features_final_fit.shape

    num_multi_class = 3  #len(numpy.unique(multi_label, return_index = True))

    train_frac = 1.0  # use this fraction of data for training, use 1-train_frac for testing
    nTrain = int(len(label) * train_frac)

    print((global_features.shape))
    print((label.shape))
    print((weights.shape))

    print((global_features_validation.shape))
    print((label_validation.shape))
    print((weights_validation.shape))

    print((global_features_data.shape))
    print((label_data.shape))
    print((weights_data.shape))

    x_train, y_train, y_train_multi, weights_train = global_features, label, multi_label, weights
    x_test, y_test, y_test_multi, weights_test = global_features_validation, label_validation, multi_label_validation, weights_validation

    X_train = pandas.DataFrame(data=x_train, columns=training_feature_names)
    X_test = pandas.DataFrame(data=x_test, columns=training_feature_names)
    X_data = pandas.DataFrame(data=global_features_data,
                              columns=training_feature_names)
    X_final_fit = pandas.DataFrame(data=global_features_final_fit,
                                   columns=training_feature_names)

    if args.multi:
        Y_train = y_train_multi
        Y_test = y_test_multi
    else:
        Y_train = y_train
        Y_test = y_test

    #unique, count =  numpy.unique(multi_label,return_counts=True)
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 0, 1/(count[0]/float(sum(count))), 1) )
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 1, 1/(count[1]/float(sum(count))), 1) )
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 2, 1/(count[2]/float(sum(count))), 1) )
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 3, 1/(count[3]/float(sum(count))), 1) )
    #weights_train = numpy.multiply(weights_train, numpy.where(multi_label == 4, 1/(count[4]/float(sum(count))), 1) )

    sum_neg_weights = utils.sum_of_weights(weights_train, label, 0)
    sum_pos_weights = utils.sum_of_weights(weights_train, label, 1)

    print((sum_pos_weights, sum_neg_weights))

    scale_tth = False
    if scale_tth:
        for i in range(len(weights_train)):
            if multi_label[i] == 1:
                weights_train[i] *= 6.
    weights_train_modified = weights_train

    equal_weights = args.equal_weights
    if args.multi:
        if not equal_weights:
            for j in range(len(weights_train_modified)):
                if multi_label[j] == 0:
                    weights_train_modified[
                        j] *= sum_neg_weights / sum_pos_weights
        else:
            for i in range(num_multi_class):
                sum_class_weights = utils.sum_of_weights(
                    weights_train_modified, multi_label, i)
                print(("Normalizing class %d by %.6f" %
                       (i, sum_class_weights)))
                for j in range(len(weights_train_modified)):
                    if multi_label[j] == i:
                        weights_train_modified[j] *= 1. / sum_class_weights

    if args.res:
        for i in range(len(weights_train_modified)):
            if label[i] == 1:
                print((
                    weights_train_modified[i], 1 /
                    math.sqrt(lead_sigmaEtoE[i]**2 + sublead_sigmaEtoE[i]**2)))
                weights_train_modified[i] *= 1 / math.sqrt(
                    lead_sigmaEtoE[i]**2 + sublead_sigmaEtoE[i]**2)
                print((weights_train_modified[i]))

    sum_neg_weights = utils.sum_of_weights(weights_train_modified, label, 0)
    sum_pos_weights = utils.sum_of_weights(weights_train_modified, label, 1)

    print((sum_pos_weights, sum_neg_weights))

    d_train = xgboost.DMatrix(X_train,
                              label=Y_train,
                              weight=weights_train_modified)
    d_test = xgboost.DMatrix(X_test, label=Y_test)
    d_data = xgboost.DMatrix(X_data)
    d_final_fit = xgboost.DMatrix(X_final_fit)

    # Define BDT parameters
    if "kparam" not in list(config.keys()):
        param = {
            'max_depth': 4,
            'eta': 0.2,
            'objective': 'binary:logistic',
            'scale_pos_weight': sum_neg_weights / sum_pos_weights,
            'subsample': 1.0,
            'colsample_bytree': 1.0,
            'nthread': 12,
            'min_child_weight': min(
                1, (sum_neg_weights) / 100000.
            ),  # min_child_weight depends on the absolute value of the weights
        }
    else:
        param = config["kparam"]

    if args.multi:
        param["num_class"] = num_multi_class
        param["objective"] = "multi:softprob"
        param["scale_pos_weight"] = 1
        param["min_child_weight"] = 0.000001

    print(param)

    if "n_round" not in list(config.keys()):
        n_round = 300 if args.channel == "Hadronic" else 150
        #n_round = 10
        if "FCNC" in args.input:
            n_round = 150
            if args.multi:
                n_round = 150
            if "SMHiggs" in args.input and args.channel == "Hadronic":
                n_round = 500
    else:
        n_round = config["n_round"]
    evallist = [(d_train, 'train'), (d_test, 'test')]
    progress = {}

    print((param, n_round))

    # train
    bdt = xgboost.train(param,
                        d_train,
                        n_round,
                        evallist,
                        evals_result=progress)

    bdt.save_model(args.channel + "_" + args.tag + "_" + args.ext + "_bdt.xgb")
    model = bdt.get_dump()

    input_variables = []
    for name in feature_names:
        input_variables.append((name, 'F'))
    tmva_utils.convert_model(model,
                             input_variables=input_variables,
                             output_xml=args.channel + "_" + args.tag + "_" +
                             args.ext + '_bdt.xml')

    # predict
    pred_train = bdt.predict(d_train)
    pred_test = bdt.predict(d_test)
    pred_data = bdt.predict(d_data)
    pred_final_fit = bdt.predict(d_final_fit)

    if args.reference_mva != "none":
        if ".xgb" in args.reference_mva:
            ref_mva = xgboost.Booster()
            ref_mva.load_model(args.reference_mva)
            pred_ref_train = ref_mva.predict(d_train, output_margin=args.multi)
            pred_ref_test = ref_mva.predict(d_test, output_margin=args.multi)
            pred_ref_data = ref_mva.predict(d_data, output_margin=args.multi)
            pred_ref_final_fit = ref_mva.predict(d_final_fit,
                                                 output_margin=args.multi)
        elif ".json" in args.reference_mva:
            import dnn_helper
            dnn_features_train = dnn_helper.DNN_Features(
                name='train',
                global_features=global_dnn_features,
                objects=objects)
            dnn_features_validation = dnn_helper.DNN_Features(
                name='validation',
                global_features=global_dnn_features_validation,
                objects=objects_validation)
            dnn_features_data = dnn_helper.DNN_Features(
                name='data',
                global_features=global_dnn_features_data,
                objects=objects_data)
            dnn_features_final_fit = dnn_helper.DNN_Features(
                name='final_fit',
                global_features=global_dnn_features_final_fit,
                objects=objects_final_fit)
            with open(args.reference_mva, "r") as f_in:
                metadata = json.load(f_in)
            dnn = dnn_helper.DNN_Helper(
                features_validation=dnn_features_validation,
                features_train=dnn_features_train,
                features_data=dnn_features_data,
                features_final_fit=dnn_features_final_fit,
                metadata=metadata,
                weights_file="dnn_weights/" + metadata["weights"],
                train_mode=False)
            dnn.predict(debug=True)
            pred_ref_train = dnn.predictions["train"]
            pred_ref_test = dnn.predictions["validation"]
            pred_ref_data = dnn.predictions["data"]
            pred_ref_final_fit = dnn.predictions["final_fit"]

    print((pred_test.shape))

    #if args.multi:
    #  pred_train = pred_train[:,0]
    #  pred_test = pred_test[:,0]
    #  pred_data = pred_data[:,0]
    #  pred_final_fit = pred_final_fit[:,0]

    print((pred_test.shape))

    # analysis
    # ks test

    if args.multi:
        prediction_train = pred_train[:, 0]
        prediction_test = pred_test[:, 0]
    else:
        prediction_train = pred_train
        prediction_test = pred_test

    d_sig, p_value_sig, d_bkg, p_value_bkg = ks_test.ks_test(
        prediction_train, prediction_test, y_train, y_test)
    print(
        ("Results of ks-test (d-score) for signal: %.10f and background: %.10f"
         % (d_sig, d_bkg)))
    print(
        ("Results of ks-test (p-value) for signal: %.10f and background: %.10f"
         % (p_value_sig, p_value_bkg)))

    # roc curves
    fpr_train, tpr_train, thresh_train = metrics.roc_curve(
        y_train, prediction_train, pos_label=1, sample_weight=weights_train)
    fpr_test, tpr_test, thresh_test = metrics.roc_curve(
        y_test, prediction_test, pos_label=1, sample_weight=weights_test)

    y_train_2016 = ks_test.logical_vector(y_train, year, 2016)
    y_test_2016 = ks_test.logical_vector(y_test, year_validation, 2016)
    prediction_train_2016 = ks_test.logical_vector(prediction_train, year,
                                                   2016)
    prediction_test_2016 = ks_test.logical_vector(prediction_test,
                                                  year_validation, 2016)
    weights_train_2016 = ks_test.logical_vector(weights_train, year, 2016)
    weights_test_2016 = ks_test.logical_vector(weights_test, year_validation,
                                               2016)

    y_train_2017 = ks_test.logical_vector(y_train, year, 2017)
    y_test_2017 = ks_test.logical_vector(y_test, year_validation, 2017)
    prediction_train_2017 = ks_test.logical_vector(prediction_train, year,
                                                   2017)
    prediction_test_2017 = ks_test.logical_vector(prediction_test,
                                                  year_validation, 2017)
    weights_train_2017 = ks_test.logical_vector(weights_train, year, 2017)
    weights_test_2017 = ks_test.logical_vector(weights_test, year_validation,
                                               2017)

    y_train_2018 = ks_test.logical_vector(y_train, year, 2018)
    y_test_2018 = ks_test.logical_vector(y_test, year_validation, 2018)
    prediction_train_2018 = ks_test.logical_vector(prediction_train, year,
                                                   2018)
    prediction_test_2018 = ks_test.logical_vector(prediction_test,
                                                  year_validation, 2018)
    weights_train_2018 = ks_test.logical_vector(weights_train, year, 2018)
    weights_test_2018 = ks_test.logical_vector(weights_test, year_validation,
                                               2018)

    if len(y_train_2016) > 0:
        fpr_train_2016, tpr_train_2016, thresh_train_2016 = metrics.roc_curve(
            y_train_2016,
            prediction_train_2016,
            pos_label=1,
            sample_weight=weights_train_2016)
        fpr_test_2016, tpr_test_2016, thresh_test_2016 = metrics.roc_curve(
            y_test_2016,
            prediction_test_2016,
            pos_label=1,
            sample_weight=weights_test_2016)
        auc_2016, unc_2016, blah, blah, blah = utils.auc_and_unc(
            y_test_2016, prediction_test_2016, weights_test_2016, 25)
        print(("Testing  AUC (2016): %.3f +/- %.4f" % (auc_2016, unc_2016)))
        numpy.savez("bdt_roc_2016_%s.npz" % (args.channel + "_" + args.tag),
                    y_train=y_train_2016,
                    y_test=y_test_2016,
                    prediction_train=prediction_train_2016,
                    prediction_test=prediction_test_2016,
                    fpr_train=fpr_train_2016,
                    fpr_test=fpr_test_2016,
                    tpr_train=tpr_train_2016,
                    tpr_test=tpr_test_2016)

    if len(y_train_2017) > 0:
        fpr_train_2017, tpr_train_2017, thresh_train_2017 = metrics.roc_curve(
            y_train_2017,
            prediction_train_2017,
            pos_label=1,
            sample_weight=weights_train_2017)
        fpr_test_2017, tpr_test_2017, thresh_test_2017 = metrics.roc_curve(
            y_test_2017,
            prediction_test_2017,
            pos_label=1,
            sample_weight=weights_test_2017)
        auc_2017, unc_2017, blah, blah, blah = utils.auc_and_unc(
            y_test_2017, prediction_test_2017, weights_test_2017, 25)
        print(("Testing  AUC (2017): %.3f +/- %.4f" % (auc_2017, unc_2017)))
        numpy.savez("bdt_roc_2017_%s.npz" % (args.channel + "_" + args.tag),
                    y_train=y_train_2017,
                    y_test=y_test_2017,
                    prediction_train=prediction_train_2017,
                    prediction_test=prediction_test_2017,
                    fpr_train=fpr_train_2017,
                    fpr_test=fpr_test_2017,
                    tpr_train=tpr_train_2017,
                    tpr_test=tpr_test_2017)

    if len(y_train_2018) > 0:
        fpr_train_2018, tpr_train_2018, thresh_train_2018 = metrics.roc_curve(
            y_train_2018,
            prediction_train_2018,
            pos_label=1,
            sample_weight=weights_train_2018)
        fpr_test_2018, tpr_test_2018, thresh_test_2018 = metrics.roc_curve(
            y_test_2018,
            prediction_test_2018,
            pos_label=1,
            sample_weight=weights_test_2018)
        auc_2018, unc_2018, blah, blah, blah = utils.auc_and_unc(
            y_test_2018, prediction_test_2018, weights_test_2018, 25)
        print(("Testing  AUC (2018): %.3f +/- %.4f" % (auc_2018, unc_2018)))
        numpy.savez("bdt_roc_2018_%s.npz" % (args.channel + "_" + args.tag),
                    y_train=y_train_2018,
                    y_test=y_test_2018,
                    prediction_train=prediction_train_2018,
                    prediction_test=prediction_test_2018,
                    fpr_train=fpr_train_2018,
                    fpr_test=fpr_test_2018,
                    tpr_train=tpr_train_2018,
                    tpr_test=tpr_test_2018)

    auc_train = metrics.auc(fpr_train, tpr_train, reorder=True)
    auc_test = metrics.auc(fpr_test, tpr_test, reorder=True)

    auc, unc, blah, blah, blah = utils.auc_and_unc(y_test, prediction_test,
                                                   weights_test, 25)

    results["auc_train"] = auc_train
    results["auc_test"] = auc_test
    results["auc_test_unc"] = unc
    if "skip_tree" in list(config.keys()):
        return results

    print(("Training AUC: %.3f" % auc_train))
    print(("Testing  AUC: %.3f" % auc_test))

    print(("Testing  AUC: %.3f +/- %.4f" % (auc, unc)))

    numpy.savez("bdt_roc_%s.npz" % (args.channel + "_" + args.tag),
                y_train=y_train,
                y_test=y_test,
                prediction_train=prediction_train,
                prediction_test=prediction_test,
                fpr_train=fpr_train,
                fpr_test=fpr_test,
                tpr_train=tpr_train,
                tpr_test=tpr_test)

    # Write output to TTree
    tree_train_id = numpy.concatenate(
        (numpy.zeros(len(pred_train)), numpy.ones(len(pred_test)),
         numpy.ones(len(pred_data)), numpy.ones(len(pred_final_fit))))
    tree_sample_id = numpy.concatenate(
        (label, label_validation, label_data, numpy.ones(len(pred_final_fit))))
    tree_mass = numpy.concatenate(
        (mass, mass_validation, mass_data, mass_final_fit))
    tree_weight = numpy.concatenate(
        (weights, weights_validation, weights_data, weights_final_fit))
    tree_signal_mass_label = numpy.concatenate(
        (signal_mass_label, signal_mass_label_validation,
         signal_mass_label_data, numpy.zeros(len(pred_final_fit))))
    tree_signal_mass_category = numpy.concatenate(
        (signal_mass_category, signal_mass_category_validation,
         signal_mass_category_data, numpy.zeros(len(pred_final_fit))))
    tree_tth_2017_reference_mva = numpy.concatenate(
        (tth_2017_reference_mva, tth_2017_reference_mva_validation,
         tth_2017_reference_mva_data, tth_2017_reference_mva_final_fit))
    tree_evt = numpy.concatenate(
        (evt, evt_validation, evt_data, evt_final_fit))
    tree_tth_runII_mva = numpy.concatenate(
        (tth_runII_mva, tth_runII_mva_validation, tth_runII_mva_data,
         tth_runII_mva_final_fit))
    tree_run = numpy.concatenate(
        (run, run_validation, run_data, run_final_fit))
    tree_lumi = numpy.concatenate(
        (lumi, lumi_validation, lumi_data, lumi_final_fit))
    tree_process_id = numpy.concatenate(
        (process_id, process_id_validation, process_id_data,
         process_id_final_fit))
    tree_year = numpy.concatenate(
        (year, year_validation, year_data, year_final_fit))
    tree_global_features = numpy.concatenate(
        (global_features, global_features_validation, global_features_data,
         global_features_final_fit))

    if ".json" in args.reference_mva:
        tree_dnn_features = numpy.concatenate(
            (global_dnn_features, global_dnn_features_validation,
             global_dnn_features_data, global_dnn_features_final_fit))

    training_feature_names = [
        training_feature_names for i in range(len(label))
    ]
    training_feature_names_validation = [
        training_feature_names for i in range(len(label_validation))
    ]
    training_feature_names_data = [
        training_feature_names for i in range(len(label_data))
    ]
    training_feature_names_final_fit = [
        training_feature_names for i in range(len(label_final_fit))
    ]
    #tree_training_feature_names = numpy.concatenate((training_feature_names, training_feature_names_validation, training_feature_names_data, training_feature_names_final_fit))

    tree_train_id = tree_train_id.astype(numpy.int64)
    tree_sample_id = tree_sample_id.astype(numpy.int64)
    tree_mass = tree_mass.astype(numpy.float64)
    tree_weight = tree_weight.astype(numpy.float64)
    tree_signal_mass_label = tree_signal_mass_label.astype(numpy.int64)
    tree_signal_mass_category = tree_signal_mass_category.astype(numpy.int64)
    tree_tth_2017_reference_mva = tree_tth_2017_reference_mva.astype(
        numpy.float64)
    tree_evt = tree_evt.astype(numpy.uint64)
    tree_tth_runII_mva = tree_tth_runII_mva.astype(numpy.float64)
    tree_run = tree_run.astype(numpy.uint64)
    tree_lumi = tree_lumi.astype(numpy.uint64)
    tree_process_id = tree_process_id.astype(numpy.int64)
    tree_year = tree_year.astype(numpy.int64)
    tree_global_features = tree_global_features.astype(numpy.float64)
    if ".json" in args.reference_mva:
        tree_dnn_features = tree_dnn_features.astype(numpy.float64)


#tree_training_feature_names = tree_training_feature_names.astype(numpy.string_)

    dict = {
        "train_id": tree_train_id,
        "sample_id": tree_sample_id,
        "mass": tree_mass,
        "weight": tree_weight,
        "signal_mass_label": tree_signal_mass_label,
        "signal_mass_category": tree_signal_mass_category,
        "tth_2017_reference_mva": tree_tth_2017_reference_mva,
        "process_id": tree_process_id,
        "year": tree_year,
        "event": tree_evt,
        "lumi": tree_lumi,
        "run": tree_run,
        "global_features": tree_global_features,
        "tth_runII_mva": tree_tth_runII_mva
    }  #, "training_feature_names" : tree_training_feature_names}

    if ".json" in args.reference_mva:
        dict["dnn_global_features"] = tree_dnn_features

    if args.multi:
        tree_bdt_score = []
        for i in range(num_multi_class):
            tree_bdt_score.append(
                numpy.concatenate(
                    (pred_train[:, i], pred_test[:, i], pred_data[:, i],
                     numpy.ones(len(pred_final_fit)))))
            tree_bdt_score[i] = tree_bdt_score[i].astype(numpy.float64)
            dict["mva_score_%d" % i] = tree_bdt_score[i]

    else:
        tree_bdt_score = numpy.concatenate(
            (pred_train, pred_test, pred_data, pred_final_fit))
        tree_bdt_score = tree_bdt_score.astype(numpy.float64)
        dict["mva_score"] = tree_bdt_score

    if args.reference_mva != "none":
        tree_ref_mva_score = numpy.concatenate(
            (pred_ref_train, pred_ref_test, pred_ref_data, pred_ref_final_fit))
        tree_ref_mva_score = tree_ref_mva_score.astype(numpy.float64)
        dict[args.reference_mva_name] = tree_ref_mva_score

    tree_utils.numpy_to_tree(
        dict, "ttH%s_%s_FinalFitTree.root" % (args.channel, args.tag))

    ### Make diagnostic plots ###
    import matplotlib.pyplot as plt

    # variable importance #
    fig = plt.figure()
    xgboost.plot_importance(bdt)
    plt.tight_layout()
    plt.savefig('feature_importance_' + args.channel + '.pdf')

    # make ROC curve #
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.yaxis.set_ticks_position('both')
    ax.grid(True)
    plt.grid(color='black', linestyle='--', linewidth=0.1, which='both')

    plt.plot(fpr_train, tpr_train, color='red', label='Training Set', lw=3)
    plt.plot(fpr_test, tpr_test, color='green', label='Testing Set', lw=3)

    plt.xscale('log')

    plt.xlim([0.005, 1.0])
    plt.ylim([0.3, 1.05])
    plt.xlabel('False Positive Rate (background efficiency)')
    plt.ylabel('True Positive Rate (signal efficiency)')
    plt.legend(loc='lower right')
    plt.savefig('roc' + args.channel + '.pdf', bbox_inches='tight')

    estimate_za = True
    use_tth_runII_mva = False
    use_tth_2017_mva = False
    if estimate_za:
        n_quantiles = 30

        if args.multi:
            signal_mva_scores = {}
            bkg_mva_scores = {}
            data_mva_scores = {}
            for i in range(
                    0, num_multi_class - 1
            ):  # optimize with each of the bkg probabilities (the signal probability is redundant, i.e. sum of probabilities = 1)
                reverse = 1 if i == 0 else -1
                signal_mva_scores[
                    "bdt_score_%d" % i] = reverse * ks_test.logical_vector(
                        pred_test[:, i], y_test, 1
                    )  # factor of -1 so that we cut *below* certain values, as these are background probabilities, not signal
                bkg_mva_scores["bdt_score_%d" %
                               i] = reverse * ks_test.logical_vector(
                                   pred_test[:, i], y_test, 0)
                data_mva_scores["bdt_score_%d" % i] = reverse * pred_data[:, i]
        elif use_tth_runII_mva:
            print "Using RunII MVA from flashgg"
            signal_mva_scores = {
                "bdt_score":
                ks_test.logical_vector(tth_runII_mva_validation, y_test, 1)
            }
            bkg_mva_scores = {
                "bdt_score":
                ks_test.logical_vector(tth_runII_mva_validation, y_test, 0)
            }
            data_mva_scores = {"bdt_score": tth_runII_mva_data}
        elif use_tth_2017_mva:
            print "Using 2017 ttH PAS MVA"
            signal_mva_scores = {
                "bdt_score":
                ks_test.logical_vector(tth_2017_reference_mva_validation,
                                       y_test, 1)
            }
            bkg_mva_scores = {
                "bdt_score":
                ks_test.logical_vector(tth_2017_reference_mva_validation,
                                       y_test, 0)
            }
            data_mva_scores = {"bdt_score": tth_2017_reference_mva_data}
        else:
            print "Using the MVA we just trained"
            signal_mva_scores = {
                "bdt_score": ks_test.logical_vector(pred_test, y_test, 1)
            }
            bkg_mva_scores = {
                "bdt_score": ks_test.logical_vector(pred_test, y_test, 0)
            }
            data_mva_scores = {"bdt_score": pred_data}

        signal_mass = ks_test.logical_vector(mass_validation, y_test, 1)
        bkg_mass = ks_test.logical_vector(mass_validation, y_test, 0)

        signal_njets = ks_test.logical_vector(njets_validation, y_test, 1)
        bkg_njets = ks_test.logical_vector(njets_validation, y_test, 0)

        signal_weights = ks_test.logical_vector(weights_validation, y_test, 1)
        #if args.channel == "Leptonic" and "FCNC" in args.input:
        #  signal_weights *= 1./1.53 # to account for bug in MC sample where W->lv decays don't include taus
        bkg_weights = ks_test.logical_vector(weights_validation, y_test, 0)

        bkg_process_id = ks_test.logical_vector(process_id_validation, y_test,
                                                0)

        optimization_vars = args.optimization_vars.split(
            ",") if args.optimization_vars else []
        for var in optimization_vars:
            signal_mva_scores[var] = ks_test.logical_vector(
                utils.load_array(f, var + '_validation'), y_test, 1)
            bkg_mva_scores[var] = ks_test.logical_vector(
                utils.load_array(f, var + '_validation'), y_test, 0)
            data_mva_scores[var] = utils.load_array(f, var + '_data')

        signal_events = {
            "mass": signal_mass,
            "weights": signal_weights,
            "mva_score": signal_mva_scores
        }
        bkg_events = {
            "mass": bkg_mass,
            "weights": bkg_weights,
            "mva_score": bkg_mva_scores,
            "process_id": bkg_process_id
        }
        data_events = {
            "mass": mass_data,
            "weights": weights_data,
            "mva_score": data_mva_scores,
            "process_id": numpy.ones_like(mass_data)
        }

        # Trim these dictionaries down
        #for evts_dict in [signal_events, bkg_events, data_events]:
        #  good_indices = [index for index, value in enumerate(evts_dict["mass"]) if value < 180.]
        #  print float(len(good_indices))/float(len(evts_dict["mass"]))
        #  for key in evts_dict.iterkeys():
        #    full_array = evts_dict[key]
        #    trimmed_array = [full_array[i] for i in good_indices]
        #    evts_dict[key] = trimmed_array

        mass_shift = not (
            "FCNC" in args.input
        )  # if we're using FCNC as signal, all Higgs mass points should be 125
        # but, if we're using ttH as signal, we use M127 sample for testing, so need to shift for proper comparison with other M125 samples
        za, za_unc, s, b, sigma_eff = significance_utils.za_scores(
            n_quantiles, signal_events, bkg_events, False, {}, mass_shift)
        za_data, za_unc_data, s_data, b_data, sigma_eff_data = significance_utils.za_scores(
            n_quantiles, signal_events, data_events, True, bkg_events,
            mass_shift)
        za = numpy.asarray(za)

        max_za = numpy.max(za)
        max_za_unc = za_unc[numpy.argmax(za)]
        print((max_za, max_za_unc))

        numpy.savez("za_%s.npz" %
                    (args.channel + "_" + args.ext + "_" + args.tag),
                    za=za,
                    za_unc=za_unc,
                    signal=s,
                    bkg=b,
                    sigma_eff=sigma_eff,
                    za_data=za_data,
                    za_unc_data=za_unc_data,
                    signal_data=s_data,
                    bkg_data=b_data,
                    sigma_eff_data=sigma_eff_data)
        numpy.savez("sigma_eff.npz", sigma_eff=sigma_eff, n_sig=s)

        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt

        fig = plt.figure()
        ax1 = fig.add_subplot(111)
        ax1.plot(s, za, label='MC', color='red')
        ax1.fill_between(s,
                         numpy.asarray(za) - numpy.asarray(za_unc),
                         numpy.asarray(za) + numpy.asarray(za_unc),
                         color='red',
                         alpha=0.25)
        ax1.plot(s_data, za_data, label='Data', color='black')
        ax1.fill_between(s_data,
                         numpy.asarray(za_data) - numpy.asarray(za_unc_data),
                         numpy.asarray(za_data) + numpy.asarray(za_unc_data),
                         color='black',
                         alpha=0.25)

        plt.xlabel('# Signal Events')
        ax1.set_ylabel('Significance (Z_A)')

        plt.ylim([0.0, 3.0])
        l, r = plt.xlim()
        plt.xlim([1.0, r])

        ax1.legend(loc='upper right')
        plt.savefig('za_curve.pdf')

    return results
Beispiel #6
0
    def evaluate(self):
        """Score the training and validation sets and print summary metrics.

        Stores the model outputs in ``self.pred_train`` and
        ``self.pred_validation``, prints the d-scores and p-values returned
        by ``ks_test.ks_test`` for those two sets of scores, then calls
        ``utils.auc_and_unc`` once per set. The first two values of each
        result are stored as ``self.auc_*`` / ``self.unc_*`` and printed; the
        remaining three are stored as ``self.fpr_*``, ``self.tpr_*`` and
        ``self.thresh_*``.
        """
        # Fourth argument to utils.auc_and_unc.
        # NOTE(review): presumably the number of bootstrap resamples -- confirm.
        n_resamples = 50

        # Model outputs for both samples.
        self.pred_train = self.bdt.predict(self.d_train)
        self.pred_validation = self.bdt.predict(self.d_validation)

        # KS comparison of training vs. validation scores.
        ks_results = ks_test.ks_test(self.pred_train, self.pred_validation,
                                     self.Y_train, self.Y_validation)
        d_sig, p_value_sig, d_bkg, p_value_bkg = ks_results

        d_score_msg = (
            "Results of ks-test (d-score) for signal: %.10f and background: %.10f"
            % (d_sig, d_bkg))
        print(d_score_msg)
        p_value_msg = (
            "Results of ks-test (p-value) for signal: %.10f and background: %.10f"
            % (p_value_sig, p_value_bkg))
        print(p_value_msg)

        # AUC, its uncertainty and the associated curve for the training set.
        train_roc = utils.auc_and_unc(self.Y_train, self.pred_train,
                                      self.weights_train, n_resamples)
        (self.auc_train, self.unc_train, self.fpr_train, self.tpr_train,
         self.thresh_train) = train_roc

        # Same quantities for the validation set.
        validation_roc = utils.auc_and_unc(self.Y_validation,
                                           self.pred_validation,
                                           self.weights_validation,
                                           n_resamples)
        (self.auc_validation, self.unc_validation, self.fpr_validation,
         self.tpr_validation, self.thresh_validation) = validation_roc

        train_summary = "Training AUC: %.3f +/- %.4f" % (self.auc_train,
                                                         self.unc_train)
        print(train_summary)
        validation_summary = "Testing  AUC: %.3f +/- %.4f" % (
            self.auc_validation, self.unc_validation)
        print(validation_summary)