Example #1
def add_branch_to_rootfile(filename, treename, new_branch, branch_name,
                           overwrite=True):
    """Add a branch to a given ROOT-Tree.

    Add some data (*new_branch*) to the ROOT-file (*filename*) into its tree
    (*treename*) under the branch (*branch_name*)

    Parameters
    ----------
    filename : str
        The name of the file (and its path)
    treename : str
        The name of the tree to save the data in
    new_branch : array-like
        The data to add to the root-file
    branch_name : str
        The name of the branch the data will be written to. This can either be
        a new one or an already existing one, which will then be overwritten.
        No "friend" will be created.
    overwrite : boolean
        NOT IMPLEMENTED!
    """
    from raredecay.tools import data_tools
    from raredecay.globals_ import out

    root_data = {'filenames': filename, 'treename': treename}
    status = data_tools.add_to_rootfile(root_data, new_branch=new_branch,
                                        branch_name=branch_name, overwrite=overwrite)
    if status == 0:
        out.add_output(["Added succesfully", new_branch, "as", branch_name, "to",
                        filename], obj_separator=" ")
    elif status == 1:
        out.add_output(["Did not add", new_branch, "as", branch_name, "to",
                        filename, "because it already exists and overwrite is set to false"],
                       obj_separator=" ")
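A minimal usage sketch (the file name, tree name, and branch name below are hypothetical; it assumes the array has one entry per tree entry):

import numpy as np

# hypothetical per-event values to store, one per entry of the tree
bdt_response = np.random.uniform(size=10000)

add_branch_to_rootfile(filename="data/Bu2JpsiK.root",   # hypothetical path
                       treename="DecayTree",            # hypothetical tree name
                       new_branch=bdt_response,
                       branch_name="bdt_response")      # created or overwritten
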
Example #2
def reweightCV(real_data, mc_data, columns=None, n_folds=10,
               reweighter='gb', reweight_cfg=None, n_reweights=1,
               scoring=True, score_columns=None, n_folds_scoring=10, score_clf='xgb',
               mayou_score=False, extended_train_similar=False, apply_weights=True):
    """Reweight data MC/real in a K-Fold way to unbias the reweighting.

    Sophisticated reweighting-algorithms can be quite sensitive to their
    hyperparameters. Therefore, it is good to get an estimate of the
    reweighting quality by reweighting the data itself and "testing" it
    (comparing how similar the reweighted data is to the real data). In order
    to get an unbiased reweighting, a KFolding procedure is applied:

    - the reweighter is trained on (n-1)/n of the data and predicts the weights for the leftover 1/n.
    - This is done n times, resulting in unbiased weights for the mc data.

    To know how well the reweighter worked, different strategies can be used
    and are implemented; for further information, also see: |reweightingCV_quality_measure_link|

    Parameters
    ----------
    real_data : |hepds_type|
        The real data
    mc_data : |hepds_type|
        The mc data
    columns : list(str, str, str, ...)
        The branches to use for the reweighting.
    n_folds : int > 1
        Number of folds to split the data for the reweighting. Usually, the
        higher the better.
    reweighter : str {'gb', 'bins'}
        Which reweighter to use, either the Gradient Boosted reweighter or the
        (normally used) Bins reweighter (both from *hep_ml*)
    reweight_cfg : dict
        A dict containing all the keyword arguments for the configuration of
        the reweighters.
    n_reweights : int
        As the reweighting often yields different weights depending on random
        parameters like the splitting of the data, the new weights can be
        produced by taking the average of the weights over many reweighting
        runs. n_reweights is the number of reweight runs to average over.
    scoring : boolean
        If True, the data is not only reweighted with KFolding but also several
        scoring metrics are tested.

        - Data-ROC : The data (mc reweighted and real mixed) is split in
          KFolds, a classifier is then trained on the training fold and tested
          on the test-fold. This is done K times and the roc curve is
          evaluated. It is basically a good measure for how well two datasets
          can be distinguished, *but* it can be "overfitted". Too high single
          weights can lead to a roc curve significantly lower than 0.5, so it
          is only a good indication, not a sole measure of quality, for the
          reweighter hyper-parameter search.
        - mcreweighted_as_real : (n-1)/n of the data is used to train the
          reweighter and the last 1/n is then reweighted (as described
          above). We can train a classifier on the mc (not reweighted) as
          well as the real data (so a classifier which "distinguishes" between
          mc and real) and predict:

          - (not in training used) mc (not reweighted) and label it as if it
            were real data.
          - (not in training used) mc reweighted and label it as if it were
            real data.
          - (not in training used) real data and label it real.

          Then we look at the tpr (we cannot look at the ROC as we only inserted
          one class of labels; real) and therefore at "how many of the
          datapoints we inserted did the classifier predict as real?":

          The score for the real data should be the highest, the one for the
          mc not reweighted the lowest. The reweighted one should be somewhere
          in between (most probably). It is **not** the goal to maximise the
          tpr for the reweighted mc (by changing the reweighter hyper-parameters),
          as high single weights (which occur when overfitting) will increase
          the tpr drastically.
        - train_similar: Probably the most stable score to find the gb-reweighter
          hyper-parameters. The data is split into KFolds and a classifier is
          trained on the reweighted mc and the real data. Then it predicts the
          (not yet seen) real data. The more it is able to predict as real,
          the more it was able to learn from the differences between the datasets.
          This scoring cannot overfit in the same way as the one above, because a
          single high weight will cause a very bad distribution of the mc
          data and therefore the classifier will be able to predict nearly
          every real data point as real (only *one single point*, the one with
          the high weight, will be predicted as mc; the rest as real).
    score_columns : list(str, str, str,...)
        The columns to use for the scoring. They should not be the same as for
        the reweighting in order to unbias the score. It is usually a good
        idea to use the same branches as will be used for the selection
        training later on.
    n_folds_scoring : int > 1
        The number of folds to split the data into for the scoring
        described above.
    score_clf : str or dict or clf
        The classifier to use for the scoring. For an overview of what can be
        used, see :py:func:`~raredecay.analysis.ml_analysis.make_clf()`.
    mayou_score : boolean
        If True, the experimental *mayou_score* will be generated.
    extended_train_similar : boolean
        If True, an experimental score will be generated.
    apply_weights : boolean
        If True, set the new weights to the MC data in place. This changes the
        weights in the data-storage.


    Returns
    -------
    out : dict
        The output is a dictionary containing the different scores and/or the
        new weights. The keywords are:

        - *weights* : pandas Series containing the new weights
        - *mcreweighted_as_real_score* : The scores of this method in a dict
        - *train_similar* : The scores of this method in a dict
        - *roc_auc* : The ROC AUC score from the Data-ROC method
    """
    import matplotlib.pyplot as plt
    import numpy as np

    import raredecay.analysis.ml_analysis as ml_ana
    from raredecay.tools import metrics
    from raredecay.globals_ import out
    # `ml_scores` (used below) is assumed to be a module-level import in the original source

    output = {}
    # do the Kfold reweighting. This reweights the data with Kfolding and returns
    # the weights. If add_weights_to_data is True, the weights will automatically be
    # added to the mc_data (or here, reweight_mc). To get an estimate
    # whether it has over-fitted, you can get the mcreweighted_as_real_score.
    # This trains a clf on mc/real and tests it on mc, mc reweighted, real
    # but both labeled with the same target as the real data in training
    # The mc reweighted score should therefore lie in between the mc and the
    # real score.
    #    if not apply_weights:
    old_weights = mc_data.get_weights()
    # make sure the targets are set the right way TODO
    Kfold_output = ml_ana.reweight_Kfold(mc_data=mc_data, real_data=real_data,
                                         meta_cfg=reweight_cfg, columns=columns,
                                         reweighter=reweighter,
                                         n_reweights=n_reweights,
                                         mcreweighted_as_real_score=scoring,
                                         score_columns=score_columns,
                                         n_folds=n_folds, score_clf=score_clf,
                                         add_weights_to_data=True)
    new_weights = Kfold_output.pop('weights')
    # TODO: needed below?
    new_weights = new_weights.sort_index()

    if scoring:
        output['mcreweighted_as_real_score'] = Kfold_output

        # To get a good estimate of the reweighting quality, the
        # train_similar score can be used. It's the one with training on
        # mc reweighted/real and testing on real, quite robust.
        # Test_max is nice to know too, even though it can also be set to False if
        # testing the same distribution over and over again, as it is the same for
        # the same distributions (actually, it's just doing the score without the
        # weights).
        # test_predictions is an additional score I tried, but so far it is not
        # reliable or understandable at all. The output, the scores dictionary,
        # is better described in the docs of train_similar.
        scores = ml_scores.train_similar_new(mc=mc_data, real=real_data, test_max=True,
                                                             n_folds=n_folds_scoring,
                                                             n_checks=n_folds_scoring,
                                                             columns=score_columns,
                                                             old_mc_weights=old_weights,
                                                             clf=score_clf)

        # scores = metrics.train_similar(mc_data=mc_data, real_data=real_data, test_max=True,
        #                                n_folds=n_folds_scoring, n_checks=n_folds_scoring,
        #                                features=score_columns, old_mc_weights=old_weights,
        #                                test_mc=extended_train_similar,
        #                                test_shuffle=extended_train_similar,
        #                                test_predictions=False, clf=score_clf)
        out.add_output(['Mayou FoM:', scores], to_end=True)

        # We can of course also test the normal ROC curve. This is susceptible to
        # overfitting but otherwise (if not overfitting) a nice measure. You insert
        # two datasets and do the normal cross-validation on it. It's quite a
        # multi-purpose function depending on what `validation` is. If it is an
        # integer, it means: do cross-validation with n(=validation) folds.
        temp_mc_targets = mc_data.get_targets()
        mc_data.set_targets(0)
        temp_real_targets = real_data.get_targets()
        real_data.set_targets(1)
        tmp_, roc_auc_score, clf_output = ml_ana.classify(original_data=mc_data, target_data=real_data,
                                                          validation=n_folds_scoring, plot_importance=4,
                                                          plot_title="ROC AUC to distinguish data",
                                                          clf=score_clf, weights_ratio=1,
                                                          features=score_columns,
                                                          extended_report=scoring,
                                                          get_predictions=True)
        del tmp_
        # inspect how the weights of the mc events correlate with the classifier predictions
        predictions = clf_output['y_proba'][:, 1][clf_output['y_true'] == 0]
        weights_pred = clf_output['weights'][clf_output['y_true'] == 0]
        out.figure("Correlation of weights and predictions")
        plt.scatter(predictions, weights_pred)
        plt.xlabel("predictions")
        plt.ylabel("weights")
        out.figure("Correlation of weights and predictions hexbin")
        plt.hexbin(x=predictions, y=weights_pred, gridsize=150)
        #        sns.jointplot(x=predictions, y=weights_pred, kind="hex")



        if mayou_score:
            ml_scores.mayou_score(mc_data=mc_data, real_data=real_data,
                                                  n_folds=n_folds_scoring,
                                                  clf=score_clf, old_mc_weights=old_weights)
            # an example of adding output with the most important parameters. The first
            # one can also be a single object instead of a list. do_print means
            # printing it also to the console instead of only saving it to the output
            # file. To_end is sometimes quite useful, as it prints (and saves) the
            # arguments at the end of the file, so the important results are
            # printed at the end.
        out.add_output(['ROC AUC score:', roc_auc_score], importance=5,
                       title='ROC AUC of mc reweighted/real KFold', to_end=True)
        # TODO? NEW SCORES?
        #
        # out.add_output(['score:', scores['score'], "+-", scores['score_std']],
        #                importance=5,
        #                title='Train similar report', to_end=True)
        # if extended_train_similar:
        #     out.add_output(['\nScore_mc:', scores['score_mc'], "+-", scores['score_mc_std']],
        #                    importance=5,
        #                    to_end=True)
        # if scores.get('score_max', False):
        #     out.add_output(['score max:', scores['score_max'], "+-", scores['score_max_std']],
        #                    importance=5, to_end=True)
        # if scores.get('score_mc_max', False):
        #     out.add_output(['score_mc_max:', scores['score_mc_max'], "+-",
        #                     scores['score_mc_max_std']],
        #                    importance=5, to_end=True)

        if scores.get('score_shuffled', False):
            out.add_output(['score_shuffled:', scores['score_shuffled'], "+-",
                            scores['score_shuffled_std']],
                           importance=5, to_end=True)
        output['train_similar'] = scores
        output['roc_auc'] = roc_auc_score

    output['weights'] = new_weights
    if not apply_weights:
        mc_data.set_weights(old_weights)

    if scoring:
        mc_data.set_targets(temp_mc_targets)
        real_data.set_targets(temp_real_targets)

    return output
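The KFold idea described in the docstring can be sketched on its own with hep_ml's GBReweighter (the 'gb' reweighter) and scikit-learn's KFold. This is an illustrative sketch only: the folding of the real data, the weight normalisation, and the averaging over n_reweights performed by reweight_Kfold are omitted, and the configuration values are hypothetical.

import numpy as np
import pandas as pd
from hep_ml.reweight import GBReweighter
from sklearn.model_selection import KFold

def kfold_reweight_sketch(mc_df, real_df, n_folds=10):
    """Train on (n-1)/n of the MC and predict weights for the held-out 1/n, n times."""
    weights = pd.Series(np.ones(len(mc_df)), index=mc_df.index)
    for train_idx, test_idx in KFold(n_splits=n_folds, shuffle=True).split(mc_df):
        reweighter = GBReweighter(n_estimators=50)  # hypothetical configuration
        reweighter.fit(original=mc_df.iloc[train_idx], target=real_df)
        # every MC event gets its weight from a reweighter that never saw it in training
        weights.iloc[test_idx] = reweighter.predict_weights(mc_df.iloc[test_idx])
    return weights
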
Example #3
def train_similar(mc_data, real_data, features=None, n_checks=10, n_folds=10,
                  clf='xgb', test_max=True, test_shuffle=True, test_mc=False,
                  old_mc_weights=1, test_predictions=False, clf_pred='rdf'):
    """Score for reweighting. Train clf on mc reweighted/real, test on real; minimize score.

    Enter two datasets and evaluate the score described below. Return a
    dictionary containing the different scores. The test_predictions is
    another scoring, which is built upon the train_similar method.

    **Scoring method description**

    **Idea**:
    A clf is trained on the reweighted mc as well as on the real data of a
    certain decay. Therefore, the classifier learns to distinguish between
    Monte-Carlo data and real data. Then we let the classifier predict some
    real data (an unbiased test set) and see how many events it is able to
    classify as real. The lower the score, the fewer differences it was able
    to learn from the training data, therefore the more similar the training
    data are and the better the reweighting.

    **Advantages**: It is quite difficult to cheat on this method. Most of all,
    it is robust to single high-weight events (which mcreweighted_as_real is
    not) and, in general, seems to be the best scoring so far.

    **Disadvantages**: If you insert a gaussian shaped around 1.0 as mc and a
    gaussian shaped around 1.1 as real, the score will be bad (around 0.33). So
    far, this was only observed for "artificial" distributions (even though, of
    course, we do not know whether it partly affects real distributions as well).

    **Output explanation**

    The return is a dictionary containing several values. Of course, only the
    values, which are set to be evaluated, are contained. The keys are:

    - '**score**' : The average of all train_similar scores (as we use KFolding,
      there will be n_folds scores). *The* score.
    - '**score_std**' : The std of a single score, just for curiosity
    - '**score_max**' : The (average of all) "maximum" score. Actually the
      train_similar score but with mc instead of *reweighted* mc. Should be
      higher than the reweighted score.
    - '**score_max_std**' : The std of a single score, just for curiosity
    - '**score_pred**' : The score of the test_predictions method.
    - '**score_mc_pred**' : The score of the test_predictions method but on the
      predictions of the mc instead of the *reweighted* mc.

    Parameters
    ----------
    mc_data : HEPDataStorage
        The reweighted Monte-Carlo data, assuming the new weights are applied
        already.
    real_data : HEPDataStorage
        The real data
    n_checks : int >= 1
        Number of checks to perform. Has to be <= n_folds
    n_folds : int > 1
        Number of folds the data will be split into
    clf : str
        The name of a classifier to be used in
        :py:func:`~raredecay.analysis.ml_analysis.classify`.
    test_max : boolean
        If true, test for the "maximum value" by training also on mc/real
        (instead of *reweighted* mc/real)
        and test on real. The score for only mc should be higher than for
        reweighted mc/real. It *should* most probably but does not have to
        be!
    old_mc_weights : array-like or 1
        If *test_max* is True, the weights for mc before reweighting will be
        taken to be *old_mc_weights*, the weights the mc distribution had
        before the reweighting. The default is 1.
    test_predictions : boolean
        If true, try to distinguish the predictions. Advanced feature; it is
        not yet really clear how to interpret it. Gives a very high ROC somehow.
    clf_pred : str
        The classifier to be used to distinguish the predictions. Required for
        the *test_predictions*.

    Returns
    -------
    out : dict
        A dictionary containing the different scores. See the description above.

    """
    import numpy as np

    import raredecay.analysis.ml_analysis as ml_ana
    from raredecay.tools import data_storage
    from raredecay.globals_ import out
    # `similar_dist` (used at the end) is assumed to be defined in the same module as this function

    # initialize variables
    assert 1 <= n_checks <= n_folds and n_folds > 1, "wrong n_checks/n_folds. Check the docs"
    assert isinstance(mc_data, data_storage.HEPDataStorage), \
        "mc_data wrong type:" + str(type(mc_data)) + ", has to be HEPDataStorage"
    assert isinstance(real_data, data_storage.HEPDataStorage), \
        "real_data wrong type:" + str(type(real_data)) + ", has to be HEPDataStorage"
#    assert isinstance(clf, str),\
#        "clf has to be a string, the name of a valid classifier. Check the docs!"

    output = {}

    scores = np.ones(n_checks)
    scores_shuffled = np.ones(n_checks)
    scores_mc = np.ones(n_checks)
    scores_max = np.ones(n_checks)  # required due to output of loop
    scores_mc_max = np.ones(n_checks)
#    scores_weighted = []
    scores_max_weighted = []
    probas_mc = []
    probas_reweighted = []
    weights_mc = []
    weights_reweighted = []

    real_pred = []
    real_test_index = []
    real_mc_pred = []

    # initialize data
    tmp_mc_targets = mc_data.get_targets()
    mc_data.set_targets(0)
    real_data.make_folds(n_folds=n_folds)
    if test_mc:
        mc_data.make_folds(n_folds=n_folds)
    for fold in range(n_checks):
        real_train, real_test = real_data.get_fold(fold)
        if test_mc:
            mc_train, mc_test = mc_data.get_fold(fold)
            mc_test.set_targets(0)
        else:
            mc_train = mc_data.copy_storage()
        mc_train.set_targets(0)

        real_test.set_targets(1)
        real_train.set_targets(1)

        tmp_out = ml_ana.classify(mc_train, real_train, validation=real_test, clf=clf,
                                  plot_title="train on mc reweighted/real, test on real",
                                  weights_ratio=1, get_predictions=True,
                                  features=features,
                                  plot_importance=1, importance=1)
        clf_trained, scores[fold], pred_reweighted = tmp_out

        tmp_weights = mc_train.get_weights()

        if test_shuffle:
            import copy
            shuffled_weights = copy.deepcopy(tmp_weights)
            shuffled_weights = shuffled_weights.reindex(np.random.permutation(shuffled_weights.index))
            mc_train.set_weights(shuffled_weights)
            tmp_out = ml_ana.classify(mc_train, real_train, validation=real_test, clf=clf,
                                      plot_title="train on mc reweighted/real, test on real",
                                      weights_ratio=1, get_predictions=True,
                                      features=features,
                                      plot_importance=1, importance=1)
            scores_shuffled[fold] = tmp_out[1]
            mc_train.set_weights(tmp_weights)

        if test_mc:
            clf_trained, scores_mc[fold] = ml_ana.classify(validation=mc_test,
                                                           clf=clf_trained,
                                                           plot_title="train on mc reweighted/real, test on mc",
                                                           weights_ratio=1, get_predictions=False,
                                                           features=features,
                                                           plot_importance=1,
                                                           importance=1)

#        del clf_trained, tmp_pred
        probas_reweighted.append(pred_reweighted['y_proba'])
        weights_reweighted.append(pred_reweighted['weights'])

        real_pred.extend(pred_reweighted['y_pred'])
        real_test_index.extend(real_test.get_index())

        if test_max:
            temp_weights = mc_data.get_weights()
            mc_data.set_weights(old_mc_weights)
            tmp_out = ml_ana.classify(mc_data, real_train, validation=real_test,
                                      plot_title="real/mc NOT reweight trained, validate on real",
                                      weights_ratio=1, get_predictions=True, clf=clf,
                                      features=features,
                                      plot_importance=1, importance=1)
            clf_trained, scores_max[fold], pred_mc = tmp_out
            if test_mc:
                clf_trained, scores_mc_max[fold] = ml_ana.classify(validation=mc_test, clf=clf_trained,
                                                                   plot_title="train on mc NOT reweighted/real, test on mc",
                                                                   weights_ratio=1,
                                                                   get_predictions=False,
                                                                   features=features,
                                                                   plot_importance=1,
                                                                   importance=1)
            del clf_trained
# HACK
            tmp_pred = pred_mc['y_proba'][:, 1] * pred_mc['weights']
            scores_max_weighted.extend(tmp_pred * (pred_mc['y_true'] * 2 - 1))

# HACK END
            mc_data.set_weights(temp_weights)
            probas_mc.append(pred_mc['y_proba'])
            weights_mc.append(pred_mc['weights'])

            real_mc_pred.extend(pred_mc['y_pred'])

    output['score'] = np.round(scores.mean(), 4)
    output['score_std'] = np.round(scores.std(), 4)

    if test_shuffle:
        output['score_shuffled'] = np.round(scores_shuffled.mean(), 4)
        output['score_shuffled_std'] = np.round(scores_shuffled.std(), 4)

    if test_mc:
        output['score_mc'] = np.round(scores_mc.mean(), 4)
        output['score_mc_std'] = np.round(scores_mc.std(), 4)

    out.add_output(["Score train_similar (recall, lower means better): ",
                   str(output['score']) + " +- " + str(output['score_std'])],
                   subtitle="Clf trained on real/mc reweight, tested on real")
    if test_max:
        output['score_max'] = np.round(scores_max.mean(), 4)
        output['score_max_std'] = np.round(scores_max.std(), 4)
        if test_mc:
            output['score_mc_max'] = np.round(scores_mc_max.mean(), 4)
            output['score_mc_max_std'] = np.round(scores_mc_max.std(), 4)
        out.add_output(["No reweighting score: ", round(output['score_max'], 4)])

    if test_predictions:
        # test on the reweighted/real predictions
        real_data.set_targets(targets=real_pred, index=real_test_index)
        tmp_, score_pred = ml_ana.classify(real_data, target_from_data=True, clf=clf_pred,
                                           features=features,
                                           plot_title="train on predictions reweighted/real, real as target",
                                           weights_ratio=1, validation=n_checks, plot_importance=3)
        output['score_pred'] = round(score_pred, 4)

    if test_predictions and test_max:
        # test on the mc/real predictions
        real_data.set_targets(targets=real_mc_pred, index=real_test_index)
        tmp_, score_mc_pred = ml_ana.classify(real_data, target_from_data=True, clf=clf_pred,
                                              validation=n_checks,
                                              plot_title="mc not rew/real pred, real as target",
                                              weights_ratio=1, plot_importance=3)
        output['score_mc_pred'] = np.round(score_mc_pred, 4)

    mc_data.set_targets(tmp_mc_targets)

    output['similar_dist'] = similar_dist(predictions=np.concatenate(probas_reweighted)[:, 1],
                                          weights=np.concatenate(weights_reweighted))

    return output
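A minimal usage sketch (mc and real are assumed to be HEPDataStorage objects whose construction is not shown, mc is assumed to already carry the new weights, and the feature names are hypothetical):

# old_weights: the weights mc had before the reweighting (or simply 1)
scores = train_similar(mc_data=mc, real_data=real,
                       features=['B_PT', 'nTracks'],  # hypothetical branches
                       n_checks=10, n_folds=10,
                       clf='xgb', test_max=True,
                       old_mc_weights=old_weights)
print(scores['score'], '+-', scores['score_std'])  # lower means better
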
Example #4
def preselection_cut(signal_data, bkg_data, percent_sig_to_keep=100):
    """Cut the bkg while maintaining a certain percent of the signal. WIP."""

    # from raredecay import meta_config
    from raredecay.tools import data_tools
    from raredecay.globals_ import out
    # from raredecay.tools.data_storage import HEPDataStorage

    import numpy as np
    import copy

    columns = signal_data.columns
    signal_data.plot(figure="Before cut", title="Data comparison before cut")
    signal_data.plot(figure="Signal comparison",
                     title="Data comparison before cut vs after")
    bkg_data.plot(figure="Background comparison",
                  title="Data comparison before cut vs after")
    bkg_data.plot(figure="Before cut")
    bkg_length = len(bkg_data)
    signal_length = len(signal_data)
    signal_cp = signal_data.copy_storage()
    bkg_cp = bkg_data.copy_storage()
    signal_data = signal_data.pandasDF()
    bkg_data = bkg_data.pandasDF()

    applied_cuts = {}

    percent_end = percent_sig_to_keep
    percent_sig_to_keep = 100
    stepsize = 0.1
    keep = {}

    while True:

        #        pool = multiprocessing.Pool(meta_config.n_cpu_max)
        sig = np.array(
            [signal_data.as_matrix()[:, i] for i, _t in enumerate(columns)])
        sig = copy.deepcopy(sig)
        bkg = np.array(
            [bkg_data.as_matrix()[:, i] for i, _t in enumerate(columns)])
        bkg = copy.deepcopy(bkg)
        data = zip(sig, bkg, [percent_sig_to_keep] * len(columns))
        limits, rejection = [], []
        for sig, bkg, per in data:
            temp = data_tools.apply_cuts(sig, bkg, per, bkg_length=bkg_length)
            limits.append(temp[0])
            rejection.append(temp[1])
#        limits, rejection = pool.map(_cut, data)
        i_max_rej = np.argmax(rejection)
        max_rejection = np.max(rejection)
        column, limits = columns[i_max_rej], limits[i_max_rej]
        print(percent_sig_to_keep, percent_end)
        if max_rejection < 0.001 and percent_sig_to_keep == 100:
            if percent_end < 100:
                percent_sig_to_keep -= stepsize
            else:
                break
        elif percent_end <= percent_sig_to_keep < 100:
            percent_end += stepsize
            stepsize *= (100 - stepsize) / 100
        elif percent_sig_to_keep < percent_end:
            break

        if column in applied_cuts:
            max_rejection += applied_cuts[column]['reduction']
        applied_cuts[column] = {"limits": limits, "reduction": max_rejection}

        cuts = np.logical_and(signal_data[column] > limits[0],
                              signal_data[column] < limits[1])
        signal_data = signal_data[cuts]

        cuts = np.logical_and(bkg_data[column] > limits[0],
                              bkg_data[column] < limits[1])
        bkg_data = bkg_data[cuts]
        print "We used " + column


#    signal_data.hist(bins=30)
#    bkg_data.hist(bins=30)

    signal_len_cut = len(np.array(signal_data.as_matrix()[:, 0]))
    bkg_len_cut = len(np.array(bkg_data.as_matrix()[:, 0]))
    signal_cp.set_data(signal_data)
    signal_cp.plot(figure="Signal comparison")
    signal_cp.plot(figure="Data cut plt",
                   title="Data with cuts applied",
                   log_y_axes=True)

    bkg_cp.set_data(bkg_data)
    bkg_cp.plot(figure="Background comparison")
    bkg_cp.plot(figure="Data cut plt", log_y_axes=True)

    out.add_output(applied_cuts, section="Preselection cuts report")
    out.add_output(keep, section="All limits")
    bkg_rejection = sum(i['reduction'] for i in applied_cuts.values())
    out.add_output([
        "summed up Bkg rejection: ", bkg_rejection,
        "True rejection: ", 100.0 * (1 - bkg_len_cut / bkg_length),
        " True remaining signal: ", 100.0 * signal_len_cut / signal_length
    ],
                   section="Total bkg rejection")
    print(signal_len_cut)
    print(signal_length)
    print(bkg_len_cut)
    print(bkg_length)

    return applied_cuts
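A minimal usage sketch (signal and bkg are assumed to be HEPDataStorage objects; their construction is not shown):

# cut away background while keeping (approximately) 95% of the signal
applied_cuts = preselection_cut(signal_data=signal, bkg_data=bkg,
                                percent_sig_to_keep=95)
for column, info in applied_cuts.items():
    print(column, "limits:", info['limits'], "bkg reduction:", info['reduction'])
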
Example #5
def mayou_score(mc_data, real_data, features=None, old_mc_weights=1,
                clf='xgb', splits=2, n_folds=10):
    """An experimental score using a "loss" function for data-similarity"""
    import math as mt

    import numpy as np

    import raredecay.analysis.ml_analysis as ml_ana
    from raredecay.globals_ import out

    # initialize variables
    output = {}
    score_mc_vs_mcr = []
    score_mcr_vs_real = []
#    splits *= 2  # because every split is done with fold 0 and 1 (<- 2 *)

    # loop over number of splits, split the mc data

    mc_data.make_folds(n_folds)
    real_data.make_folds(n_folds)

    # mc reweighted vs mc
    for fold in range(n_folds):
        mc_data_train, mc_data_test = mc_data.get_fold(fold)
        # TODO: no real folds? It is better to test on full data always?
#        mc_data_train, mc_data_test = real_data.get_fold(fold)
        for split in range(splits * 2):  # because two possibilities per split
            if split % 2 == 0:
                mc_data_train.make_folds(2)
            mc_normal, mc_reweighted = mc_data_train.get_fold(split % 2)
            mc_normal.set_weights(old_mc_weights)
            score_mc_vs_mcr.append(ml_ana.classify(original_data=mc_normal,
                                                   target_data=mc_reweighted,
                                                   features=features,
                                                   validation=[mc_data_test, real_data],
                                                   clf=clf, plot_importance=1,
                                                   # TODO: no weights ratio? (roc auc)
                                                   weights_ratio=0
                                                   )[1])
    out.add_output(["mayou_score mc vs mc reweighted test on mc vs real score: ",
                    score_mc_vs_mcr, "\nMean: ", np.mean(score_mc_vs_mcr),
                    " +-", np.std(score_mc_vs_mcr) / mt.sqrt(len(score_mc_vs_mcr) - 1)],
                   subtitle="Mayou score", to_end=True)

    output['mc_distance'] = np.mean(score_mc_vs_mcr)

    # mc_reweighted vs real
    for fold in range(n_folds):
        real_train, real_test = real_data.get_fold(fold)
        mc_train, mc_test = mc_data.get_fold(fold)
        mc_test.set_weights(old_mc_weights)
        score_mcr_vs_real.append(ml_ana.classify(original_data=mc_train,
                                                 target_data=real_train,
                                                 features=features,
                                                 validation=[mc_test, real_test],
                                                 clf=clf, plot_importance=1,
                                                 # TODO: no weights ratio? (roc auc)
                                                 weights_ratio=0
                                                 )[1])

    out.add_output(["mayou_score real vs mc reweighted test on mc vs real score: ",
                    score_mcr_vs_real, "\nMean: ", np.mean(score_mcr_vs_real),
                    " +-", np.std(score_mcr_vs_real) / mt.sqrt(len(score_mcr_vs_real) - 1)],
                   to_end=True)

    output['real_distance'] = np.mean(score_mcr_vs_real)