Example #1
def final_training(real_data, mc_data, bkg_sel, clf='xgb', n_folds=10, columns=None,
                   performance_only=True, metric_vs_cut='punzi', weights_ratio=0,
                   save_real_pred=False, save_mc_pred=False):
    """Train on bkg and MC, test metric, performance and predict probabilities.

    The goal of an MVA is to have certain probabilities predicted for each
    event to make further cuts on the data-sample and reduce the background.

    There are two modes to run:
        - **performance_only**: train a clf K-folded on the background and the
          MC and predict, then create the ROC-curve and plot a metric. This
          is to get an idea of how well the classifier performs as well as
          to find the optimal cutoff-value on the predictions.
        - **prediction_mode**: (*set performance_only to False*) train a clf
          on the bkg and MC and predict K-folded the probabilities for all
          data (bkg, MC and the rest) without any event appearing in both the
          training-set and the test-set. If a name is given to
          *save_mc_pred* or *save_real_pred*, the predictions will
          be saved to the root-file the data was taken from.

    Parameters
    ----------
    real_data : |hepds_type|
        The real data
    mc_data : |hepds_type|
        The MC data (signal)
    bkg_sel : str or [str]
        A string pointing to a column in the root-tree which tells whether an
        event belongs to the bkg to train on (1) or not (0). Typically this is
        something like: (B_M > 5700) or similar
    clf : str or clf or dict, see :py:func:`~raredecay.analysis.ml_analysis.make_clf()`
        The classifier to be used.
    n_folds : int > 1
        The number of folds to use for the training
    columns : list(str, str, str,...)
        The columns to train on
    performance_only : boolean
        If True, the function is run in performance mode and does not predict
        but only creates a ROC-curve and a metric-vs-cut.
    metric_vs_cut : str {'punzi', 'precision'}
        The metric to test on the predictions.
    save_real_pred : str or False
        If provided, the predictions of the real data will be saved to its
        root-tree with the branch name specified here.
    save_mc_pred : str or False
        If provided, the predictions of the MC will be saved to its
        root-tree with the branch name specified here.

    Return
    ------
    out : dict
        Return a dict containing the following keys and values:

        - *best_threshold_cut*: The best cut for the given metric (only if
          metric_vs_cut is given)
        - *best_metric*: The highest value of the metric, basically
          "metric(best_threshold_cut)" (only if metric_vs_cut is given)
        - *pred_real*: The predictions of real_data
        - *pred_mc*: The predictions of mc_data

    """
    from raredecay.globals_ import out
    from raredecay.analysis.ml_analysis import classify
    from raredecay.tools.metrics import punzi_fom, precision_measure

    # Python 2/3 compatibility
    bkg_sel = dev_tool.entries_to_str(bkg_sel)
    clf = dev_tool.entries_to_str(clf)
    columns = dev_tool.entries_to_str(columns)
    metric_vs_cut = dev_tool.entries_to_str(metric_vs_cut)
    save_mc_pred = dev_tool.entries_to_str(save_mc_pred)
    save_real_pred = dev_tool.entries_to_str(save_real_pred)

    # check if predictions can be saved: need to be root-file and no selection applied
    if performance_only:
        save_real_pred = save_mc_pred = False

    output = {}
    if save_real_pred:
        if real_data.data_type != 'root':
            raise TypeError("Real predictions should be saved but data is not a root-file but " +
                            real_data.data_type)
        elif real_data.data['selection'] is not None:
            raise ValueError("Real pred set to be saved, but has selection " +
                             real_data.data['selection'] + " applied")
    if save_mc_pred:
        if mc_data.data_type != 'root':
            raise TypeError("MC predictions should be saved but data is not a root-file but " +
                            mc_data.data_type)
        elif mc_data.data['selection'] is not None:
            raise ValueError("MC pred set to be saved, but has selection " +
                             mc_data.data['selection'] + " applied")

    # backwards compatibility: bkg_sel may be given as a single string or a list
    bkg_sel = [bkg_sel] if not isinstance(bkg_sel, (list, tuple)) else bkg_sel
    if bkg_sel[0].startswith('noexpand:'):
        bkg_sel = bkg_sel[0][9:]
    else:
        bkg_sel = bkg_sel[0]

    pred_real = []
    pred_mc = []

    predict = not performance_only
    if performance_only:
        bkg_df = real_data.pandasDF()
        bkg_df = bkg_df.loc[np.array(bkg_df[bkg_sel].T) == 1]
        bkg_data = real_data.copy_storage()
        bkg_data.set_data(bkg_df)
        del bkg_df
        _clf, kfold_score, pred_tmp = classify(bkg_data, mc_data, validation=n_folds, clf=clf,
                                               get_predictions=True, extended_report=True,
                                               features=columns, weights_ratio=weights_ratio)

        report = pred_tmp['report']

        if metric_vs_cut == 'punzi':
            metric = punzi_fom
            title = "Punzi FoM vs threshold cut on " + real_data.name
        elif metric_vs_cut == 'precision':
            metric = precision_measure
            metric.__name__ = r"precision $\frac {n_{signal}} {\sqrt{n_{signal} + n_{background}}}$"
            title = "Precision vs threshold cut on " + real_data.name
        elif metric_vs_cut:
            raise ValueError("Invalid metric: " + str(metric_vs_cut))

        if metric_vs_cut:
            out.figure(title)
            # plt.legend()
            from rep.report.metrics import OptimalMetric
            metric_optimal = OptimalMetric(metric,
                                           expected_s=sum(mc_data.get_weights(normalize=False)),
                                           expected_b=sum(bkg_data.get_weights(normalize=False))
                                           )
            metric_optimal.plot_vs_cut(y_true=pred_tmp['y_true'],
                                       proba=pred_tmp['y_proba'],
                                       sample_weight=pred_tmp['weights']).plot(fontsize=25)
            best_cut, best_metric = metric_optimal.compute(y_true=pred_tmp['y_true'],
                                                           proba=pred_tmp['y_proba'],
                                                           sample_weight=pred_tmp['weights'])
            best_index = np.argmax(best_metric)
            output = {'best_threshold_cut': best_cut[best_index],
                      'best_metric': best_metric[best_index]
                      }

    # predict to all data
    if predict:

        # make folds and loop through
        real_data.make_folds(n_folds=n_folds)
        mc_data.make_folds(n_folds=n_folds)
        for i in range(n_folds):
            real_train, real_test = real_data.get_fold(i)
            mc_train, mc_test = mc_data.get_fold(i)
            real_train.data_name_addition = "train"
            real_test.data_name_addition = "test"
            bkg_df = real_train.pandasDF()

            bkg_df = bkg_df.loc[np.array(bkg_df[bkg_sel].T) == 1]
            real_train.set_data(bkg_df)
            real_train.data_name_addition = "train bkg"

            real_test_index = real_test.index
            mc_test_index = mc_test.index
            first_example = i == 0  # to plot only the first time
            clf_trained, _, pred_real_tmp = classify(real_train, mc_train, validation=real_test,
                                                     clf=clf, get_predictions=True,
                                                     extended_report=first_example,
                                                     features=columns, weights_ratio=weights_ratio)
            clf_trained, _, pred_mc_tmp = classify(validation=mc_test, clf=clf_trained,
                                                   get_predictions=True,
                                                   extended_report=first_example,
                                                   features=columns, weights_ratio=weights_ratio)

            # collect predictions and index
            pred_real_tmp = pred_real_tmp['y_proba']
            pred_real.append(pd.Series(pred_real_tmp[:, 1], index=real_test_index))
            pred_mc_tmp = pred_mc_tmp['y_proba']
            pred_mc.append(pd.Series(pred_mc_tmp[:, 1], index=mc_test_index))

        # concatenate predictions and index
        pred_real = pd.concat(pred_real)
        pred_real.sort_index(inplace=True)
        pred_mc = pd.concat(pred_mc)
        pred_mc.sort_index(inplace=True)
        output['pred_real'] = pred_real
        output['pred_mc'] = pred_mc

        # save predictions
        if isinstance(save_real_pred, (str, int)) and not isinstance(save_real_pred, bool) and predict:
            root_dict = copy.deepcopy(real_data.data)

            if root_dict['selection'] is not None:
                raise ValueError(
                        "Cannot save predictions to root as selections have been applied in the script")

            add_branch_to_rootfile(filename=root_dict['filenames'],
                                   treename=root_dict['treename'],
                                   new_branch=pred_real, branch_name=save_real_pred)

        if isinstance(save_mc_pred, (str, int)) and not isinstance(save_mc_pred, bool) and predict:
            root_dict = copy.deepcopy(mc_data.data)

            if root_dict['selection'] is not None:
                raise ValueError(
                        "Cannot save predictions to root as selections have been applied in the script")

            add_branch_to_rootfile(filename=root_dict['filenames'],
                                   treename=root_dict.get('treename'),
                                   new_branch=pred_mc, branch_name=save_mc_pred)
        out.figure("predictions total")
        plt.legend()
        plt.title("Predictions of MC vs all real data")
        plt.hist(pred_real, bins=30)
        plt.hist(pred_mc, bins=30)

        out.figure("predictions total normalized")
        plt.legend()
        plt.title("Predictions of MC vs all real data normalized")
        plt.hist(pred_real, bins=30, density=True, alpha=0.5, range=(0, 1))
        plt.hist(pred_mc, bins=30, density=True, alpha=0.5, range=(0, 1))

    return output
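
A minimal usage sketch (not part of the original source): toy DataFrames stand in for the usual ROOT-backed HEPDataStorage objects, and the column names ('B_M', 'pt', 'bkg_sideband') are invented for illustration only.

import numpy as np
import pandas as pd
from raredecay.tools.data_storage import HEPDataStorage

rng = np.random.RandomState(42)
n = 2000
real_df = pd.DataFrame({'B_M': rng.uniform(5000, 6000, n),
                        'pt': rng.exponential(1000, n)})
# hypothetical 0/1 flag marking the upper-sideband background used for training
real_df['bkg_sideband'] = (real_df['B_M'] > 5700).astype(int)
mc_df = pd.DataFrame({'B_M': rng.normal(5280, 20, n),
                      'pt': rng.exponential(1200, n),
                      'bkg_sideband': np.zeros(n, dtype=int)})

real_data = HEPDataStorage(data=real_df, target=0)
mc_data = HEPDataStorage(data=mc_df, target=1)

# performance mode: K-folded bkg-vs-MC training, ROC curve and Punzi-FoM-vs-cut plot
result = final_training(real_data, mc_data, bkg_sel='bkg_sideband',
                        clf='xgb', n_folds=5, columns=['pt'],
                        performance_only=True, metric_vs_cut='punzi')
print(result.get('best_threshold_cut'), result.get('best_metric'))
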
Example #2
def reweightCV(real_data,
               mc_data,
               columns=None,
               n_folds=10,
               reweighter='gb',
               reweight_cfg=None,
               n_reweights=1,
               scoring=True,
               score_columns=None,
               n_folds_scoring=10,
               score_clf='xgb',
               mayou_score=False,
               extended_train_similar=False,
               apply_weights=True):
    """Reweight data MC/real in a K-Fold way to unbias the reweighting.

    Sophisticated reweighting-algorithms can be quite sensitive to their
    hyperparameters. Therefore, it is good to get an estimate of the
    reweighting quality by reweighting the data itself and "testing" it
    (comparing how similar the reweighted data is to the real data). In order
    to get an unbiased reweighting, a KFolding procedure is applied:

    - the reweighter is trained on (n-1)/n of the data and predicts the
      weights for the remaining 1/n. This is done n times, resulting in
      unbiased weights for the mc data.

    To know how well the reweighter worked, different strategies can be used
    and are implemented; for further information also see:
    |reweightingCV_quality_measure_link|

    Parameters
    ----------
    real_data : |hepds_type|
        The real data
    mc_data : |hepds_type|
        The mc data
    columns : list(str, str, str, ...)
        The branches to use for the reweighting.
    n_folds : int > 1
        Number of folds to split the data for the reweighting. Usually, the
        higher the better.
    reweighter : str {'gb', 'bins'}
        Which reweighter to use, either the Gradient Boosted reweighter or the
        (normally used) Bins reweighter (both from *hep_ml*)
    reweight_cfg : dict
        A dict containing all the keyword arguments for the configuration of
        the reweighters.
    n_reweights : int
        As the reweighting often yields different weights depending on random
        parameters like the splitting of the data, the new weights can be
        produced by taking the average of the weights over many reweighting
        runs. n_reweights is the number of reweight runs to average over.
    scoring : boolean
        If True, the data is not only reweighted with KFolding but also several
        scoring metrics are tested.

        - Data-ROC : The data (mc reweighted and real mixed) is split in
          KFolds, a classifier is then trained on the training fold and tested
          on the test-fold. This is done K times and the roc curve is
          evaluated. It is basically a good measure of how well two datasets
          can be distinguished, *but* it can be "overfitted". Too high single
          weights can lead to a roc curve significantly lower than 0.5, and it
          is therefore only a good indication, not a standalone measure of
          quality for the reweighter hyper-parameter search.
        - mcreweighted_as_real : the reweighter is trained on (n-1)/n of the
          data and the remaining 1/n is then reweighted (as described above).
          We can train a classifier on the mc (not reweighted) as well as the
          real data (so a classifier which "distinguishes" between mc and
          real) and predict:

          - (not in training used) mc (not reweighted) and label it as if it
            were real data.
          - (not in training used) mc reweighted and label it as if it were
            real data.
          - (not in training used) real data and label it real.

          Then we look at the tpr (we cannot look at the ROC as we only inserted
          one class of labels; real) and therefore at "how many of the
          datapoints we inserted did the classifier predict as real?":

          The score for the real data should be the highest, the one for the
          mc not reweighted the lowest. The reweighted one should be somewhere
          in between (most probably). It is **not** the goal to maximise the
          tpr for the mc reweighted (by changing the reweighter
          hyper-parameters), as high single weights (which occur when
          overfitting) will increase the tpr drastically.
        - train_similar: Probably the most stable score to find the
          gb-reweighter hyper-parameters. The data is split into KFolds and a
          classifier is trained on the mc reweighted and real data. Then it
          predicts the (not yet seen) real data. The more it is able to
          predict as real, the more it was able to learn from the differences
          of the datasets. This scoring cannot overfit in the same way as the
          one above, because a single high weight will cause a very bad
          distribution of the mc data and therefore the classifier will be
          able to predict nearly every real data as real (only *one single
          point*, the one with the high weight, will be predicted as mc, the
          rest as real)
    score_columns : list(str, str, str,...)
        The columns to use for the scoring. They should not be the same as for
        the reweighting in order to unbias the score. It is usually a good
        idea to use the same branches as will be used for the selection
        training later on.
    n_folds_scoring : int > 1
        The number of folds to split the data into for the scoring
        described above.
    score_clf : str or dict or clf
        The classifier to use for the scoring. For an overview of what can be
        used, see :py:func:`~raredecay.analysis.ml_analysis.make_clf()`.
    mayou_score : boolean
        If True, the experimental *mayou_score* will be generated.
    extended_train_similar : boolean
        If True, an experimental score will be generated.
    apply_weights : boolean
        If True, set the new weights to the MC data in place. This changes the
        weights in the data-storage.


    Return
    ------
    out : dict
        The output is a dictionary containing the different scores and/or the
        new weights. The keywords are:

        - *weights* : pandas Series containing the new weights
        - *mcreweighted_as_real_score* : The scores of this method in a dict
        - *train_similar* : The scores of this method in a dict
        - *roc_auc* : The ROC AUC score of distinguishing mc reweighted from real
    """
    import numpy as np

    import raredecay.analysis.ml_analysis as ml_ana
    from raredecay.tools import metrics
    from raredecay.globals_ import out

    output = {}
    # do the Kfold reweighting. This reweights the data with Kfolding and returns
    # the weights. If add_weights_to_data is True, the weights will automatically be
    # added to the mc_data (or here, reweight_mc). To get an estimate
    # whether it has over-fitted, you can get the mcreweighted_as_real_score.
    # This trains a clf on mc/real and tests it on mc, mc reweighted, real
    # but both labeled with the same target as the real data in training
    # The mc reweighted score should therefore lie in between the mc and the
    # real score.
    #    if not apply_weights:
    old_weights = mc_data.get_weights()
    # make sure the targets are set the right way TODO
    Kfold_output = ml_ana.reweight_Kfold(mc_data=mc_data,
                                         real_data=real_data,
                                         meta_cfg=reweight_cfg,
                                         columns=columns,
                                         reweighter=reweighter,
                                         n_reweights=n_reweights,
                                         mcreweighted_as_real_score=scoring,
                                         score_columns=score_columns,
                                         n_folds=n_folds,
                                         score_clf=score_clf,
                                         add_weights_to_data=True)
    new_weights = Kfold_output.pop('weights')
    # TODO: needed below?
    new_weights = new_weights.sort_index()

    if scoring:
        output['mcreweighted_as_real_score'] = Kfold_output

        # To get a good estimate of the reweighting quality, the
        # train_similar score can be used. It's the one with training on
        # mc reweighted/real and testing on real, quite robust.
        # test_max is nice to know too, even though it can also be set to False
        # if testing the same distribution over and over again, as it is the
        # same for the same distributions (actually, it's just doing the score
        # without the weights).
        # test_predictions is an additional score I tried but so far it is not
        # reliable or understandable at all. The output, the scores dictionary,
        # is better described in the docs of train_similar
        scores = metrics.train_similar(mc_data=mc_data,
                                       real_data=real_data,
                                       test_max=True,
                                       n_folds=n_folds_scoring,
                                       n_checks=n_folds_scoring,
                                       features=score_columns,
                                       old_mc_weights=old_weights,
                                       test_mc=extended_train_similar,
                                       test_shuffle=extended_train_similar,
                                       test_predictions=False,
                                       clf=score_clf)
        out.add_output(['Mayou FoM:', scores['similar_dist']], to_end=True)

        # We can of course also test the normal ROC curve. This is susceptible
        # to overfitting but still (if not overfitting) a nice measure. You
        # insert two datasets and do the normal cross-validation on it. It's
        # quite a multi-purpose function depending on what validation is. If
        # it is an integer, it means: do cross-validation with n(=validation) folds.
        temp_mc_targets = mc_data.get_targets()
        mc_data.set_targets(0)
        temp_real_targets = real_data.get_targets()
        real_data.set_targets(1)
        tmp_, roc_auc_score, clf_output = ml_ana.classify(
            original_data=mc_data,
            target_data=real_data,
            validation=n_folds_scoring,
            plot_importance=4,
            plot_title="ROC AUC to distinguish data",
            clf=score_clf,
            weights_ratio=1,
            features=score_columns,
            extended_report=scoring,
            get_predictions=True)
        del tmp_
        # HACK: inspect the correlation of the mc weights with the predictions
        predictions = clf_output['y_proba'][:, 1][clf_output['y_true'] == 0]
        # weights_pred = np.log(clf_output['weights'][clf_output['y_true'] == 0])
        weights_pred = clf_output['weights'][clf_output['y_true'] == 0]
        out.figure("Correlation of weights and predictions")
        plt.scatter(predictions, weights_pred)
        plt.xlabel("predictions")
        plt.ylabel("weights")
        out.figure("Correlation of weights and predictions hexbin")
        plt.hexbin(x=predictions, y=weights_pred, gridsize=150)
        #        sns.jointplot(x=predictions, y=weights_pred, kind="hex")

        if mayou_score:
            metrics.mayou_score(mc_data=mc_data,
                                real_data=real_data,
                                n_folds=n_folds_scoring,
                                clf=score_clf,
                                old_mc_weights=old_weights)
        # An example of adding output with the most important parameters. The
        # first argument can also be a single object instead of a list.
        # do_print means printing it to the console as well instead of only
        # saving it to the output file. to_end is often quite useful, as it
        # prints (and saves) the arguments at the end of the file, so the
        # important results end up at the bottom.
        out.add_output(['ROC AUC score:', roc_auc_score],
                       importance=5,
                       title='ROC AUC of mc reweighted/real KFold',
                       to_end=True)

        out.add_output(['score:', scores['score'], "+-", scores['score_std']],
                       importance=5,
                       title='Train similar report',
                       to_end=True)
        if extended_train_similar:
            out.add_output([
                '\nScore_mc:', scores['score_mc'], "+-", scores['score_mc_std']
            ],
                           importance=5,
                           to_end=True)
        if scores.get('score_max', False):
            out.add_output([
                'score max:', scores['score_max'], "+-",
                scores['score_max_std']
            ],
                           importance=5,
                           to_end=True)
        if scores.get('score_mc_max', False):
            out.add_output([
                'score_mc_max:', scores['score_mc_max'], "+-",
                scores['score_mc_max_std']
            ],
                           importance=5,
                           to_end=True)

        if scores.get('score_shuffled', False):
            out.add_output([
                'score_shuffled:', scores['score_shuffled'], "+-",
                scores['score_shuffled_std']
            ],
                           importance=5,
                           to_end=True)
        output['train_similar'] = scores
        output['roc_auc'] = roc_auc_score

    output['weights'] = new_weights
    if not apply_weights:
        mc_data.set_weights(old_weights)

    if scoring:
        mc_data.set_targets(temp_mc_targets)
        real_data.set_targets(temp_real_targets)

    return output
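
A hypothetical usage sketch: reweight toy MC to toy "real" data with the K-folded procedure above. The storage construction, feature names and reweighter configuration are assumptions for illustration, not taken from the source.

import numpy as np
import pandas as pd
from raredecay.tools.data_storage import HEPDataStorage

rng = np.random.RandomState(0)
mc_df = pd.DataFrame({'pt': rng.exponential(900, 3000),
                      'eta': rng.normal(3.0, 0.6, 3000)})
real_df = pd.DataFrame({'pt': rng.exponential(1000, 3000),
                        'eta': rng.normal(3.1, 0.6, 3000)})
mc_data = HEPDataStorage(data=mc_df, target=0)
real_data = HEPDataStorage(data=real_df, target=1)

cv_out = reweightCV(real_data, mc_data,
                    columns=['pt', 'eta'],          # branches used for the reweighting
                    n_folds=5,
                    reweighter='gb',
                    reweight_cfg=dict(n_estimators=30, learning_rate=0.1),  # assumed gb-reweighter kwargs
                    scoring=True,
                    score_columns=['pt', 'eta'],    # ideally different branches than above
                    n_folds_scoring=5,
                    score_clf='xgb',
                    apply_weights=False)            # keep the original weights on mc_data
new_weights = cv_out['weights']                     # pandas Series with the K-folded weights
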
Example #3
def feature_exploration(original_data,
                        target_data,
                        features=None,
                        n_folds=10,
                        clf='xgb',
                        roc_auc='single',
                        extended_report=True):
    """Explore the features by getting the roc auc and their feature importance.

    An essential part is to have a rough idea of how discriminating the
    features are. A classifier is trained on each single feature and on all
    of them together; correlations and feature importances are plotted if wanted.

    Parameters
    ----------
    original_data : |hepds_type|
        One dataset
    target_data : |hepds_type|
        The other dataset
    features : list(str, str, str,...)
        The features/branches/columns to explore
    n_folds : int > 1
        Number of folds to split the data into to do some training/testing and
        get an estimate for the feature importance.
    clf : str or dict or clf, see: :py:meth:`~raredecay.analysis.ml_analysis.make_clf()`
        The classifier you want to use.
    roc_auc : {'single', 'all', 'both'} or False
        Whether to make a training/testing with:

        - every single feature (-> n_feature times KFolded training)
        - all features together (-> one KFolded training)
        - both of the above
        - None of them (-> use *False*)
    extended_report : boolean
        If True, an extended report will be made including feature importance
        and more.

    """
    import raredecay.analysis.ml_analysis as ml_ana

    roc_auc_all = roc_auc in ('all', 'both')
    roc_auc_single = roc_auc in ('single', 'both')

    if features is not None:
        original_data = original_data.copy_storage(columns=features)
        target_data = target_data.copy_storage(columns=features)

    figure = "Plotting" + str(original_data.name) + " and " + str(
        target_data.name)
    if extended_report:
        original_data.plot(figure=figure, title=figure)
        target_data.plot(figure=figure)

    if roc_auc_all:
        ml_ana.classify(original_data,
                        target_data,
                        validation=n_folds,
                        extended_report=extended_report,
                        clf=clf,
                        curve_name="all features",
                        plot_title="ROC AUC of")

    features = original_data.columns if features is None else features

    output = {}
    out_temp = {}
    if roc_auc_single:
        for feature in features:
            title = "Feature exploration, ROC AUC only using " + str(feature)
            tmp_, score = ml_ana.classify(original_data,
                                          target_data,
                                          features=feature,
                                          curve_name="only using " +
                                          str(feature),
                                          clf=clf,
                                          validation=n_folds,
                                          extended_report=extended_report,
                                          plot_title=title,
                                          weights_ratio=1,
                                          plot_importance=2)
            del tmp_
            out_temp[feature] = score

    output['score'] = out_temp

    return output
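
An illustrative sketch (toy data, invented feature names): train a classifier per feature and one on all features, then read back the per-feature ROC AUC scores.

import numpy as np
import pandas as pd
from raredecay.tools.data_storage import HEPDataStorage

rng = np.random.RandomState(1)
n = 2000
original_data = HEPDataStorage(
    data=pd.DataFrame({'pt': rng.exponential(900, n),
                       'eta': rng.normal(3.0, 0.5, n),
                       'ip_chi2': rng.exponential(5, n)}),
    target=0)
target_data = HEPDataStorage(
    data=pd.DataFrame({'pt': rng.exponential(1100, n),
                       'eta': rng.normal(3.2, 0.5, n),
                       'ip_chi2': rng.exponential(4, n)}),
    target=1)

result = feature_exploration(original_data, target_data,
                             features=['pt', 'eta', 'ip_chi2'],
                             n_folds=5, clf='xgb', roc_auc='both',
                             extended_report=False)
print(result['score'])  # dict mapping each feature to its K-folded ROC AUC
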
Example #4
def train_similar(mc_data, real_data, features=None, n_checks=10, n_folds=10,
                  clf='xgb', test_max=True, test_shuffle=True, test_mc=False,
                  old_mc_weights=1, test_predictions=False, clf_pred='rdf'):
    """Score for reweighting. Train clf on mc reweighted/real, test on real; minimize score.

    Enter two datasets and evaluate the score described below. Return a
    dictionary containing the different scores. The test_predictions is
    another scoring, which is built upon the train_similar method.

    **Scoring method description**

    **Idea**:
    A clf is trained on the reweighted mc as well as on the real data of a
    certain decay. The classifier therefore learns to distinguish between
    Monte-Carlo data and real data. Then we let the classifier predict some
    real data (an unbiased test set) and see how much of it it is able to
    classify as real events. The lower the score, the fewer differences it was
    able to learn from the training data, therefore the more similar the
    training data and the better the reweighting.

    **Advantages**: It is quite difficult to cheat on this method. Most of all
    it is robust to single high-weight events (which mcreweighted_as_real is
    not) and, in general, seems to be the best scoring so far.

    **Disadvantages**: If you insert a gaussian shaped 1.0 as mc and a gaussian
    shaped 1.1 as real, the score will be bad (around 0.33). So far, this was
    only observed for "artificial" distributions (even though, of course, we
    do not know whether it partly affects real distributions as well)

    **Output explanation**

    The return is a dictionary containing several values. Of course, only the
    values, which are set to be evaluated, are contained. The keys are:

    - '**score**' : The average of all train_similar scores (as we use KFolding,
      there will be n_checks scores). *The* score.
    - '**score_std**' : The std of a single score, just for curiosity
    - '**score_max**' : The (average of all) "maximum" score. Actually the
      train_similar score but with mc instead of *reweighted* mc. Should be
      higher than the reweighted score.
    - '**score_max_std**' : The std of a single score, just for curiosity
    - '**score_pred**' : The score of the test_predictions method.
    - '**score_mc_pred**' : The score of the test_predictions method but on the
      predictions of the mc instead of the *reweighted* mc.

    Parameters
    ----------
    mc_data : HEPDataStorage
        The reweighted Monte-Carlo data, assuming the new weights are applied
        already.
    real_data : HEPDataStorage
        The real data
    n_checks : int >= 1
        Number of checks to perform. Has to be <= n_folds
    n_folds : int > 1
        Number of folds the data will be split into
    clf : str
        The name of a classifier to be used in
        :py:func:`~raredecay.analysis.ml_analysis.classify`.
    test_max : boolean
        If true, test for the "maximum value" by training also on mc/real
        (instead of *reweighted* mc/real)
        and test on real. The score for only mc should be higher than for
        reweighted mc/real. It *should* most probably but does not have to
        be!
    old_mc_weights : array-like or 1
        If *test_max* is True, the weights for mc before reweighting will be
        taken to be *old_mc_weights*, the weights the mc distribution had
        before the reweighting. The default is 1.
    test_predictions : boolean
        If true, try to distinguish the predictions. Advanced feature and not
        yet really discovered how to interpret. Gives a very high ROC somehow.
    clf_pred : str
        The classifier to be used to distinguish the predictions. Required for
        the *test_predictions*.

    Return
    ------
    out : dict
        A dictionary containing the different scores; see the description above.

    """
    import raredecay.analysis.ml_analysis as ml_ana
    from raredecay.globals_ import out

    # initialize variables
    assert 1 <= n_checks <= n_folds and n_folds > 1, "wrong n_checks/n_folds. Check the docs"
    assert isinstance(mc_data, data_storage.HEPDataStorage), \
        "mc_data wrong type:" + str(type(mc_data)) + ", has to be HEPDataStorage"
    assert isinstance(real_data, data_storage.HEPDataStorage), \
        "real_data wrong type:" + str(type(real_data)) + ", has to be HEPDataStorage"
#    assert isinstance(clf, str),\
#        "clf has to be a string, the name of a valid classifier. Check the docs!"

    output = {}

    scores = np.ones(n_checks)
    scores_shuffled = np.ones(n_checks)
    scores_mc = np.ones(n_checks)
    scores_max = np.ones(n_checks)  # required due to output of loop
    scores_mc_max = np.ones(n_checks)
#    scores_weighted = []
    scores_max_weighted = []
    probas_mc = []
    probas_reweighted = []
    weights_mc = []
    weights_reweighted = []

    real_pred = []
    real_test_index = []
    real_mc_pred = []

    # initialize data
    tmp_mc_targets = mc_data.get_targets()
    mc_data.set_targets(0)
    real_data.make_folds(n_folds=n_folds)
    if test_mc:
        mc_data.make_folds(n_folds=n_folds)
    for fold in range(n_checks):
        real_train, real_test = real_data.get_fold(fold)
        if test_mc:
            mc_train, mc_test = mc_data.get_fold(fold)
            mc_test.set_targets(0)
        else:
            mc_train = mc_data.copy_storage()
        mc_train.set_targets(0)

        real_test.set_targets(1)
        real_train.set_targets(1)

        tmp_out = ml_ana.classify(mc_train, real_train, validation=real_test, clf=clf,
                                  plot_title="train on mc reweighted/real, test on real",
                                  weights_ratio=1, get_predictions=True,
                                  features=features,
                                  plot_importance=1, importance=1)
        clf_trained, scores[fold], pred_reweighted = tmp_out

        tmp_weights = mc_train.get_weights()

        if test_shuffle:
            import copy
            shuffled_weights = copy.deepcopy(tmp_weights)
            shuffled_weights = shuffled_weights.reindex(np.random.permutation(shuffled_weights.index))
            mc_train.set_weights(shuffled_weights)
            tmp_out = ml_ana.classify(mc_train, real_train, validation=real_test, clf=clf,
                                      plot_title="train on mc reweighted/real, test on real",
                                      weights_ratio=1, get_predictions=True,
                                      features=features,
                                      plot_importance=1, importance=1)
            scores_shuffled[fold] = tmp_out[1]
            mc_train.set_weights(tmp_weights)

        if test_mc:
            clf_trained, scores_mc[fold] = ml_ana.classify(validation=mc_test,
                                                           clf=clf_trained,
                                                           plot_title="train on mc reweighted/real, test on mc",
                                                           weights_ratio=1, get_predictions=False,
                                                           features=features,
                                                           plot_importance=1,
                                                           importance=1)

#        del clf_trained, tmp_pred
        probas_reweighted.append(pred_reweighted['y_proba'])
        weights_reweighted.append(pred_reweighted['weights'])

        real_pred.extend(pred_reweighted['y_pred'])
        real_test_index.extend(real_test.get_index())

        if test_max:
            temp_weights = mc_data.get_weights()
            mc_data.set_weights(old_mc_weights)
            tmp_out = ml_ana.classify(mc_data, real_train, validation=real_test,
                                      plot_title="real/mc NOT reweight trained, validate on real",
                                      weights_ratio=1, get_predictions=True, clf=clf,
                                      features=features,
                                      plot_importance=1, importance=1)
            clf_trained, scores_max[fold], pred_mc = tmp_out
            if test_mc:
                clf_trained, scores_mc_max[fold] = ml_ana.classify(validation=mc_test, clf=clf_trained,
                                                                   plot_title="train on mc NOT reweighted/real, test on mc",
                                                                   weights_ratio=1,
                                                                   get_predictions=False,
                                                                   features=features,
                                                                   plot_importance=1,
                                                                   importance=1)
            del clf_trained
# HACK
            tmp_pred = pred_mc['y_proba'][:, 1] * pred_mc['weights']
            scores_max_weighted.extend(tmp_pred * (pred_mc['y_true'] * 2 - 1))

# HACK END
            mc_data.set_weights(temp_weights)
            probas_mc.append(pred_mc['y_proba'])
            weights_mc.append(pred_mc['weights'])

            real_mc_pred.extend(pred_mc['y_pred'])

    output['score'] = np.round(scores.mean(), 4)
    output['score_std'] = np.round(scores.std(), 4)

    if test_shuffle:
        output['score_shuffled'] = np.round(scores_shuffled.mean(), 4)
        output['score_shuffled_std'] = np.round(scores_shuffled.std(), 4)

    if test_mc:
        output['score_mc'] = np.round(scores_mc.mean(), 4)
        output['score_mc_std'] = np.round(scores_mc.std(), 4)

    out.add_output(["Score train_similar (recall, lower means better): ",
                   str(output['score']) + " +- " + str(output['score_std'])],
                   subtitle="Clf trained on real/mc reweight, tested on real")
    if test_max:
        output['score_max'] = np.round(scores_max.mean(), 4)
        output['score_max_std'] = np.round(scores_max.std(), 4)
        if test_mc:
            output['score_mc_max'] = np.round(scores_mc_max.mean(), 4)
            output['score_mc_max_std'] = np.round(scores_mc_max.std(), 4)
        out.add_output(["No reweighting score: ", round(output['score_max'], 4)])

    if test_predictions:
        # test on the reweighted/real predictions
        real_data.set_targets(targets=real_pred, index=real_test_index)
        tmp_, score_pred = ml_ana.classify(real_data, target_from_data=True, clf=clf_pred,
                                           features=features,
                                           plot_title="train on predictions reweighted/real, real as target",
                                           weights_ratio=1, validation=n_checks, plot_importance=3)
        output['score_pred'] = round(score_pred, 4)

    if test_predictions and test_max:
        # test on the mc/real predictions
        real_data.set_targets(targets=real_mc_pred, index=real_test_index)
        tmp_, score_mc_pred = ml_ana.classify(real_data, target_from_data=True, clf=clf_pred,
                                              validation=n_checks,
                                              plot_title="mc not rew/real pred, real as target",
                                              weights_ratio=1, plot_importance=3)
        output['score_mc_pred'] = np.round(score_mc_pred, 4)

    mc_data.set_targets(tmp_mc_targets)

    output['similar_dist'] = similar_dist(predictions=np.concatenate(probas_reweighted)[:, 1],
                                          weights=np.concatenate(weights_reweighted))

    return output
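
A sketch under assumptions: mc_data is supposed to already carry the new (reweighted) weights, old_weights holds the pre-reweighting ones, and both storages are toy data; everything except the call signature is invented.

import numpy as np
import pandas as pd
from raredecay.tools.data_storage import HEPDataStorage

rng = np.random.RandomState(2)
n = 2000
mc_data = HEPDataStorage(
    data=pd.DataFrame({'pt': rng.exponential(900, n),
                       'eta': rng.normal(3.0, 0.5, n)}),
    sample_weights=rng.uniform(0.5, 1.5, n),  # stand-in for the new weights
    target=0)
real_data = HEPDataStorage(
    data=pd.DataFrame({'pt': rng.exponential(1000, n),
                       'eta': rng.normal(3.1, 0.5, n)}),
    target=1)
old_weights = np.ones(n)  # weights before the reweighting

scores = train_similar(mc_data, real_data,
                       features=['pt', 'eta'],
                       n_checks=5, n_folds=5, clf='xgb',
                       test_max=True, old_mc_weights=old_weights,
                       test_shuffle=False, test_mc=False,
                       test_predictions=False)
print(scores['score'], '+-', scores['score_std'])  # lower is better
print(scores['score_max'])  # reference score without the reweighting
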
Example #5
def mayou_score(mc_data, real_data, features=None, old_mc_weights=1,
                clf='xgb', splits=2, n_folds=10):
    """An experimental score using a "loss" function for data-similarity"""
    import raredecay.analysis.ml_analysis as ml_ana
    from raredecay.globals_ import out

    # initialize variables
    output = {}
    score_mc_vs_mcr = []
    score_mcr_vs_real = []
#    splits *= 2  # because every split is done with fold 0 and 1 (<- 2 *)

    # loop over number of splits, split the mc data

    mc_data.make_folds(n_folds)
    real_data.make_folds(n_folds)

    # mc reweighted vs mc
    for fold in range(n_folds):
        mc_data_train, mc_data_test = mc_data.get_fold(fold)
        # TODO: no real folds? It is better to test on full data always?
#        mc_data_train, mc_data_test = real_data.get_fold(fold)
        for split in range(splits * 2):  # because two possibilities per split
            if split % 2 == 0:
                mc_data_train.make_folds(2)
            mc_normal, mc_reweighted = mc_data_train.get_fold(split % 2)
            mc_normal.set_weights(old_mc_weights)
            score_mc_vs_mcr.append(ml_ana.classify(original_data=mc_normal,
                                                   target_data=mc_reweighted,
                                                   features=features,
                                                   validation=[mc_data_test, real_data],
                                                   clf=clf, plot_importance=1,
                                                   # TODO: no weights ratio? (roc auc)
                                                   weights_ratio=0
                                                   )[1])
    out.add_output(["mayou_score mc vs mc reweighted test on mc vs real score: ",
                    score_mc_vs_mcr, "\nMean: ", np.mean(score_mc_vs_mcr),
                    " +-", np.std(score_mc_vs_mcr) / mt.sqrt(len(score_mc_vs_mcr) - 1)],
                   subtitle="Mayou score", to_end=True)

    output['mc_distance'] = np.mean(score_mc_vs_mcr)

    # mc_reweighted vs real
    for fold in range(n_folds):
        real_train, real_test = real_data.get_fold(fold)
        mc_train, mc_test = mc_data.get_fold(fold)
        mc_test.set_weights(old_mc_weights)
        score_mcr_vs_real.append(ml_ana.classify(original_data=mc_train,
                                                 target_data=real_train,
                                                 features=features,
                                                 validation=[mc_test, real_test],
                                                 clf=clf, plot_importance=1,
                                                 # TODO: no weights ratio? (roc auc)
                                                 weights_ratio=0
                                                 )[1])

    out.add_output(["mayou_score real vs mc reweighted test on mc vs real score: ",
                    score_mcr_vs_real, "\nMean: ", np.mean(score_mcr_vs_real),
                    " +-", np.std(score_mcr_vs_real) / mt.sqrt(len(score_mcr_vs_real) - 1)],
                   to_end=True)

    output['real_distance'] = np.mean(score_mcr_vs_real)

    return output
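
A hypothetical call of the experimental score above on toy storages (all data and names invented); it returns a dict with the mean 'mc_distance' and 'real_distance'.

import numpy as np
import pandas as pd
from raredecay.tools.data_storage import HEPDataStorage

rng = np.random.RandomState(3)
n = 2000
mc_data = HEPDataStorage(
    data=pd.DataFrame({'pt': rng.exponential(900, n)}),
    sample_weights=rng.uniform(0.5, 1.5, n),  # stand-in for reweighted weights
    target=0)
real_data = HEPDataStorage(
    data=pd.DataFrame({'pt': rng.exponential(1000, n)}),
    target=1)

result = mayou_score(mc_data, real_data, features=['pt'],
                     old_mc_weights=1, clf='xgb', splits=2, n_folds=5)
print(result['mc_distance'], result['real_distance'])
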
Example #6
def train_similar_new(mc, real, columns=None, n_checks=10, n_folds=10, clf='xgb', test_max=True,
                      test_shuffle=True, test_mc=False, old_mc_weights=1, test_predictions=False,
                      clf_pred='rdf'):
    """Score for reweighting. Train clf on mc reweighted/real, test on real; minimize score.

    Enter two datasets and evaluate the score described below. Return a
    dictionary containing the different scores. The test_predictions is
    another scoring, which is built upon the train_similar method.

    **Scoring method description**

    **Idea**:
    A clf is trained on the reweighted mc as well as on the real data of a
    certain decay. The classifier therefore learns to distinguish between
    Monte-Carlo data and real data. Then we let the classifier predict some
    real data (an unbiased test set) and see how much of it it is able to
    classify as real events. The lower the score, the fewer differences it was
    able to learn from the training data, therefore the more similar the
    training data and the better the reweighting.

    **Advantages**: It is quite difficult to cheat on this method. Most of all
    it is robust to single high-weight events (which mcreweighted_as_real is
    not) and, in general, seems to be the best scoring so far.

    **Disadvantages**: If you insert a gaussian shaped 1.0 as mc and a gaussian
    shaped 1.1 as real, the score will be bad (around 0.33). So far, this was
    only observed for "artificial" distributions (even though, of course, we
    do not know whether it partly affects real distributions as well)

    **Output explanation**

    The return is a dictionary containing the weighted Kolmogorov-Smirnov
    distances between the different prediction distributions. The keys are:

    - '**similar_ks_minimize**' : KS distance between the predictions of the
      classifier trained on *reweighted* mc/real and the predictions of a
      classifier trained on a mixed mc/real reference (two statistically
      equivalent samples). This is the quantity to minimize.
    - '**similar_ks_max**' : KS distance between the predictions of the
      classifier trained on *not reweighted* mc/real and the mixed reference.
    - '**similar_ks_maximize**' : KS distance between the predictions of the
      reweighted and the not reweighted training.

    Parameters
    ----------
    mc : |hepds_type|
        The reweighted Monte-Carlo data, assuming the new weights are applied
        already.
    real : |hepds_type|
        The real data
    n_checks : int >= 1
        Number of checks to perform. Has to be <= n_folds
    n_folds : int > 1
        Number of folds the data will be split into
    clf : str
        The name of a classifier to be used in
        :py:func:`~raredecay.analysis.ml_analysis.classify`.
    test_max : boolean
        If true, test for the "maximum value" by training also on mc/real
        (instead of *reweighted* mc/real)
        and test on real. The score for only mc should be higher than for
        reweighted mc/real. It *should* most probably but does not have to
        be!
    old_mc_weights : array-like or 1
        If *test_max* is True, the weights for mc before reweighting will be
        taken to be *old_mc_weights*, the weights the mc distribution had
        before the reweighting. The default is 1.
    test_predictions : boolean
        If true, try to distinguish the predictions. Advanced feature and not
        yet really discovered how to interpret. Gives a very high ROC somehow.
    clf_pred : str
        The classifier to be used to distinguish the predictions. Required for
        the *test_predictions*.

    Return
    ------
    out : dict
        A dictionary containing the different scores; see the description above.

    """
    import raredecay.analysis.ml_analysis as ml_ana
    from raredecay.tools.data_storage import HEPDataStorage
    from raredecay.analysis import statistics

    # Python 2/3 compatibility, str
    columns = dev_tool.entries_to_str(columns)
    clf = dev_tool.entries_to_str(clf)
    clf_pred = dev_tool.entries_to_str(clf_pred)

    # initialize variables
    assert 1 <= n_checks <= n_folds and n_folds > 1, "wrong n_checks/n_folds. Check the docs"
    assert isinstance(mc, data_storage.HEPDataStorage), \
        "mc_data wrong type:" + str(type(mc)) + ", has to be HEPDataStorage"
    assert isinstance(real, data_storage.HEPDataStorage), \
        "real_data wrong type:" + str(type(real)) + ", has to be HEPDataStorage"
    #    assert isinstance(clf, str),\
    #        "clf has to be a string, the name of a valid classifier. Check the docs!"

    output = {}

    predictions = []
    predictions_weights = []
    predictions_max = []
    predictions_max_weights = []
    predictions_min = []
    predictions_min_weights = []

    # initialize data
    tmp_mc_targets = mc.get_targets()
    mc.set_targets(0)
    real.make_folds(n_folds=n_folds)

    for fold in range(n_checks):
        real_train, real_test = real.get_fold(fold)
        mc_train = mc.copy_storage()
        mc_train.set_targets(0)
        real_test.set_targets(1)
        real_train.set_targets(1)

        tmp_out = ml_ana.classify(mc_train, real_train, validation=real_test, clf=clf,
                                  plot_title="train on mc reweighted/real, test on real",
                                  weights_ratio=1, get_predictions=True,
                                  features=columns,
                                  plot_importance=1, importance=1)
        clf_trained, _, pred = tmp_out

        predictions.append(pred['y_proba'][:, 1])
        predictions_weights.append(pred['weights'])

        temp_weights = mc_train.weights
        mc_train.set_weights(old_mc_weights)
        tmp_out = ml_ana.classify(original_data=mc_train, target_data=real_train, validation=real_test,
                                  plot_title="real/mc NOT reweight trained, validate on real",
                                  weights_ratio=1, get_predictions=True, clf=clf,
                                  features=columns,
                                  plot_importance=1, importance=1)
        clf_trained, _, pred = tmp_out
        predictions_max.append(pred['y_proba'][:, 1])
        predictions_max_weights.append(pred['weights'])
        mc_train.set_weights(temp_weights)

    predictions = np.concatenate(predictions)
    predictions_weights = np.concatenate(predictions_weights)
    predictions_max = np.concatenate(predictions_max)
    predictions_max_weights = np.concatenate(predictions_max_weights)

    # mix mc and real to get a nice shape of two similar dists
    # TODO: commented below out
    mc.set_weights(old_mc_weights)
    mc.make_folds(2)
    real.make_folds(2)
    mc1, mc2 = mc.get_fold(0)
    real1, real2 = real.get_fold(0)

    data1, target1, weights1 = mc1.make_dataset(real1)
    data2, target2, weights2 = mc2.make_dataset(real2)

    data1 = HEPDataStorage(data=data1, sample_weights=weights1, target=0)
    data2 = HEPDataStorage(data=data2, sample_weights=weights2, target=1)

    tmp_out = ml_ana.classify(original_data=data1, target_data=data2, validation=n_folds,
                              plot_title="real/mc mixed",
                              weights_ratio=1, get_predictions=True, clf=clf,
                              features=columns,
                              plot_importance=1, importance=1)
    clf_trained, _, pred = tmp_out
    predictions_min = np.array(pred['y_proba'][:, 1])
    predictions_min_weights = np.array(pred['weights'])

    mc.set_weights(temp_weights)
    mc.set_targets(tmp_mc_targets)

    # HACK
    import matplotlib.pyplot as plt
    n_bins = 20
    plt.figure("comparing the predictions")
    plt.hist(predictions, alpha=0.3, label="predictions", bins=n_bins, density=1)
    plt.hist(predictions_min, alpha=0.3, label="predictions_min", bins=n_bins, density=1)
    plt.hist(predictions_max, alpha=0.3, label="predictions_max", bins=n_bins, density=1)
    plt.legend()
    # plt.autoscale()

    output['similar_ks_minimize'] = statistics.ks_2samp(predictions, predictions_min,
                                                        weights1=predictions_weights,
                                                        weights2=predictions_min_weights)
    output['similar_ks_max'] = statistics.ks_2samp(predictions_max, predictions_min,
                                                   weights1=predictions_max_weights,
                                                   weights2=predictions_min_weights)
    output['similar_ks_maximize'] = statistics.ks_2samp(predictions, predictions_max,
                                                        weights1=predictions_weights,
                                                        weights2=predictions_max_weights)

    return output
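
A sketch of calling the KS-based variant on toy storages (invented data and names); the returned dict holds the weighted two-sample KS distances under the 'similar_ks_*' keys.

import numpy as np
import pandas as pd
from raredecay.tools.data_storage import HEPDataStorage

rng = np.random.RandomState(4)
n = 2000
mc = HEPDataStorage(
    data=pd.DataFrame({'pt': rng.exponential(900, n),
                       'eta': rng.normal(3.0, 0.5, n)}),
    sample_weights=rng.uniform(0.5, 1.5, n),  # stand-in for the reweighted weights
    target=0)
real = HEPDataStorage(
    data=pd.DataFrame({'pt': rng.exponential(1000, n),
                       'eta': rng.normal(3.1, 0.5, n)}),
    target=1)

ks_scores = train_similar_new(mc, real, columns=['pt', 'eta'],
                              n_checks=5, n_folds=5, clf='xgb',
                              old_mc_weights=1)
print(ks_scores['similar_ks_minimize'])  # reweighted training vs mixed reference
print(ks_scores['similar_ks_max'])       # not reweighted training vs mixed reference
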