Ejemplo n.º 1
0
    def _create_ds(self):
        """Build a stacked storage and return element 1 of its fold number 1.

        The reference frame is doubled, re-indexed starting at 100, deep-copied
        and re-indexed again starting at 200. Both frames are stacked around
        the untouched reference frame, wrapped in a ``HEPDataStorage`` with
        tripled targets and weights, and split into three unshuffled folds.
        """
        # doubled values, index 100, 101, ...
        doubled_low = self.data_for_hepds * 2
        low_index = list(range(100, 100 + len(doubled_low)))
        doubled_low.set_index([low_index], inplace=True)

        # same values once more, index 200, 201, ...
        doubled_high = copy.deepcopy(doubled_low)
        high_index = list(range(200, 200 + len(doubled_high)))
        doubled_high.set_index([high_index], inplace=True)

        stacked = pd.concat([doubled_low, self.data_for_hepds, doubled_high],
                            axis=0)
        tripled_weights = np.concatenate([self.weights_for_hepds] * 3)

        storage = HEPDataStorage(
            stacked,
            target=3 * list(self.target_for_hepds),
            sample_weights=tripled_weights,
            # no index is passed, because it is saved sorted
            data_name=self.truth_name,
            data_name_addition=self.truth_name_addition)

        storage.make_folds(3, shuffle=False)
        fold = storage.get_fold(1)

        # presumably get_fold returns (train, test); element 1 is handed back
        return fold[1]
Ejemplo n.º 2
0
 def _create_ds(self):
     """Return a ``HEPDataStorage`` whose data was assigned again via ``set_data``.

     The storage is first constructed from the reference data, targets and
     weights; afterwards the same reference data is set once more.
     """
     frame = self.data_for_hepds
     storage = HEPDataStorage(
         frame,
         target=self.target_for_hepds,
         sample_weights=self.weights_for_hepds,
         # no index is passed, because it is saved sorted
         data_name=self.truth_name,
         data_name_addition=self.truth_name_addition,
     )
     storage.set_data(frame)
     return storage
Ejemplo n.º 3
0
 def _create_ds(self):
     """Return a ``HEPDataStorage`` of the reference data with an explicit index."""
     options = {
         'target': self.target_for_hepds,
         'sample_weights': self.weights_for_hepds,
         'index': self.truth_index,
         'data_name': self.truth_name,
         'data_name_addition': self.truth_name_addition,
     }
     return HEPDataStorage(self.data_for_hepds, **options)
Ejemplo n.º 4
0
def test_root_storage():
    """Store generated data in a temporary ROOT file and test a storage on it.

    Every column of the frame returned by ``create_data()`` is written as a
    branch into a freshly named ROOT file in the working directory. A
    ``HEPDataStorage`` pointing at that file is then handed to ``pandasDF``
    for the actual checks. The file is removed afterwards, even if one of the
    steps raises.
    """
    # pick a random file name that is not taken yet
    alphabet = string.ascii_letters + string.digits
    while True:
        tmp_str = ''.join(random.choice(alphabet) for _ in range(15))
        filename = 'tmp1' + tmp_str + '.root'
        if not os.path.isfile(filename):
            break
    treename = 'tree1'

    try:
        df1 = create_data()

        # ``items`` replaces ``iteritems``, which was removed in pandas 2.0
        for name, col in df1.items():
            add_branch_to_rootfile(filename=filename, treename=treename,
                                   new_branch=col, branch_name=name)

        root_dict = dict(filenames=filename, treename=treename, branches=branches)
        weights1 = create_weights()
        storage1 = HEPDataStorage(data=root_dict, target=1, sample_weights=weights1)

        # start testing
        pandasDF(storage1)
    finally:
        # remove the root-file at the end; it may not exist if writing failed
        if os.path.isfile(filename):
            os.remove(filename)
Ejemplo n.º 5
0
def _create_data(n_storages=3):
    """Create a list of ``HEPDataStorage`` filled with random test data.

    Storage number ``i`` holds 50 rows and 4 columns (named by
    ``all_branches``) drawn from a normal distribution with mean ``0.3 * i``
    and standard deviation ``10 + i``. Its weights are drawn from a standard
    normal distribution and its target alternates between 0 and 1.

    Parameters
    ----------
    n_storages : int
        Number of storages to create.

    Return
    ------
    out : list
        The created storages, in order of creation.
    """
    data_storages = []

    for i in range(n_storages):
        data = pd.DataFrame(np.random.normal(0.3 * i, 10 + i, size=[50, 4]),
                            columns=all_branches)
        weights = np.random.normal(size=len(data))
        data_storages.append(
            HEPDataStorage(data,
                           target=i % 2,
                           sample_weights=weights,
                           data_name='test storage ' + str(i)))

    return data_storages


# The guard used to sit inside the function after the ``return`` and was
# therefore never reached; at module level it runs the tests of this file.
if __name__ == '__main__':
    unittest.main()
Ejemplo n.º 6
0
                        columns=['x', 'y', 'pred'])
    #    data['pred'] = np.array([min((abs(y), 0.99)) for y in np.random.normal(loc=0.6, scale=0.25, size=n_sig)])
    bkg_data = np.array([
        i for i in (np.random.exponential(scale=300, size=(7500, 3)) + 4800)
        if i[0] < 6000
    ])
    bkg_data[:, 2] = np.array([
        min((abs(y), 0.96))
        for y in np.random.normal(loc=0.4, scale=0.4, size=len(bkg_data))
    ])
    data = pd.concat(
        [data, pd.DataFrame(bkg_data, columns=['x', 'y', 'pred'])],
        ignore_index=True)

    data = HEPDataStorage(data,
                          target=np.concatenate(
                              (np.ones(n_sig), np.zeros(len(bkg_data)))))
    data_copy = data.copy_storage()

    if mode == 'fit':
        fit_result = fit_mass(
            data=data,
            column='x',
            sig_pdf=doubleCB,
            x=x,
            bkg_pdf=bkg_pdf,
            blind=False,
            #                                       blind=(5100, 5380),
            plot_importance=4,  #bkg_in_region=(5100, 5380)
        )
        print fit_result
Ejemplo n.º 7
0
    if scoring:
        mc_data.set_targets(temp_mc_targets)
        real_data.set_targets(temp_real_targets)

    return output


# temporary:
if __name__ == '__main__':
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt

    from raredecay.tools.data_storage import HEPDataStorage

    # two toy samples of 1000 events with slightly different normal shapes
    n_cols = 7
    cols = list(map(str, range(n_cols)))

    # sample labelled with target 1: mean 0, width 1
    a = HEPDataStorage(
        pd.DataFrame(np.random.normal(loc=0, scale=1, size=(1000, n_cols)),
                     columns=cols),
        target=1)

    # sample labelled with target 0: mean 0.2, width 1.3
    b = HEPDataStorage(
        pd.DataFrame(np.random.normal(loc=0.2, scale=1.3, size=(1000, n_cols)),
                     columns=cols),
        target=0)

    # disabled example call:
    #    feature_exploration(a, b, n_folds=3, roc_auc='all')

    plt.show()
Ejemplo n.º 8
0
]

# ==============================================================================
# Create the data
# ==============================================================================
DATA_PATH = '/home/decay-data/'  # TODO: set the path to your data (or leave it out)

# TODO: set your data
real_data_root = {
    'filenames': DATA_PATH + 'B2KpiLL-sWeights.root',
    'treename': 'DecayTree',
    'branches': all_branches,
}

# TODO: set the name and weights of your data
real_data = HEPDataStorage(data=real_data_root,
                           # the branch 'signal_sw' is taken as weights
                           sample_weights='signal_sw',
                           data_name="Real data",
                           data_name_addition="cut")

# TODO: same as above; the name is first bound to the file description and
# then rebound to the storage built from it
mc_data = {
    'filenames': DATA_PATH + 'Bu2K1Jpsi-mm-Sim08g.root',
    'treename': 'DecayTree',
    'branches': all_branches,
}
mc_data = HEPDataStorage(data=mc_data, data_name="MC", data_name_addition="cut")

# TODO: same as above. Apply data is the MC which you want to be reweighted
# NOTE(review): the storage created right after this dict receives mc_data,
# not this description -- confirm that this is intended
apply_data = {
    'filenames': DATA_PATH + 'Bu2K1Jpsi-ee.root',
    'treename': 'DecayTree',
    'branches': all_branches,
}
apply_data = HEPDataStorage(data=mc_data,
Ejemplo n.º 9
0
 def _create_ds2(self):
     """Return a ``HEPDataStorage`` built from the second set of reference data."""
     storage = HEPDataStorage(
         self.data_for_hepds2,
         target=self.target_for_hepds2,
         sample_weights=self.weights_for_hepds2,
         data_name=self.truth_name2,
         data_name_addition=self.truth_name_addition2,
     )
     return storage
Ejemplo n.º 10
0
def train_similar_new(mc, real, columns=None, n_checks=10, n_folds=10, clf='xgb', test_max=True,
                      test_shuffle=True, test_mc=False, old_mc_weights=1, test_predictions=False,
                      clf_pred='rdf'):
    """Score for reweighting. Compare clf predictions on real data with KS tests.

    Enter two datasets and evaluate the score described below. Return a
    dictionary containing the different scores.

    **Scoring method description**

    **Idea**:
    A clf is trained on the reweighted mc as well as on the real data of a
    certain decay. Therefore, the classifier learns to distinguish between
    Monte-Carlo data and real data. Then we let the classifier predict some
    real data (an unbiased test set). The less it is able to learn from the
    train data, the more similar mc and real are and therefore the better
    the reweighting.

    Three sets of predictions are collected:

    - *predictions*: clf trained on mc (with its current, reweighted
      weights) versus real, predicting the real test fold. Done for
      *n_checks* folds.
    - *predictions_max*: the same, but the mc weights are replaced by
      *old_mc_weights*, the weights before the reweighting.
    - *predictions_min*: mc (with *old_mc_weights*) and real are each split
      in two halves and mixed, a clf is trained to separate the first mix
      from the second one. As both mixes are drawn from the same
      distributions, this is meant as the "nothing to learn" reference.

    The distributions of the three sets are drawn as histograms into the
    matplotlib figure "comparing the predictions" and compared pairwise with
    a weighted two-sample KS test.

    **Advantages**: It is quite difficult to cheat on this method. Most of all
    it is robust to single high-weight events (which mcreweighted_as_real is
    not).

    **Disadvantages**: If you insert a gaussian shaped 1.0 as mc and a gaussian
    shaped 1.1 as real, the score will be bad. So far, this was
    only observed for "artificial" distributions (even though, of course, we
    do not know if it partly affects real distributions as well).

    **Output explanation**

    The return is a dictionary. Each value is whatever
    :py:func:`raredecay.analysis.statistics.ks_2samp` returns for the two
    weighted samples. The keys are:

    - '**similar_ks_minimize**' : *predictions* versus *predictions_min*.
    - '**similar_ks_max**' : *predictions_max* versus *predictions_min*.
    - '**similar_ks_maximize**' : *predictions* versus *predictions_max*.

    **Side effects**

    The folds of *real* and *mc* are re-made (*real* ends up with 2 folds).
    The targets and weights of *mc* are changed temporarily and restored
    before returning, also if an error occurs.

    Parameters
    ----------
    mc : |hepds_type|
        The reweighted Monte-Carlo data, assuming the new weights are applied
        already.
    real : |hepds_type|
        The real data
    columns : list(str) or None
        Passed on as *features* to
        :py:func:`~raredecay.analysis.ml_analysis.classify`.
    n_checks : int >= 1
        Number of checks to perform. Has to be <= n_folds
    n_folds : int > 1
        Number of folds the data will be split into
    clf : str
        The name of a classifier to be used in
        :py:func:`~raredecay.analysis.ml_analysis.classify`.
    test_max : boolean
        Currently unused; the "maximum" predictions (mc with
        *old_mc_weights*) are always evaluated.
    test_shuffle : boolean
        Currently unused.
    test_mc : boolean
        Currently unused.
    old_mc_weights : array-like or 1
        The weights the mc distribution had before the reweighting. The
        default is 1.
    test_predictions : boolean
        Currently unused.
    clf_pred : str
        Currently unused (only converted to str).

    Return
    ------
    out : dict
        A dictionary containing the different scores. Description see above.

    """
    import raredecay.analysis.ml_analysis as ml_ana
    from raredecay.tools.data_storage import HEPDataStorage
    from raredecay.analysis import statistics

    # Python 2/3 compatibility, str
    columns = dev_tool.entries_to_str(columns)
    clf = dev_tool.entries_to_str(clf)
    clf_pred = dev_tool.entries_to_str(clf_pred)

    # check the input
    assert 1 <= n_checks <= n_folds and n_folds > 1, "wrong n_checks/n_folds. Check the docs"
    assert isinstance(mc, HEPDataStorage), \
        "mc_data wrong type:" + str(type(mc)) + ", has to be HEPDataStorage"
    assert isinstance(real, HEPDataStorage), \
        "real_data wrong type:" + str(type(real)) + ", has to be HEPDataStorage"

    output = {}

    predictions = []
    predictions_weights = []
    predictions_max = []
    predictions_max_weights = []

    # remember the state of mc explicitly (instead of relying on a variable
    # leaking out of the loop below), so it can be restored in any case
    tmp_mc_targets = mc.get_targets()
    tmp_mc_weights = mc.weights

    try:
        # initialize data
        mc.set_targets(0)
        real.make_folds(n_folds=n_folds)

        for fold in range(n_checks):
            real_train, real_test = real.get_fold(fold)
            mc_train = mc.copy_storage()
            mc_train.set_targets(0)
            real_test.set_targets(1)
            real_train.set_targets(1)

            # reweighted mc versus real
            tmp_out = ml_ana.classify(mc_train, real_train, validation=real_test, clf=clf,
                                      plot_title="train on mc reweighted/real, test on real",
                                      weights_ratio=1, get_predictions=True,
                                      features=columns,
                                      plot_importance=1, importance=1)
            _, _, pred = tmp_out

            predictions.append(pred['y_proba'][:, 1])
            predictions_weights.append(pred['weights'])

            # mc with the weights from before the reweighting versus real
            fold_weights = mc_train.weights
            mc_train.set_weights(old_mc_weights)
            tmp_out = ml_ana.classify(original_data=mc_train, target_data=real_train,
                                      validation=real_test,
                                      plot_title="real/mc NOT reweight trained, validate on real",
                                      weights_ratio=1, get_predictions=True, clf=clf,
                                      features=columns,
                                      plot_importance=1, importance=1)
            _, _, pred = tmp_out
            predictions_max.append(pred['y_proba'][:, 1])
            predictions_max_weights.append(pred['weights'])
            mc_train.set_weights(fold_weights)

        predictions = np.concatenate(predictions)
        predictions_weights = np.concatenate(predictions_weights)
        predictions_max = np.concatenate(predictions_max)
        predictions_max_weights = np.concatenate(predictions_max_weights)

        # mix mc and real to get a nice shape of two similar dists
        mc.set_weights(old_mc_weights)
        mc.make_folds(2)
        real.make_folds(2)
        mc1, mc2 = mc.get_fold(0)
        real1, real2 = real.get_fold(0)

        data1, _, weights1 = mc1.make_dataset(real1)
        data2, _, weights2 = mc2.make_dataset(real2)

        data1 = HEPDataStorage(data=data1, sample_weights=weights1, target=0)
        data2 = HEPDataStorage(data=data2, sample_weights=weights2, target=1)

        tmp_out = ml_ana.classify(original_data=data1, target_data=data2, validation=n_folds,
                                  plot_title="real/mc mixed",
                                  weights_ratio=1, get_predictions=True, clf=clf,
                                  features=columns,
                                  plot_importance=1, importance=1)
        _, _, pred = tmp_out
        predictions_min = np.array(pred['y_proba'][:, 1])
        predictions_min_weights = np.array(pred['weights'])
    finally:
        # hand mc back the way it was received, also if something failed
        mc.set_weights(tmp_mc_weights)
        mc.set_targets(tmp_mc_targets)

    # HACK: draw the three prediction distributions for a visual comparison
    import matplotlib.pyplot as plt
    n_bins = 20
    plt.figure("comparing the predictions")
    plt.hist(predictions, alpha=0.3, label="predictions", bins=n_bins, density=1)
    plt.hist(predictions_min, alpha=0.3, label="predictions_min", bins=n_bins, density=1)
    plt.hist(predictions_max, alpha=0.3, label="predictions_max", bins=n_bins, density=1)
    plt.legend()
    # plt.autoscale()

    output['similar_ks_minimize'] = statistics.ks_2samp(predictions, predictions_min,
                                                        weights1=predictions_weights,
                                                        weights2=predictions_min_weights)
    output['similar_ks_max'] = statistics.ks_2samp(predictions_max, predictions_min,
                                                   weights1=predictions_max_weights,
                                                   weights2=predictions_min_weights)
    output['similar_ks_maximize'] = statistics.ks_2samp(predictions, predictions_max,
                                                        weights1=predictions_weights,
                                                        weights2=predictions_max_weights)

    return output