def preSum(train_dir, particle=""):
    '''
    Gather the targets and the naive ECAL + HCAL deposit sum before training.

    Meant for data visualization. Both arrays are also handed to
    ``saveSum_toHDF5`` together with ``particle`` before being returned.

    :type train_dir: str.
    :parameter train_dir: path to the training directory with HDF5 files.
    :type particle: str.
    :parameter particle: name of the particle, forwarded to ``saveSum_toHDF5``.
    :return: flattened targets and the summed calorimeter deposits.
    :rtype: numpy.ndarray, numpy.ndarray; shape: (n,) shape: (n,)
    '''
    grab_options = {
        'data': train_dir,
        'label_keys': 'target',
        'input_keys': ['ECAL', 'HCAL'],
    }

    # every target column except the first one, flattened to 1-D
    targets = simple_grab('Y', **grab_options)[:, 1:].ravel()

    # ECAL + HCAL sum for the same directory
    deposits = inpSum(train_dir)

    # persist both arrays
    saveSum_toHDF5(particle, targets, deposits)

    return targets, deposits
Example #2
0
def get_roc_data(**kargs):
    '''Get ROC curve **tpr**, **fpr**, **thresholds**, and **auc** from labels and predictions.
        Takes all arguments available to :py:func:`CMS_Deep_Learning.io.simple_grab`

        :param ROC_data: a tuple (fpr, tpr, thres, roc_auc) containing the roc parametrization and the auc.
                    When given it is returned as is and nothing is grabbed or computed.
        :param true_class_index: column of the true class; required when the labels have more than one column.
        :param *: Any argument available to :py:func:`CMS_Deep_Learning.io.simple_grab` to get **Y**, **predictions**

        :returns: the tuple (fpr, tpr, thres, roc_auc)
        :raises ValueError: if the labels have several columns and no ``true_class_index`` was given
        :raises AssertionError: if labels and predictions do not have the same shape
        '''
    # Precomputed curve supplied by the caller: just unpack and hand it back.
    if "ROC_data" in kargs:
        fpr, tpr, thres, roc_auc = kargs["ROC_data"]
        return fpr, tpr, thres, roc_auc

    # Imported lazily so the ROC_data shortcut above does not need sklearn.
    from sklearn.metrics import roc_curve, auc
    labels, predictions = simple_grab(['Y', 'predictions'], **kargs)
    true_class_index = kargs.get("true_class_index", None)

    assert labels.shape == predictions.shape, (
        "labels and predictions should have the same shape, %r != %r" %
        (labels.shape, predictions.shape))

    # Multi-column output: reduce to the column of the designated true class.
    if len(labels.shape) > 1 and labels.shape[1] > 1:
        if true_class_index is None:
            raise ValueError(
                "must designate index of true class for data of shape %r" %
                list(labels.shape))
        labels = labels[:, true_class_index].ravel()
        predictions = predictions[:, true_class_index].ravel()

    fpr, tpr, thres = roc_curve(labels, predictions)
    roc_auc = auc(fpr, tpr)
    return fpr, tpr, thres, roc_auc
def inpSum(dir):
    '''
    Add up the ECAL and HCAL shower deposits (naive sum).

    Both inputs are grabbed from ``dir``, reduced with ``sumCal``,
    flattened, and added element-wise.

    :type dir: str.
    :param dir: path to the directory with HDF5 files.
    :return: sum of the ECAL and HCAL sums.
    :rtype: numpy.ndarray, shape: (n,)
    '''
    calorimeters = ('ECAL', 'HCAL')

    # fetch both calorimeter inputs in one call
    ecal_cells, hcal_cells = simple_grab('X',
                                         data=dir,
                                         label_keys=list(calorimeters),
                                         input_keys=list(calorimeters))

    # per-calorimeter sums, flattened to 1-D
    ecal_total = sumCal(ecal_cells).ravel()
    hcal_total = sumCal(hcal_cells).ravel()

    return ecal_total + hcal_total
def grab(direc):
    '''
    Load the targets and the separate ECAL / HCAL sums found in ``direc``.

    :param direc: value passed as ``data`` to ``simple_grab``.
    :return: flattened targets (first column dropped), flattened
             ``sumCal`` of ECAL, flattened ``sumCal`` of HCAL.
    '''
    # targets: skip the first column, flatten the rest
    targets = simple_grab('Y',
                          data=direc,
                          label_keys='target',
                          input_keys=['ECAL', 'HCAL'])[:, 1:].ravel()

    # calorimeter inputs
    ecal_cells, hcal_cells = simple_grab('X',
                                         data=direc,
                                         label_keys=['ECAL', 'HCAL'],
                                         input_keys=['ECAL', 'HCAL'])

    # reduce each calorimeter with sumCal and flatten the result
    ecal_total, hcal_total = (sumCal(cells).ravel()
                              for cells in (ecal_cells, hcal_cells))

    return targets, ecal_total, hcal_total
Example #5
0
def grab_ch(direc):
    '''
    Same as a plain target/ECAL/HCAL grab, but reading the ``good_*`` keys.

    :param direc: value passed as ``data`` to ``simple_grab``.
    :return: flattened ``good_target`` values (first column dropped),
             flattened ``sumCal`` of ``good_ECAL``, flattened ``sumCal``
             of ``good_HCAL``.
    '''
    input_names = ['good_ECAL', 'good_HCAL']

    # targets (y): every column after the first, flattened
    raw_targets = simple_grab('Y',
                              data=direc,
                              label_keys='good_target',
                              input_keys=list(input_names))
    targets = raw_targets[:, 1:].ravel()

    # inputs used to compute the values for the regression model
    ecal_cells, hcal_cells = simple_grab('X',
                                         data=direc,
                                         label_keys=list(input_names),
                                         input_keys=list(input_names))

    ecal_total = sumCal(ecal_cells)
    hcal_total = sumCal(hcal_cells)

    return targets, ecal_total.ravel(), hcal_total.ravel()
Example #6
0
def get_roc_points(tpr=None,
                   fpr=None,
                   thresh=None,
                   class_fprs=None,
                   class_labels=None,
                   suppress_warnings=False,
                   verbose=0,
                   **kargs):
    '''Finds the tpr, fpr, and threshold holding one of them constant.

        :param tpr: a list of true positive rates to hold constant (default: none)
        :param fpr: a list of false positive rates to hold constant (default: none)
        :param thresh: a list of thresholds to hold constant (default: none)
        :param class_fprs: a dictionary keyed by class index of false positive rates for each false class to keep constant
        :param class_labels: optional labels indexed by class index, used to name the per-class outputs;
                    when omitted the class index itself is used
        :param suppress_warnings: if True, do not warn when the per-class contaminations cannot be computed
        :param verbose: if > 0, print the error raised when **Y** and **predictions** could not be grabbed up front
        :param *: Any argument available to :py:func:`CMS_Deep_Learning.postprocessing.metrics.get_roc_data` to get **ROC_data**,
                    and by extension any argument available to :py:func:`CMS_Deep_Learning.io.simple_grab` to get **Y**, **predictions**

        :returns: a dictionary with the keys "tpr", "fpr" and "thresh" holding the ROC values at the selected points,
                    plus one "fpr:<label>" entry per class when the per-class contaminations could be computed
        :raises ValueError: if ``class_fprs`` is given but the per-class contaminations cannot be computed
    '''
    # None stands for "nothing requested"; avoids mutable default arguments.
    tpr = [] if tpr is None else tpr
    fpr = [] if fpr is None else fpr
    thresh = [] if thresh is None else thresh
    class_fprs = {} if class_fprs is None else class_fprs

    # --------------------- Grabbing Data -------------------------
    # Best effort: when only ROC_data was supplied there is nothing to grab,
    # so a failure here is tolerated and get_roc_data decides what to do.
    try:
        kargs["Y"], kargs["predictions"] = simple_grab(['Y', 'predictions'],
                                                       **kargs)
    except Exception as e:
        if verbose > 0:
            print(e)
    _fpr, _tpr, _thresh, _auc = get_roc_data(**kargs)

    # ------------------------------------------------------------

    # --------------------Decompose contamination by class---------------------
    # Always bound, so the code below works when nothing can be decomposed.
    separated_conts = {}
    can_decompose = ("Y" in kargs and "predictions" in kargs
                     and "true_class_index" in kargs)
    if can_decompose:
        separated_conts = get_class_fprs(kargs["Y"], kargs["predictions"],
                                         _thresh, kargs["true_class_index"])
    elif not suppress_warnings:
        import warnings
        warnings.warn("Cannot compute CLASS CONTAMINATIONS unless user inputs necessary data " + \
                      "for computing Y and predictions, in addition to true_class_index")

    if class_fprs and not can_decompose:
        raise ValueError(
            "class_fprs requires Y, predictions and true_class_index "
            "so that the per-class false positive rates can be computed")

    # -------------------------------------------------------------------

    # ------------------------Find the closest points-------------------
    def indxClosest(target, lst):
        # index of the first element of lst with the smallest distance to target
        index, _ = min(enumerate(lst), key=lambda x: abs(x[1] - target))
        return index

    indicies = []
    indicies += [indxClosest(y, _fpr) for y in fpr]
    indicies += [indxClosest(y, _tpr) for y in tpr]
    indicies += [indxClosest(y, _thresh) for y in thresh]
    for key, val in class_fprs.items():
        indicies += [indxClosest(y, separated_conts[key]) for y in val]

    out = {
        "tpr": _tpr[indicies],
        "fpr": _fpr[indicies],
        "thresh": _thresh[indicies],
    }
    for j, val in separated_conts.items():
        label = class_labels[j] if class_labels is not None else str(j)
        out["fpr:" + label] = val[indicies]

    # -----------------------------------------------------------------------
    return out
Example #7
0
def bin_metric_vs_char(args=[],
                       nb_bins=20,
                       equalBins=False,
                       plot=False,
                       **kargs):
    '''Bins a model's targets and predictions by an event characteristic and computes statistics for each bin.

        :param args: must be empty; passing several inputs is not implemented.
        :param nb_bins: The number of bins to use in the analysis.
        :param equalBins: True/False, Default False. If True, will try to put an equal number of samples in each bin.
        :param plot: If True plot the bins automatically.
        :type plot: bool
        :param threshold: The threshold for the classifier for the True class. All other classes are collectively the False class.
        :type threshold: float
        :param true_class_index: The index in the output vector corresponding to the True class element. All other classes are collectively the False class.
        :type true_class_index: int
        :param *: Any argument available to :py:func:`CMS_Deep_Learning.io.simple_grab` to get **Y**, **predictions**, **characteristics**

        :returns: A list of dictionaries, one per bin, as produced by ``prediction_statistics`` with the bin edges
                  added under "min_bin_x" and "max_bin_x".
        :raises NotImplementedError: if ``args`` is not empty, or if neither ``characteristics`` nor ``accumulate`` is given.
        :raises ValueError: if several outputs are grabbed, or a multi-column output comes without ``true_class_index``.
            '''
    # NOTE(review): nb_bins, equalBins and plot are accepted but never read
    # in this body - confirm whether distribute_to_bins should receive them.

    if len(args) != 0:
        raise NotImplementedError("Have not written to take multiple inputs")
    inp = kargs

    # Characteristics must come from the caller, directly or via 'accumulate'.
    chars_available = (inp.get('characteristics', None) is not None
                       or inp.get('accumulate', None) is not None)
    if not chars_available:
        raise NotImplementedError(
            "Need to write code for getting characteristics strait from EventChars collection"
        )

    y_vals, predictions, characteristics = simple_grab(
        ['Y', 'predictions', 'characteristics'], **inp)

    if isinstance(y_vals, (list, tuple)):
        raise ValueError(
            "Error multiple outputs is ambiguous, got %r outputs" %
            len(y_vals))

    # -1 is the "not provided" marker for both options.
    true_class_index = inp.get('true_class_index', -1)
    threshold = inp.get('threshold', -1)
    single_column = len(y_vals.shape) == 1 or y_vals.shape[-1] == 1
    if single_column:
        true_class_index = 0
    elif true_class_index == -1:
        raise ValueError("Must provide a true_class_index.")

    split_vals, y_bins, predict_bins = distribute_to_bins(
        characteristics, (y_vals, predictions))

    out_bins = []
    lower_edge = split_vals[0]
    for idx, (bin_preds, bin_targets) in enumerate(zip(predict_bins, y_bins)):
        stats = prediction_statistics(target=bin_targets,
                                      predictions=bin_preds,
                                      true_class_index=true_class_index,
                                      threshold=threshold)
        upper_edge = split_vals[idx + 1]
        stats["min_bin_x"] = lower_edge
        stats["max_bin_x"] = upper_edge
        # the upper edge of this bin is the lower edge of the next one
        lower_edge = upper_edge
        out_bins.append(stats)

    return out_bins
Example #8
0
# Base name shared by the model (.json) and weights (.h5) files; also used
# as the name under which the true/predicted values are saved.
mName = 'fix_chPi_dnn'

test_dir = "/bigdata/shared/LCD2018/ChPiEscan/test/"

# An empty CUDA_VISIBLE_DEVICES exposes no GPU to CUDA-based libraries.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

mod = ('/nfshome/vitoriabp/gpu-4-culture-plate-sm/new_ds_notebooks/' + mName +
       '.json')  # model file
w = ('/nfshome/vitoriabp/gpu-4-culture-plate-sm/new_ds_notebooks/' + mName +
     '.h5')  # weights file

# grab y and predictions together
all_y, all_pred, = simple_grab(['Y', 'predictions'],
                               data=test_dir,
                               label_keys='energy',
                               input_keys=['ECAL', 'HCAL'],
                               model=mod,
                               weights=w)

# Flatten the predictions to one value per sample. The target length is the
# number of rows (shape[0]); indexing the array itself (all_pred[0]) would
# pass a prediction value to reshape instead of a dimension.
all_pred = np.reshape(all_pred, (all_pred.shape[0], ))

saveTruePred(name=mName, true=all_y, pred=all_pred)

histEDif(target=all_y, pred=all_pred)

histRelDif(target=all_y, pred=all_pred)

x, y, means, rMeans, stds, rStds, sizes, res = binning(10, all_y.ravel(),
Example #9
0
# In[177]:

# Feed the per-particle rStds, bins and res values (gamma, ele, pi0, chPi) to
# ultimate_plot. All of these names are defined outside this excerpt.
ultimate_plot(rStds_gamma, rStds_ele, rStds_pi0, rStds_chPi, gamma_bins,
              gamma_res, ele_bins, ele_res, pi0_bins, pi0_res, chPi_bins,
              chPi_res)

# In[48]:

## To get the predictions for all the files!

# grab targets (y) for GammaEscan
test_dir = "/bigdata/shared/LCD/V1/GammaEscan/test/"

y_gamma = simple_grab('Y',
                      data=test_dir,
                      label_keys='target',
                      input_keys=['ECAL', 'HCAL'])
# Drop the first target column and flatten the remainder to 1-D.
# NOTE(review): presumably the remaining column is the energy - confirm.
y_gamma = y_gamma[:, 1:]
y_gamma = y_gamma.ravel()

# In[49]:

# grab values to input in the linear regression for GammaEscan
ecal, hcal = simple_grab('X',
                         data=test_dir,
                         label_keys=['ECAL', 'HCAL'],
                         input_keys=['ECAL', 'HCAL'])

# Reduce each calorimeter input with sumCal (defined outside this excerpt).
s_ecal = sumCal(ecal)
s_hcal = sumCal(hcal)