def preSum(train_dir, particle=""):
    '''
    Pre-training visualization helper: naive ECAL + HCAL shower-deposit sum.

    Reads the targets from ``train_dir``, computes the summed calorimeter
    deposits with ``inpSum`` and passes both arrays to ``saveSum_toHDF5``
    before returning them.

    :type train_dir: str.
    :parameter train_dir: path to the training directory with HDF5 files.
    :type particle: str.
    :parameter particle: name of the particle (forwarded to ``saveSum_toHDF5``).
    :return: energy targets and energy sum arrays.
    :rtype: numpy.ndarray, numpy.ndarray; both flattened to 1-D
    '''
    targets = simple_grab('Y', data=train_dir, label_keys='target',
                          input_keys=['ECAL', 'HCAL'])
    # keep every target column except the first one, then flatten
    all_y = targets[:, 1:].ravel()

    # naive ECAL + HCAL sum for the same directory
    inSum = inpSum(train_dir)

    # persist both arrays to HDF5 before handing them back
    saveSum_toHDF5(particle, all_y, inSum)
    return all_y, inSum
def get_roc_data(**kargs):
    '''Get ROC curve **fpr**, **tpr**, **thresholds**, and **auc** from labels and predictions.

    When ``ROC_data`` is supplied it is unpacked and returned unchanged and
    nothing is grabbed. Otherwise **Y** and **predictions** are fetched with
    :py:func:`CMS_Deep_Learning.io.simple_grab` and the curve is computed
    with ``sklearn.metrics.roc_curve`` / ``sklearn.metrics.auc``.

    :param ROC_data: optional tuple (fpr, tpr, thres, roc_auc) containing the
        roc parametrization and the auc
    :param true_class_index: index of the true-class column; required when
        the labels have more than one column
    :param *: Any argument available to :py:func:`CMS_Deep_Learning.io.simple_grab`
        to get **Y**, **predictions**
    :returns: the tuple (fpr, tpr, thres, roc_auc)
    :raises ValueError: if labels and predictions differ in shape, or if the
        labels have several columns and no ``true_class_index`` was given
    '''
    if "ROC_data" in kargs:
        fpr, tpr, thres, roc_auc = kargs["ROC_data"]
        return fpr, tpr, thres, roc_auc

    # Imported lazily so the ROC_data shortcut works without scikit-learn.
    from sklearn.metrics import roc_curve, auc

    labels, predictions = simple_grab(['Y', 'predictions'], **kargs)
    true_class_index = kargs.get("true_class_index", None)

    # Explicit raise instead of assert: the check must survive `python -O`.
    if labels.shape != predictions.shape:
        raise ValueError(
            "labels and predictions should have the same shape, %r != %r"
            % (labels.shape, predictions.shape))

    if len(labels.shape) > 1 and labels.shape[1] > 1:
        if true_class_index is None:
            raise ValueError(
                "must designate index of true class for data of shape %r"
                % (list(labels.shape),))
        # Reduce the multi-column problem to "true class vs. the rest".
        labels = labels[:, true_class_index].ravel()
        predictions = predictions[:, true_class_index].ravel()

    fpr, tpr, thres = roc_curve(labels, predictions)
    roc_auc = auc(fpr, tpr)
    return fpr, tpr, thres, roc_auc
def inpSum(dir):
    '''
    Naive sum of the shower deposits in the ECAL and HCAL.

    Both calorimeter inputs are grabbed from ``dir``, reduced with
    ``sumCal``, flattened and added element-wise.

    :type dir: str.
    :param dir: path to the directory with HDF5 files.
    :return: sum of the ECAL and HCAL sums.
    :rtype: numpy.ndarray, flattened to 1-D
    '''
    calo_inputs = simple_grab('X', data=dir,
                              label_keys=['ECAL', 'HCAL'],
                              input_keys=['ECAL', 'HCAL'])
    ecal, hcal = calo_inputs

    # per-calorimeter sums, flattened so they can be added element-wise
    ecal_total = sumCal(ecal).ravel()
    hcal_total = sumCal(hcal).ravel()
    return ecal_total + hcal_total
def grab(direc):
    '''
    Grab the targets and the per-calorimeter sums from ``direc``.

    Uses the ``'target'`` label key and the ``'ECAL'`` / ``'HCAL'`` input keys.

    :param direc: data location forwarded to ``simple_grab`` as ``data``.
    :return: tuple ``(y, ecal_sum, hcal_sum)``, each flattened with ``ravel``.
    '''
    calo_keys = ['ECAL', 'HCAL']

    targets = simple_grab('Y', data=direc, label_keys='target',
                          input_keys=['ECAL', 'HCAL'])
    # keep every target column except the first one, then flatten
    y_flat = targets[:, 1:].ravel()

    ecal, hcal = simple_grab('X', data=direc, label_keys=calo_keys,
                             input_keys=['ECAL', 'HCAL'])
    ecal_sum = sumCal(ecal)
    hcal_sum = sumCal(hcal)
    return y_flat, ecal_sum.ravel(), hcal_sum.ravel()
def grab_ch(direc):
    '''
    Grab the targets and the per-calorimeter sums from ``direc`` using the
    ``good_*`` dataset keys.

    Same flow as a plain ECAL/HCAL grab, but reads ``'good_target'``,
    ``'good_ECAL'`` and ``'good_HCAL'``.

    :param direc: data location forwarded to ``simple_grab`` as ``data``.
    :return: tuple ``(y, ecal_sum, hcal_sum)``, each flattened with ``ravel``.
    '''
    good_keys = ['good_ECAL', 'good_HCAL']

    # targets: drop the first column, then flatten
    targets = simple_grab('Y', data=direc, label_keys='good_target',
                          input_keys=['good_ECAL', 'good_HCAL'])
    y_flat = targets[:, 1:].ravel()

    # inputs for the regression model: one summed value set per calorimeter
    ecal, hcal = simple_grab('X', data=direc, label_keys=good_keys,
                             input_keys=['good_ECAL', 'good_HCAL'])
    ecal_sum = sumCal(ecal)
    hcal_sum = sumCal(hcal)
    return y_flat, ecal_sum.ravel(), hcal_sum.ravel()
def get_roc_points(tpr=[], fpr=[], thresh=[], class_fprs={}, class_labels=None,
                   suppress_warnings=False, verbose=0, **kargs):
    '''Finds the tpr, fpr, and threshold holding one of them constant.

    For every requested value the closest point on the ROC curve is located
    and the curve is sampled at those points.

    :param tpr: a list of true positive rates to hold constant
    :param fpr: a list of false positive rates to hold constant
    :param thresh: a list of thresholds to hold constant
    :param class_fprs: a dictionary keyed by class index of false positive rates
        for each false class to keep constant
    :param class_labels: optional labels, indexed by class index, used to name
        the per-class output entries; ``str(index)`` is used when omitted
    :param suppress_warnings: if True, do not warn when the per-class
        contaminations cannot be computed
    :param verbose: if > 0, print the exception raised while trying to grab
        **Y** and **predictions**
    :param *: Any argument available to
        :py:func:`CMS_Deep_Learning.postprocessing.metrics.get_roc_data` to get
        **ROC_data**, and by extension any argument available to
        :py:func:`CMS_Deep_Learning.io.simple_grab` to get **Y**, **predictions**
    :returns: a dict with the keys ``"tpr"``, ``"fpr"`` and ``"thresh"`` holding
        the ROC values at the selected points, plus one ``"fpr:<label>"`` entry
        per class when the per-class contaminations could be computed
    :raises ValueError: if ``class_fprs`` is given but the per-class
        contaminations cannot be computed
    '''
    # NOTE: the list/dict defaults are only read, never mutated, so sharing
    # them between calls is harmless; they are kept for interface stability.

    # --------------------- Grabbing Data -------------------------
    try:
        kargs["Y"], kargs["predictions"] = simple_grab(['Y', 'predictions'], **kargs)
    except Exception as e:
        # Best effort: the caller may have passed ROC_data only, in which
        # case there is nothing to grab and get_roc_data handles it below.
        if verbose > 0:
            print(e)
    _fpr, _tpr, _thresh, _auc = get_roc_data(**kargs)

    # ------------ Decompose contamination by class ----------------
    # Default to "no per-class data" so the code below never touches an
    # unbound name when the contaminations cannot be computed.
    separated_conts = {}
    have_class_data = ("Y" in kargs and "predictions" in kargs
                       and "true_class_index" in kargs)
    if have_class_data:
        separated_conts = get_class_fprs(kargs["Y"], kargs["predictions"],
                                         _thresh, kargs["true_class_index"])
    elif not suppress_warnings:
        import warnings
        warnings.warn("Cannot compute CLASS CONTAMINATIONS unless user inputs necessary data " +
                      "for computing Y and predictions, in addition to true_class_index")

    if class_fprs and not have_class_data:
        raise ValueError(
            "class_fprs was given, but per-class false positive rates cannot be "
            "computed without Y, predictions and true_class_index")

    # ------------------ Find the closest points -------------------
    def indxClosest(target, lst):
        # index of the element of lst with the smallest distance to target
        index, _ = min(enumerate(lst), key=lambda x: abs(x[1] - target))
        return index

    indicies = []
    indicies += [indxClosest(y, _fpr) for y in fpr]
    indicies += [indxClosest(y, _tpr) for y in tpr]
    indicies += [indxClosest(y, _thresh) for y in thresh]
    for key, val in class_fprs.items():
        indicies += [indxClosest(y, separated_conts[key]) for y in val]

    out = {"tpr": _tpr[indicies],
           "fpr": _fpr[indicies],
           "thresh": _thresh[indicies]}
    for j, val in separated_conts.items():
        # `is not None` keeps this well defined for array-like class_labels
        label = class_labels[j] if class_labels is not None else str(j)
        out["fpr:" + label] = val[indicies]
    return out
def bin_metric_vs_char(args=[], nb_bins=20, equalBins=False, plot=False, **kargs):
    '''Bins a model's predictions by an event characteristic and computes
    per-bin statistics.

    **Y**, **predictions** and **characteristics** are grabbed, distributed
    into bins with ``distribute_to_bins`` and summarised per bin with
    ``prediction_statistics``; each resulting dictionary is tagged with the
    bin edges under ``"min_bin_x"`` and ``"max_bin_x"``.

    NOTE(review): ``nb_bins``, ``equalBins`` and ``plot`` are accepted but not
    used anywhere in this function body -- confirm whether they should be
    forwarded to ``distribute_to_bins``.

    :param args: must be empty; multiple inputs are not implemented
    :param nb_bins: The number of bins to use in the analysis.
    :param equalBins: True/False, Default False. If True, will try to put an
        equal number of samples in each bin.
    :param plot: If True plot the bins automatically.
    :type plot: bool
    :param threshold: The threshold for the classifier for the True class.
        All other classes are collectively the False class. Defaults to -1.
    :type threshold: float
    :param true_class_index: The index in the output vector corresponding to
        the True class element. Forced to 0 when **Y** is one-dimensional or
        has a trailing dimension of 1.
    :type true_class_index: int
    :param *: Any argument available to :py:func:`CMS_Deep_Learning.io.simple_grab`
        to get **Y**, **predictions**, **characteristics**
    :returns: A list of dictionaries each containing information about a bin.
        The output of this can be plotted with
        CMS_Deep_Learning.postprocessing.plot.plot_bins
    '''
    if len(args) != 0:
        raise NotImplementedError("Have not written to take multiple inputs")
    inp = kargs

    # Without either 'characteristics' or 'accumulate' there is nothing to bin by.
    if inp.get('characteristics', None) is None and inp.get('accumulate', None) is None:
        raise NotImplementedError(
            "Need to write code for getting characteristics strait from EventChars collection"
        )
    y_vals, predictions, characteristics = simple_grab(
        ['Y', 'predictions', 'characteristics'], **inp)

    if isinstance(y_vals, (list, tuple)):
        raise ValueError(
            "Error multiple outputs is ambiguous, got %r outputs" % len(y_vals))

    true_class_index = inp.get('true_class_index', -1)
    threshold = inp.get('threshold', -1)
    single_output = len(y_vals.shape) == 1 or y_vals.shape[-1] == 1
    if single_output:
        true_class_index = 0
    elif true_class_index == -1:
        raise ValueError("Must provide a true_class_index.")

    split_vals, y_bins, predict_bins = distribute_to_bins(
        characteristics, (y_vals, predictions))

    out_bins = []
    lower_edge = split_vals[0]
    for bin_no, (bin_preds, bin_targets) in enumerate(zip(predict_bins, y_bins)):
        stats = prediction_statistics(target=bin_targets,
                                      predictions=bin_preds,
                                      true_class_index=true_class_index,
                                      threshold=threshold)
        stats["min_bin_x"] = lower_edge
        upper_edge = split_vals[bin_no + 1]
        stats["max_bin_x"] = upper_edge
        # this bin's upper edge is the next bin's lower edge
        lower_edge = upper_edge
        out_bins.append(stats)
    return out_bins
mName = 'fix_chPi_dnn' test_dir = "/bigdata/shared/LCD2018/ChPiEscan/test/" os.environ['CUDA_VISIBLE_DEVICES'] = '' mod = ('/nfshome/vitoriabp/gpu-4-culture-plate-sm/new_ds_notebooks/' + mName + '.json') # model file w = ('/nfshome/vitoriabp/gpu-4-culture-plate-sm/new_ds_notebooks/' + mName + '.h5') # weights file # grab y and predictions together all_y, all_pred, = simple_grab(['Y', 'predictions'], data=test_dir, label_keys='energy', input_keys=['ECAL', 'HCAL'], model=mod, weights=w) all_pred = np.reshape(all_pred, (all_pred[0], )) #print(all_y.shape) #print(all_pred.shape) saveTruePred(name=mName, true=all_y, pred=all_pred) histEDif(target=all_y, pred=all_pred) histRelDif(target=all_y, pred=all_pred) x, y, means, rMeans, stds, rStds, sizes, res = binning(10, all_y.ravel(),
# In[177]:

# Combined plot over the four particle samples (gamma, ele, pi0, chPi).
ultimate_plot(
    rStds_gamma, rStds_ele, rStds_pi0, rStds_chPi,
    gamma_bins, gamma_res,
    ele_bins, ele_res,
    pi0_bins, pi0_res,
    chPi_bins, chPi_res,
)


# In[48]:

## To get the predictions for all the files!
# GammaEscan targets (y): drop the first column, then flatten
test_dir = "/bigdata/shared/LCD/V1/GammaEscan/test/"
y_gamma = simple_grab('Y', data=test_dir, label_keys='target',
                      input_keys=['ECAL', 'HCAL'])
y_gamma = y_gamma[:, 1:].ravel()


# In[49]:

# GammaEscan inputs for the linear regression: per-calorimeter sums
ecal, hcal = simple_grab('X', data=test_dir,
                         label_keys=['ECAL', 'HCAL'],
                         input_keys=['ECAL', 'HCAL'])
s_ecal = sumCal(ecal)
s_hcal = sumCal(hcal)