import h5py
import awkward0


def test_read_write_hdf(tmpdir, input_arr):
    """Round-trip a JaggedArray through an HDF5 file (``input_arr`` is a pytest fixture)."""
    tmp_file = tmpdir / "example.h5"

    # Write
    with h5py.File(str(tmp_file), "w") as hf:
        a = awkward0.JaggedArray.fromiter(input_arr)
        ah5 = awkward0.hdf5(hf)
        ah5["example"] = a

    # Read
    with h5py.File(str(tmp_file), "r") as hf:
        ah5 = awkward0.hdf5(hf)
        b = ah5["example"]

    assert a.tolist() == b.tolist()
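
# A minimal sketch of the ``input_arr`` fixture the test above assumes. The
# fixture itself is not shown in this snippet, so the example data here is
# hypothetical; any jagged (variable-length) nested list would do.
import pytest


@pytest.fixture
def input_arr():
    # Variable-length sublists exercise the jagged structure of JaggedArray.
    return [[1.0, 2.0, 3.0], [], [4.5], [6.0, 7.0]]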


## h5py, awkward (0.x), Timer, VertexInfo, and concatenate are assumed to be
## imported/defined at module level in the file this snippet comes from.
def collect_truth(*files, pvs=True):
    """
    This function collects the truth information from files as
    awkward arrays (JaggedArrays). Give it the same files as collect_data.

    pvs: Collect PVs or SVs (default True: PVs)
    """

    x_list = []
    y_list = []
    z_list = []
    n_list = []
    c_list = []

    p = "p" if pvs else "s"

    for XY_file in files:
        msg = f"Loaded {XY_file} in {{time:.4}} s"
        with Timer(msg), h5py.File(XY_file, mode="r") as XY:
            afile = awkward.hdf5(XY)
            x_list.append(afile[f"{p}v_loc_x"])
            y_list.append(afile[f"{p}v_loc_y"])
            z_list.append(afile[f"{p}v_loc"])
            n_list.append(afile[f"{p}v_ntracks"])
            c_list.append(afile[f"{p}v_cat"])

    return VertexInfo(
        concatenate(x_list),
        concatenate(y_list),
        concatenate(z_list),
        concatenate(n_list),
        concatenate(c_list),
    )
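
## Usage sketch for collect_truth. The file names are hypothetical, and the
## unpacking below assumes VertexInfo is a namedtuple-like container with the
## five columns in the order they are constructed above:
pv = collect_truth("a.h5", "b.h5", pvs=True)   # PV truth
sv = collect_truth("a.h5", "b.h5", pvs=False)  # SV truth
x, y, z, n, cat = pv  # one jagged array per column, over all events in all files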


def save_data_hdf5(hf, od, filelist=None, compression="lzf"):
    dset = hf.create_dataset("kernel", data=od.X, compression=compression)
    if filelist:
        dset.attrs["files"] = np.string_(",".join(str(s.stem) for s in filelist))
    hf.create_dataset("pv", data=od.Y[0], compression=compression)
    hf.create_dataset("sv", data=od.Y[2], compression=compression)
    hf.create_dataset("pv_other", data=od.Y[1], compression=compression)
    hf.create_dataset("sv_other", data=od.Y[3], compression=compression)
    hf.create_dataset("Xmax", data=od.Xmax, compression=compression)
    hf.create_dataset("Ymax", data=od.Ymax, compression=compression)

    akdh5 = awkward.hdf5(hf)
    akdh5["pv_loc_x"] = od.pv_loc_x
    akdh5["pv_loc_y"] = od.pv_loc_y
    akdh5["pv_loc"] = od.pv_loc
    akdh5["pv_ntracks"] = od.pv_ntracks
    akdh5["pv_cat"] = od.pv_cat
    akdh5["sv_loc_x"] = od.sv_loc_x
    akdh5["sv_loc_y"] = od.sv_loc_y
    akdh5["sv_loc"] = od.sv_loc
    akdh5["sv_ntracks"] = od.sv_ntracks
    akdh5["sv_cat"] = od.sv_cat

    return dset


def save_data_hdf5(hf, od, filelist=None, compression="lzf"):
    """Extended version: also writes the poca KDEs, the reconstructed track
    parameters, and the poca ellipsoid axes."""
    dset = hf.create_dataset("kernel", data=od.X, compression=compression)
    if filelist:
        dset.attrs["files"] = np.string_(",".join(str(s.stem) for s in filelist))
    hf.create_dataset("pv", data=od.Y[0], compression=compression)
    hf.create_dataset("sv", data=od.Y[2], compression=compression)
    hf.create_dataset("pv_other", data=od.Y[1], compression=compression)
    hf.create_dataset("sv_other", data=od.Y[3], compression=compression)
    hf.create_dataset("Xmax", data=od.Xmax, compression=compression)
    hf.create_dataset("Ymax", data=od.Ymax, compression=compression)

    ## added 200922
    hf.create_dataset("poca_KDE_A", data=od.poca_KDE_A, compression=compression)
    hf.create_dataset("poca_KDE_A_xMax", data=od.poca_KDE_A_xMax, compression=compression)
    hf.create_dataset("poca_KDE_A_yMax", data=od.poca_KDE_A_yMax, compression=compression)
    hf.create_dataset("poca_KDE_B", data=od.poca_KDE_B, compression=compression)
    hf.create_dataset("poca_KDE_B_xMax", data=od.poca_KDE_B_xMax, compression=compression)
    hf.create_dataset("poca_KDE_B_yMax", data=od.poca_KDE_B_yMax, compression=compression)

    ## The per-event, variable-length (jagged) arrays go through awkward's
    ## HDF5 wrapper rather than plain create_dataset.
    akdh5 = awkward.hdf5(hf)
    akdh5["pv_loc_x"] = od.pv_loc_x
    akdh5["pv_loc_y"] = od.pv_loc_y
    akdh5["pv_loc"] = od.pv_loc
    akdh5["pv_ntracks"] = od.pv_ntracks
    akdh5["pv_cat"] = od.pv_cat
    akdh5["sv_loc_x"] = od.sv_loc_x
    akdh5["sv_loc_y"] = od.sv_loc_y
    akdh5["sv_loc"] = od.sv_loc
    akdh5["sv_ntracks"] = od.sv_ntracks
    akdh5["sv_cat"] = od.sv_cat
    akdh5["recon_x"] = od.recon_x
    akdh5["recon_y"] = od.recon_y
    akdh5["recon_z"] = od.recon_z
    akdh5["recon_tx"] = od.recon_tx
    akdh5["recon_ty"] = od.recon_ty
    ## mds akdh5["recon_pocax"] = od.recon_pocax
    ## mds akdh5["recon_pocay"] = od.recon_pocay
    ## mds akdh5["recon_pocaz"] = od.recon_pocaz
    ## mds akdh5["recon_sigmapocaxy"] = od.recon_sigmapocaxy

    ## added 200922
    akdh5["poca_x"] = od.poca_x
    akdh5["poca_y"] = od.poca_y
    akdh5["poca_z"] = od.poca_z
    akdh5["major_axis_x"] = od.major_axis_x
    akdh5["major_axis_y"] = od.major_axis_y
    akdh5["major_axis_z"] = od.major_axis_z
    akdh5["minor_axis1_x"] = od.minor_axis1_x
    akdh5["minor_axis1_y"] = od.minor_axis1_y
    akdh5["minor_axis1_z"] = od.minor_axis1_z
    akdh5["minor_axis2_x"] = od.minor_axis2_x
    akdh5["minor_axis2_y"] = od.minor_axis2_y
    akdh5["minor_axis2_z"] = od.minor_axis2_z

    return dset
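
## Round-trip sketch for save_data_hdf5. Here ``od`` stands for an object
## exposing the attributes written above; its class is not shown in this
## snippet, so treat it as an assumption:
with h5py.File("out.h5", "w") as hf:
    save_data_hdf5(hf, od, filelist=None, compression="lzf")

with h5py.File("out.h5", "r") as hf:
    kernel = np.asarray(hf["kernel"])  # dense datasets read back directly
    afile = awkward.hdf5(hf)
    pv_loc_x = afile["pv_loc_x"]       # jagged arrays read via the wrapper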


## ``Timer``, ``ja`` (array-module alias), ``dtype_X``, ``dtype_Y``,
## ``six_ellipsoid_parameters``, ``torch``, and ``TensorDataset`` are assumed
## to be defined/imported at module level. Note that the ``dtype`` argument
## is not referenced in the body; ``dtype_X``/``dtype_Y`` are used instead.
def collect_t2kde_data(
    *files,
    batch_size=1,
    dtype=np.float32,
    device=None,
    slice=None,
    **kargs,
):
    """
    This function collects data. It does not split it up. You can pass in
    multiple files. Example: collect_t2kde_data('a.h5', 'b.h5')

    batch_size: The number of events per batch
    dtype: Select a different dtype (like float16)
    slice: Allow just a slice of data to be loaded
    device: The device to load onto (CPU by default)
    **kargs: Any other keyword arguments will be passed on to torch's DataLoader
    """

    ## These unit vectors will be used to convert the elements of
    ## the ellipsoid major and minor axis vectors into vectors.
    xhat = np.array([1, 0, 0])
    yhat = np.array([0, 1, 0])
    zhat = np.array([0, 0, 1])

    Xlist = []
    Ylist = []

    print("Loading data...")

    for XY_file in files:
        msg = f"Loaded {XY_file} in {{time:.4}} s"
        with Timer(msg), h5py.File(XY_file, mode="r") as f:
            ## [:,np.newaxis,:] makes X (a x b) --> (a x 1 x b) (axis 0, axis 1, axis 2)
            ## a is *probably* 4000 and b is *probably* N, but it could be the
            ## other way around; check with .shape

            ## Here we read in the KDE itself plus the values of x and y where
            ## the KDE is maximal for each bin of z. It appears that in the test
            ## file the original KDE values .AND. the values of Xmax and Ymax
            ## have been divided by 2500. This should have been done only for
            ## the KDE values, so Xmax and Ymax are re-scaled to better use the
            ## dynamic range available using np.float16.

            ## mds 200729 The KDE targets have many zeros. Learning zeros using
            ## mds a ratio of predicted to target means that overestimating by
            ## mds a small amount in the cost function is difficult, even with
            ## mds an epsilon-like parameter there. Let's explicitly add epsilon
            ## mds here. We might be able to do it equally well in the cost
            ## mds function, but doing it here makes plotting easy as well.
            epsilon = 0.001
            ## mds 201019 kernel = np.asarray(f["kernel"]) + epsilon
            ## we want to use the poca KDE, not the original kernel
            kernel = np.asarray(f["poca_KDE_B"]) + epsilon
            Xmax = 2500. * np.asarray(f["poca_KDE_B_xMax"])
            Ymax = 2500. * np.asarray(f["poca_KDE_B_yMax"])
            Y = ja.concatenate((kernel, Xmax, Ymax), axis=1).astype(dtype_Y)

            ## Now build the feature set from the relevant tracks' parameters.
            ## We need to use "afile" to account for the variable-length
            ## structure of the awkward arrays.
            ## 201018 use poca ellipsoid parameters rather than "track parameters"
            afile = awkward.hdf5(f)

            pocaz = np.asarray(0.001 * afile["poca_z"].astype(dtype_Y))
            pocax = np.asarray(afile["poca_x"].astype(dtype_Y))
            pocay = np.asarray(afile["poca_y"].astype(dtype_Y))
            pocaMx = np.asarray(afile["major_axis_x"].astype(dtype_Y))
            print("pocaMx.shape = ", pocaMx.shape)
            pocaMy = np.asarray(afile["major_axis_y"].astype(dtype_Y))
            pocaMz = np.asarray(afile["major_axis_z"].astype(dtype_Y))

            nEvts = len(pocaz)
            print("nEvts = ", nEvts)
            for iEvt in range(5):
                print(f"len(pocaMx[{iEvt}]) = ", len(pocaMx[iEvt]))

            Mx = np.multiply(pocaMx.reshape(nEvts, 1), xhat)
            My = np.multiply(pocaMy.reshape(nEvts, 1), yhat)
            Mz = np.multiply(pocaMz.reshape(nEvts, 1), zhat)
            majorAxis = Mx + My + Mz
            print("majorAxis.shape = ", majorAxis.shape)

            poca_m1x = np.asarray(afile["minor_axis1_x"].astype(dtype_Y))
            poca_m1y = np.asarray(afile["minor_axis1_y"].astype(dtype_Y))
            poca_m1z = np.asarray(afile["minor_axis1_z"].astype(dtype_Y))
            mx = np.multiply(poca_m1x.reshape(nEvts, 1), xhat)
            my = np.multiply(poca_m1y.reshape(nEvts, 1), yhat)
            mz = np.multiply(poca_m1z.reshape(nEvts, 1), zhat)
            minorAxis_1 = mx + my + mz
            print("minorAxis_1.shape = ", minorAxis_1.shape)

            poca_m2x = np.asarray(afile["minor_axis2_x"].astype(dtype_Y))
            poca_m2y = np.asarray(afile["minor_axis2_y"].astype(dtype_Y))
            poca_m2z = np.asarray(afile["minor_axis2_z"].astype(dtype_Y))
            mx = np.multiply(poca_m2x.reshape(nEvts, 1), xhat)
            my = np.multiply(poca_m2y.reshape(nEvts, 1), yhat)
            mz = np.multiply(poca_m2z.reshape(nEvts, 1), zhat)
            minorAxis_2 = mx + my + mz
            print("minorAxis_2.shape = ", minorAxis_2.shape)

            A, B, C, D, E, F = six_ellipsoid_parameters(
                majorAxis, minorAxis_1, minorAxis_2)
            print("A.shape = ", A.shape)

            for iTrk in range(3):
                print("majorAxis[iTrk][0][0] = ", majorAxis[iTrk][0][0])
                print("majorAxis[iTrk][1][0] = ", majorAxis[iTrk][1][0])
                print("majorAxis[iTrk][2][0] = ", majorAxis[iTrk][2][0])
                print("minorAxis_1[iTrk][0][0] = ", minorAxis_1[iTrk][0][0])
                print("minorAxis_1[iTrk][1][0] = ", minorAxis_1[iTrk][1][0])
                print("minorAxis_1[iTrk][2][0] = ", minorAxis_1[iTrk][2][0])
                print("minorAxis_2[iTrk][0][0] = ", minorAxis_2[iTrk][0][0])
                print("minorAxis_2[iTrk][1][0] = ", minorAxis_2[iTrk][1][0])
                print("minorAxis_2[iTrk][2][0] = ", minorAxis_2[iTrk][2][0])
                print(" ")
                ## mdsAA print("A[iTrk][0] = ", A[iTrk][0])
                ## mdsAA print("B[iTrk][0] = ", B[iTrk][0])
                ## mdsAA print("C[iTrk][0] = ", C[iTrk][0])
                ## mdsAA print("D[iTrk][0] = ", D[iTrk][0])
                ## mdsAA print("E[iTrk][0] = ", E[iTrk][0])
                ## mdsAA print("F[iTrk][0] = ", F[iTrk][0])
                ## mds print("majorAxis[iTrk][0] = ", majorAxis[iTrk][0])
                ## mds print("majorAxis[iTrk][1] = ", majorAxis[iTrk][1])
                ## mds print("majorAxis[iTrk][2] = ", majorAxis[iTrk][2])

            ## mark non-track data with -99 as a flag
            maxLen = 600  ## for safety: 600 >> 481, which is what was seen for 100 evts
            padded_pocaz = np.zeros((nEvts, maxLen)) - 99.
            padded_pocax = np.zeros((nEvts, maxLen)) - 99.
            padded_pocay = np.zeros((nEvts, maxLen)) - 99.
            padded_pocaA = np.zeros((nEvts, maxLen)) - 99.
            padded_pocaB = np.zeros((nEvts, maxLen)) - 99.
            padded_pocaC = np.zeros((nEvts, maxLen)) - 99.
            padded_pocaD = np.zeros((nEvts, maxLen)) - 99.
            padded_pocaE = np.zeros((nEvts, maxLen)) - 99.
            padded_pocaF = np.zeros((nEvts, maxLen)) - 99.

            for i, e in enumerate(pocaz):
                fillingLength = min(len(e), maxLen)
                padded_pocaz[i, :fillingLength] = pocaz[i][:fillingLength].astype(dtype_Y)
                padded_pocax[i, :fillingLength] = pocax[i][:fillingLength].astype(dtype_Y)
                padded_pocay[i, :fillingLength] = pocay[i][:fillingLength].astype(dtype_Y)
                padded_pocaA[i, :fillingLength] = A[i][:fillingLength].astype(dtype_Y)
                padded_pocaB[i, :fillingLength] = B[i][:fillingLength].astype(dtype_Y)
                padded_pocaC[i, :fillingLength] = C[i][:fillingLength].astype(dtype_Y)
                padded_pocaD[i, :fillingLength] = D[i][:fillingLength].astype(dtype_Y)
                padded_pocaE[i, :fillingLength] = E[i][:fillingLength].astype(dtype_Y)
                padded_pocaF[i, :fillingLength] = F[i][:fillingLength].astype(dtype_Y)

            padded_pocaz = padded_pocaz[:, np.newaxis, :]
            padded_pocax = padded_pocax[:, np.newaxis, :]
            padded_pocay = padded_pocay[:, np.newaxis, :]
            padded_pocaA = padded_pocaA[:, np.newaxis, :]
            padded_pocaB = padded_pocaB[:, np.newaxis, :]
            padded_pocaC = padded_pocaC[:, np.newaxis, :]
            padded_pocaD = padded_pocaD[:, np.newaxis, :]
            padded_pocaE = padded_pocaE[:, np.newaxis, :]
            padded_pocaF = padded_pocaF[:, np.newaxis, :]

            X = ja.concatenate(
                (padded_pocaz, padded_pocax, padded_pocay,
                 padded_pocaA, padded_pocaB, padded_pocaC,
                 padded_pocaD, padded_pocaE, padded_pocaF),
                axis=1,
            ).astype(dtype_X)

            ## mds print("X = ", X)
            print("len(X) = ", len(X))
            Xlist.append(X)
            Ylist.append(Y)

    print("len(Xlist) = ", len(Xlist))
    X = np.concatenate(Xlist, axis=0)
    Y = np.concatenate(Ylist, axis=0)
    print("outer loop X.shape = ", X.shape)

    if slice:
        X = X[slice, :]
        Y = Y[slice, :]

    with Timer(start=f"Constructing {X.shape[0]} event dataset"):
        x_t = torch.tensor(X)
        y_t = torch.tensor(Y)

        if device is not None:
            x_t = x_t.to(device)
            y_t = y_t.to(device)

        dataset = TensorDataset(x_t, y_t)

    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, **kargs)

    print("x_t.shape = ", x_t.shape)
    print("x_t.shape[0] = ", x_t.shape[0])
    print("x_t.shape[1] = ", x_t.shape[1])
    nFeatures = 6
    x_t.view(x_t.shape[0], nFeatures, -1)
    print("x_t.shape = ", x_t.shape)

    return loader
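
## Usage sketch for collect_t2kde_data. The file names are hypothetical, and
## any extra keyword arguments (e.g. shuffle) are forwarded to torch's
## DataLoader via **kargs:
loader = collect_t2kde_data("a.h5", "b.h5", batch_size=32, shuffle=True)
for x_batch, y_batch in loader:
    ## x_batch: (batch, 9, 600) padded poca-ellipsoid features; -99 marks padding
    ## y_batch: poca KDE target plus the re-scaled Xmax/Ymax values
    break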


## Variant of collect_t2kde_data that builds the feature set from the
## reconstructed track parameters instead of the poca ellipsoids.
def collect_t2kde_data(
    *files,
    batch_size=1,
    dtype=np.float32,
    device=None,
    slice=None,
    **kargs,
):
    """
    This function collects data. It does not split it up. You can pass in
    multiple files. Example: collect_t2kde_data('a.h5', 'b.h5')

    batch_size: The number of events per batch
    dtype: Select a different dtype (like float16)
    slice: Allow just a slice of data to be loaded
    device: The device to load onto (CPU by default)
    **kargs: Any other keyword arguments will be passed on to torch's DataLoader
    """

    Xlist = []
    Ylist = []

    print("Loading data...")

    for XY_file in files:
        msg = f"Loaded {XY_file} in {{time:.4}} s"
        with Timer(msg), h5py.File(XY_file, mode="r") as f:
            ## [:,np.newaxis,:] makes X (a x b) --> (a x 1 x b) (axis 0, axis 1, axis 2)
            ## a is *probably* 4000 and b is *probably* N, but it could be the
            ## other way around; check with .shape

            ## Here we read in the KDE itself plus the values of x and y where
            ## the KDE is maximal for each bin of z. It appears that in the test
            ## file the original KDE values .AND. the values of Xmax and Ymax
            ## have been divided by 2500. This should have been done only for
            ## the KDE values, so Xmax and Ymax are re-scaled to better use the
            ## dynamic range available using np.float16.
            kernel = np.asarray(f["kernel"])
            Xmax = 2500. * np.asarray(f["Xmax"])
            Ymax = 2500. * np.asarray(f["Ymax"])
            Y = ja.concatenate((kernel, Xmax, Ymax), axis=1).astype(dtype_Y)

            ## Now build the feature set from the relevant tracks' parameters.
            ## We need to use "afile" to account for the variable-length
            ## structure of the awkward arrays.
            afile = awkward.hdf5(f)

            pocaz = np.asarray(0.001 * afile["recon_pocaz"].astype(dtype_Y))
            pocax = np.asarray(afile["recon_pocax"].astype(dtype_Y))
            pocay = np.asarray(afile["recon_pocay"].astype(dtype_Y))
            pocaTx = np.asarray(afile["recon_tx"].astype(dtype_Y))
            pocaTy = np.asarray(afile["recon_ty"].astype(dtype_Y))
            pocaSigmapocaxy = np.asarray(afile["recon_sigmapocaxy"].astype(dtype_Y))

            nEvts = len(pocaz)
            ## mds for testing only for i in range(nEvts-1):
            ## mds for testing only     maxLen = max(maxLen, len(pocaz[i]))
            ## mds for testing only print("maxLen = ", maxLen)

            ## mark non-track data with -99 as a flag
            maxLen = 600  ## for safety: 600 >> 481, which is what was seen for 100 evts
            padded_pocaz = np.zeros((nEvts, maxLen)) - 99.
            padded_pocax = np.zeros((nEvts, maxLen)) - 99.
            padded_pocay = np.zeros((nEvts, maxLen)) - 99.
            padded_tx = np.zeros((nEvts, maxLen)) - 99.
            padded_ty = np.zeros((nEvts, maxLen)) - 99.
            padded_sigma = np.zeros((nEvts, maxLen)) - 99.
            for i, e in enumerate(pocaz):
                fillingLength = min(len(e), maxLen)
                padded_pocaz[i, :fillingLength] = pocaz[i][:fillingLength].astype(dtype_Y)
                padded_pocax[i, :fillingLength] = pocax[i][:fillingLength].astype(dtype_Y)
                padded_pocay[i, :fillingLength] = pocay[i][:fillingLength].astype(dtype_Y)
                padded_tx[i, :fillingLength] = pocaTx[i][:fillingLength].astype(dtype_Y)
                padded_ty[i, :fillingLength] = pocaTy[i][:fillingLength].astype(dtype_Y)
                padded_sigma[i, :fillingLength] = pocaSigmapocaxy[i][:fillingLength].astype(dtype_Y)

            padded_pocaz = padded_pocaz[:, np.newaxis, :]
            padded_pocax = padded_pocax[:, np.newaxis, :]
            padded_pocay = padded_pocay[:, np.newaxis, :]
            padded_tx = padded_tx[:, np.newaxis, :]
            padded_ty = padded_ty[:, np.newaxis, :]
            padded_sigma = padded_sigma[:, np.newaxis, :]

            X = ja.concatenate(
                (padded_pocaz, padded_pocax, padded_pocay,
                 padded_tx, padded_ty, padded_sigma),
                axis=1,
            ).astype(dtype_X)

            ## mds print("X = ", X)
            print("len(X) = ", len(X))
            Xlist.append(X)
            Ylist.append(Y)

    print("len(Xlist) = ", len(Xlist))
    X = np.concatenate(Xlist, axis=0)
    Y = np.concatenate(Ylist, axis=0)
    print("outer loop X.shape = ", X.shape)

    if slice:
        X = X[slice, :]
        Y = Y[slice, :]

    with Timer(start=f"Constructing {X.shape[0]} event dataset"):
        x_t = torch.tensor(X)
        y_t = torch.tensor(Y)

        if device is not None:
            x_t = x_t.to(device)
            y_t = y_t.to(device)

        dataset = TensorDataset(x_t, y_t)

    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, **kargs)

    print("x_t.shape = ", x_t.shape)
    print("x_t.shape[0] = ", x_t.shape[0])
    print("x_t.shape[1] = ", x_t.shape[1])
    nFeatures = 6
    x_t.view(x_t.shape[0], nFeatures, -1)
    print("x_t.shape = ", x_t.shape)

    return loader
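
## Note on the trailing .view call in both versions above: torch's Tensor.view
## returns a new tensor, so the result is discarded and the call has no effect
## as written. If a reshape was intended (an assumption), it would need the
## assignment form, e.g.
##     x_t = x_t.view(x_t.shape[0], nFeatures, -1)
## and nFeatures would have to match the number of feature planes (6 here,
## 9 in the poca-ellipsoid version).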


def collect_poca(*files):
    # initialize lists
    pocax_list = []
    pocay_list = []
    pocaz_list = []
    majoraxisx_list = []
    majoraxisy_list = []
    majoraxisz_list = []
    minoraxis1x_list = []
    minoraxis1y_list = []
    minoraxis1z_list = []
    minoraxis2x_list = []
    minoraxis2y_list = []
    minoraxis2z_list = []

    # iterate through all files
    for XY_file in files:
        msg = f"Loaded {XY_file} in {{time:.4}} s"
        with Timer(msg), h5py.File(XY_file, mode="r") as XY:
            # print keys in current hdf5 file
            print(XY.keys())
            afile = awkward.hdf5(XY)

            # append to appropriate lists
            pocax_list.append(afile["poca_x"])
            pocay_list.append(afile["poca_y"])
            pocaz_list.append(afile["poca_z"])
            majoraxisx_list.append(afile["major_axis_x"])
            majoraxisy_list.append(afile["major_axis_y"])
            majoraxisz_list.append(afile["major_axis_z"])
            minoraxis1x_list.append(afile["minor_axis1_x"])
            minoraxis1y_list.append(afile["minor_axis1_y"])
            minoraxis1z_list.append(afile["minor_axis1_z"])
            minoraxis2x_list.append(afile["minor_axis2_x"])
            minoraxis2y_list.append(afile["minor_axis2_y"])
            minoraxis2z_list.append(afile["minor_axis2_z"])

    # construct pocas dictionary
    pocas = {}
    pocas["x"] = {
        "poca": concatenate(pocax_list),
        "major_axis": concatenate(majoraxisx_list),
        "minor_axis1": concatenate(minoraxis1x_list),
        "minor_axis2": concatenate(minoraxis2x_list),
    }
    pocas["y"] = {
        "poca": concatenate(pocay_list),
        "major_axis": concatenate(majoraxisy_list),
        "minor_axis1": concatenate(minoraxis1y_list),
        "minor_axis2": concatenate(minoraxis2y_list),
    }
    pocas["z"] = {
        "poca": concatenate(pocaz_list),
        "major_axis": concatenate(majoraxisz_list),
        "minor_axis1": concatenate(minoraxis1z_list),
        "minor_axis2": concatenate(minoraxis2z_list),
    }

    return pocas
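
# Usage sketch for collect_poca (file names hypothetical). The result is keyed
# first by coordinate ("x", "y", "z"), then by ellipsoid component; each value
# is one jagged array over all events in all files.
pocas = collect_poca("a.h5", "b.h5")
poca_x = pocas["x"]["poca"]         # per-event lists of ellipsoid center x
major_z = pocas["z"]["major_axis"]  # per-event lists of major-axis z components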