def load_data_target_min_classification_2(input_file, max_size=2679):
    just_file_name = os.path.splitext(os.path.split(input_file)[1])[0] + '_'
    file_path = input_file
    location = 'B4'
    branches = ['rechit_x', 'rechit_y', 'rechit_z', 'rechit_vxy', 'rechit_vz',
                'rechit_energy', 'rechit_layer']
    types = ['float64'] * len(branches)
    max_size = [max_size for _ in range(len(branches))]
    data, sizes = sparse_hgcal.read_np_array(file_path, location, branches,
                                             types, max_size)
    num_samples = len(data[0])
    data_joined = np.concatenate([x[..., np.newaxis] for x in data],
                                 axis=2).astype(np.float32)
    pairs = find_unique_pairs(num_samples)
    print(data_joined.shape)

    showers_1 = data_joined[pairs[:, 0]]
    showers_2 = data_joined[pairs[:, 1]]

    # TODO: Order the elements if you want to!
    shower_combined = np.concatenate((showers_1, showers_2), axis=2)
    # [Energy 1, Energy 2, x, y, z, vxy, vz, layer]
    shower_combined = shower_combined[:, :, [5, 12, 0, 1, 2, 3, 4, 6]]
    return shower_combined
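# `find_unique_pairs` is not defined in this file. A minimal sketch of what it
# is assumed to do, given its use above: return an [N, 2] integer array of
# event-index pairs (i, j) with i != j, so no shower is paired with itself.
# The pairing strategy (one random partner per event) is an assumption.
def find_unique_pairs(num_samples):
    idx = np.arange(num_samples)
    partner = np.random.permutation(num_samples)
    # Re-draw any self-pairings by shifting them one position (mod N).
    same = partner == idx
    partner[same] = (partner[same] + 1) % num_samples
    return np.stack([idx, partner], axis=1)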
def run_conversion_multi_threaded(input_file):
    global jobs_queue, max_gpu_events
    just_file_name = os.path.splitext(os.path.split(input_file)[1])[0] + '_'
    file_path = input_file
    location = 'B4'
    branches = [
        'rechit_x', 'rechit_y', 'rechit_z', 'rechit_vxy', 'rechit_vz',
        'rechit_energy', 'rechit_layer', 'isElectron', 'isMuon',
        'isPionCharged', 'isPionNeutral', 'isK0Long', 'isK0Short'
    ]
    types = ['float64'] * 7 + ['int32'] * 6
    max_size = [3000] * 7 + [1] * 6
    data, sizes = sparse_hgcal.read_np_array(file_path, location, branches,
                                             types, max_size)

    # Per-rechit branches get a trailing feature axis; the per-event label
    # branches are concatenated into a one-hot vector per event.
    ex_data = [np.expand_dims(group, axis=2) for group in data[0:7]]
    ex_data_labels = [np.expand_dims(group, axis=2) for group in data[7:13]]

    all_features = np.concatenate(
        (ex_data[0], ex_data[1], ex_data[2], ex_data[5], ex_data[6]), axis=2)
    spatial = np.concatenate((ex_data[0], ex_data[1], ex_data[2]), axis=2)
    spatial_local = np.concatenate((ex_data[3], ex_data[4]), axis=2)
    labels_one_hot = np.concatenate(tuple(ex_data_labels), axis=1)
    num_entries = sizes[0]

    assert int(np.mean(np.sum(labels_one_hot, axis=1))) == 1
    total_events = len(sizes[0])
    assert np.array_equal(np.shape(all_features), [total_events, 3000, 5])
    assert np.array_equal(np.shape(spatial), [total_events, 3000, 3])
    assert np.array_equal(np.shape(spatial_local), [total_events, 3000, 2])

    # `jobs`, `args` and `jobs_queue` are assumed to be module-level globals.
    events_per_jobs = int(total_events / jobs)
    for i in range(jobs):
        start = i * events_per_jobs
        stop = (i + 1) * events_per_jobs
        A = all_features[start:stop]
        B = spatial[start:stop]
        C = spatial_local[start:stop]
        D = labels_one_hot[start:stop]
        E = num_entries[start:stop]
        output_file_prefix = os.path.join(args.output,
                                          just_file_name + "_" + str(i) + "_")
        data_packed = A, B, C, D, E, i, output_file_prefix
        jobs_queue.put(data_packed)
    jobs_queue.join()
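# Hedged sketch of the consumer assumed on the other end of `jobs_queue`.
# The original worker and its output format are not shown in this file; this
# version uses queue.Queue with daemon threads (the project may instead use a
# multiprocessing.JoinableQueue, which has the same get/task_done interface).
import queue
import threading

jobs_queue = queue.Queue()

def queue_worker():
    while True:
        A, B, C, D, E, i, output_file_prefix = jobs_queue.get()
        # ... serialize the chunk (e.g. np.savez or TFRecords) under
        #     output_file_prefix here ...
        jobs_queue.task_done()

for _ in range(4):  # number of consumer threads is an assumption
    threading.Thread(target=queue_worker, daemon=True).start()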
def run_conversion_simple(input_file):
    # threshold=np.nan is rejected by newer NumPy; sys.maxsize prints full
    # arrays (requires `import sys` at module level).
    np.set_printoptions(threshold=sys.maxsize)
    location = 'B4'
    branches = [
        'rechit_x', 'rechit_y', 'rechit_z', 'rechit_vxy', 'rechit_vz',
        'rechit_energy', 'rechit_layer', 'true_x', 'true_y', 'true_r',
        'true_energy', 'isGamma'
    ]
    types = ['float64'] * 11 + ['int32']
    max_size = [2679 for _ in range(7)] + [1, 1, 1, 1, 1]
    nparray, sizes = sparse_hgcal.read_np_array(input_file, location,
                                                branches, types, max_size)

    if args.isGamma:
        isGamma = np.where(nparray[11] == 1)
        nparray = [x[isGamma] for x in nparray]

    true_values_1 = np.concatenate(
        [nparray[i][..., np.newaxis] for i in [7, 8, 9, 10]], axis=1)

    # common part:
    common = concat_all_branches(nparray, [0, 1, 2, 3, 4, 6])
    positions = concat_all_branches(nparray, [0, 1, 2])[0]
    sizes_1 = make_fixed_array(nparray[3], expand=False)[0]
    colors = make_fixed_array(nparray[6], expand=False)[0]
    sizes_2 = make_fixed_array(nparray[4], expand=False)[0]

    plot_calo(positions, sizes_1, colors)
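# `plot_calo` is not defined in this file. A minimal sketch under the
# assumption that it renders one event's rechits as a 3D scatter plot, with
# marker size taken from `sizes` and color from `colors` (here: the layer).
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers 3D projection)

def plot_calo(positions, sizes, colors):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(positions[:, 0], positions[:, 1], positions[:, 2],
               s=sizes, c=colors)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    plt.show()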
def run_conversion_simple(input_file, firstrun=False):
    # threshold=np.nan is rejected by newer NumPy; sys.maxsize prints full
    # arrays (requires `import sys` at module level).
    np.set_printoptions(threshold=sys.maxsize)
    location = 'B4'
    branches = [
        'rechit_x', 'rechit_y', 'rechit_z', 'rechit_vxy', 'rechit_vz',
        'rechit_energy', 'rechit_layer', 'true_x', 'true_y', 'true_r',
        'true_energy', 'isGamma'
    ]
    types = ['float64'] * 11 + ['int32']
    max_size = [2679 for _ in range(7)] + [1, 1, 1, 1, 1]
    nparray, sizes = sparse_hgcal.read_np_array(input_file, location,
                                                branches, types, max_size)
    print(len(nparray))

    if args.isGamma:
        isGamma = np.where(nparray[11] == 1)
        nparray = [x[isGamma] for x in nparray]

    true_values_1 = np.concatenate(
        [nparray[i][..., np.newaxis] for i in [7, 8, 9, 10]], axis=1)

    # common part:
    common = concat_all_branches(nparray, [0, 1, 2, 3, 4, 6])
    positions = concat_all_branches(nparray, [0, 1, 2])
    energy1 = make_fixed_array(nparray[5], expand=False)

    # Overlay each event with the next one (cyclic shift by one) to build
    # two-shower events.
    shuffleindices = np.array(range(1, energy1.shape[0]))
    shuffleindices = np.concatenate([shuffleindices, np.array([0])])
    energy2 = energy1[shuffleindices]
    true_values_2 = true_values_1[shuffleindices]
    print(energy1.shape)

    # Seed = highest-energy rechit of each shower.
    maxenergyids1 = energy1.argmax(axis=1)
    maxenergyids2 = energy2.argmax(axis=1)
    positions1 = positions[range(energy1.shape[0]), maxenergyids1]
    positions2 = positions[range(energy2.shape[0]), maxenergyids2]

    # Order the seeds lexicographically by x, then y, then z.
    diff = positions1 - positions2
    diff[:, 0] *= 1e6
    diff[:, 1] *= 1e3
    totdiff = diff[:, 0] + diff[:, 1] + diff[:, 2]
    # print(totdiff)
    # common = common[totdiff != 0]

    esum = energy2 + energy1
    print(esum.shape)
    fraction1 = energy1 / esum  # np.ma.masked_array(esum, mask=esum==0)
    fraction1[esum == 0] = 0
    fraction2 = energy2 / esum
    fraction2[esum == 0] = 0

    # Swap the two showers wherever the seed ordering requires it.
    fraction_temp = np.array(fraction1)
    fraction1[totdiff > 0] = fraction2[totdiff > 0]
    fraction2[totdiff > 0] = fraction_temp[totdiff > 0]

    true_values_temp = np.array(true_values_1)
    true_values_1[totdiff > 0] = true_values_2[totdiff > 0]
    true_values_2[totdiff > 0] = true_values_temp[totdiff > 0]

    # Prepare additional information about the seeds, with the same ordering.
    maxenergyids1_temp = np.array(maxenergyids1)
    maxenergyids1[totdiff > 0] = maxenergyids2[totdiff > 0]
    maxenergyids2[totdiff > 0] = maxenergyids1_temp[totdiff > 0]
    maxenergyids1 = np.expand_dims(maxenergyids1, axis=1)
    maxenergyids2 = np.expand_dims(maxenergyids2, axis=1)
    moreinfo = np.concatenate([maxenergyids1, maxenergyids2], axis=-1)

    esum = np.expand_dims(esum, axis=2)
    fraction1 = np.expand_dims(fraction1, axis=2)
    fraction2 = np.expand_dims(fraction2, axis=2)
    allout = np.concatenate([esum, common, fraction1, fraction2], axis=-1)

    # Pad the seed-info row to the feature width and append it as one extra
    # entry along the rechit axis.
    zeropad = np.zeros(shape=(moreinfo.shape[0],
                              allout.shape[2] - moreinfo.shape[1]))
    moreinfo = np.concatenate([moreinfo, zeropad], axis=-1)
    moreinfo = np.expand_dims(moreinfo, axis=1)
    allout = np.concatenate([allout, moreinfo], axis=1)

    # allout = allout[totdiff!=0]  # remove same-seeded showers
    output_data = []
    for i in range(len(allout)):
        if totdiff[i] != 0:
            output_data.append((allout[i], true_values_1[i], true_values_2[i]))

    if firstrun:
        print('output shape ', allout.shape)
        print('last entry in axis 1 is: [idx seed0, idx seed1, 0, ...]')
        print('other are: [esum, rechit_x, rechit_y, rechit_z, rechit_vxy, '
              'rechit_vz, rechit_layer, fraction1, fraction2]')
        print('ordering of seed0 and seed1 is done in order by: x,y,z. '
              'Events with same positioned seeds are removed')

    just_file_name = os.path.splitext(os.path.split(input_file)[1])[0] + '_'
    output_file_prefix = os.path.join(args.output, just_file_name)
    write_to_tf_records(output_data, 0, output_file_prefix)
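# `write_to_tf_records` is provided elsewhere in the project. A hedged sketch
# of a compatible writer, assuming each element of `output_data` is a tuple of
# float arrays, one .tfrecords file per call, and the TF1-era
# tf.python_io.TFRecordWriter API; the original feature names and
# serialization layout are unknown.
import tensorflow as tf

def write_to_tf_records(output_data, index, output_file_prefix):
    path = output_file_prefix + str(index) + '.tfrecords'
    with tf.python_io.TFRecordWriter(path) as writer:
        for allout, true_1, true_2 in output_data:
            feature = {
                'data': tf.train.Feature(float_list=tf.train.FloatList(
                    value=allout.astype(np.float32).flatten())),
                'truth_1': tf.train.Feature(float_list=tf.train.FloatList(
                    value=true_1.astype(np.float32).flatten())),
                'truth_2': tf.train.Feature(float_list=tf.train.FloatList(
                    value=true_2.astype(np.float32).flatten())),
            }
            example = tf.train.Example(
                features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())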
def load_data_target_min_classification(input_file):
    just_file_name = os.path.splitext(os.path.split(input_file)[1])[0] + '_'
    file_path = input_file
    location = 'B4'
    branches = [
        'rechit_x', 'rechit_y', 'rechit_z', 'rechit_vxy', 'rechit_vz',
        'rechit_energy', 'rechit_layer', 'rechit_detid', 'isElectron',
        'isMuon', 'isPionCharged', 'isPionNeutral', 'isK0Long', 'isK0Short'
    ]
    types = ['float64'] * 7 + ['int32'] * 7
    max_size = [3000] * 8 + [1] * 6
    data, sizes = sparse_hgcal.read_np_array(file_path, location, branches,
                                             types, max_size)

    ex_ids = data[7]
    ex_data = [np.expand_dims(group, axis=2) for group in data[0:7]]
    ex_data_labels = [np.expand_dims(group, axis=2) for group in data[7:13]]
    ex_sizes = sizes[0]

    other_features = np.concatenate((ex_data[5], ex_data[6]), axis=2)
    spatial = np.concatenate((ex_data[0], ex_data[1], ex_data[2]), axis=2)
    spatial_local = np.concatenate((ex_data[3], ex_data[4]), axis=2)
    num_entries = sizes[0]

    indices_1 = np.arange(len(data[0]))
    np.random.shuffle(indices_1)
    indices_2 = np.arange(len(data[0]))
    np.random.shuffle(indices_2)
    pairs = np.concatenate((np.expand_dims(indices_1, axis=1),
                            np.expand_dims(indices_2, axis=1)), axis=1)

    output_data_all = list()
    for i, j in pairs:
        if i == j:
            continue
        ids_1 = ex_ids[i].astype(np.int32)  # [3000]
        features_others_1 = other_features[i]
        energy_1 = features_others_1[:, 0]
        features_spatial_local_1 = spatial_local[i]
        features_spatial_1 = spatial[i]  # VxF
        features_combined_1 = np.concatenate(
            (features_others_1, features_spatial_1, features_spatial_local_1),
            axis=1).astype(np.float32)
        sizes_1 = ex_sizes[i].astype(np.int32)
        location_1 = np.sum(features_spatial_1 * energy_1[..., np.newaxis],
                            axis=0) / np.sum(energy_1)

        ids_2 = ex_ids[j].astype(np.int32)
        features_others_2 = other_features[j]
        energy_2 = (other_features[j])[:, 0]
        features_spatial_local_2 = spatial_local[j]
        features_spatial_2 = spatial[j]
        features_combined_2 = np.concatenate(
            (features_others_2, features_spatial_2, features_spatial_local_2),
            axis=1).astype(np.float32)
        sizes_2 = ex_sizes[j].astype(np.int32)
        location_2 = np.sum(features_spatial_2 * energy_2[..., np.newaxis],
                            axis=0) / np.sum(energy_2)

        # Order the pair by energy-weighted centroid: compare by y if the x
        # coordinates are close, otherwise by x.
        comparison = (location_1[1] < location_2[1]
                      if abs(location_1[0] - location_2[0]) < 1
                      else location_1[0] < location_2[0])
        if comparison:
            output_data_all.append(
                (features_combined_1, features_combined_2, ids_1, ids_2,
                 sizes_1, sizes_2, location_1, location_2))
        else:
            output_data_all.append(
                (features_combined_2, features_combined_1, ids_2, ids_1,
                 sizes_2, sizes_1, location_2, location_1))

    features_1 = np.concatenate([(x[0])[np.newaxis, ...]
                                 for x in output_data_all],
                                axis=0).astype(np.float32)
    features_2 = np.concatenate([(x[1])[np.newaxis, ...]
                                 for x in output_data_all],
                                axis=0).astype(np.float32)
    ids_1 = np.concatenate([(x[2])[np.newaxis, ...] for x in output_data_all],
                           axis=0).astype(np.int32)
    ids_2 = np.concatenate([(x[3])[np.newaxis, ...] for x in output_data_all],
                           axis=0).astype(np.int32)
    sizes_1 = np.concatenate([(x[4])[np.newaxis, ...]
                              for x in output_data_all],
                             axis=0).astype(np.int32)
    sizes_2 = np.concatenate([(x[5])[np.newaxis, ...]
                              for x in output_data_all],
                             axis=0).astype(np.int32)
    locations_1 = np.concatenate([(x[6])[np.newaxis, ...]
                                  for x in output_data_all],
                                 axis=0).astype(np.float32)
    locations_2 = np.concatenate([(x[7])[np.newaxis, ...]
                                  for x in output_data_all],
                                 axis=0).astype(np.float32)

    merged_features, _, num_entries_result = merge_two_arrays_separate(
        features_1, features_2, ids_1, ids_2, sizes_1, sizes_2)

    energies_1 = merged_features[:, :, 0]
    energies_2 = merged_features[:, :, 1]
    e = energies_1 + energies_2
    fractions = energies_1 / np.ma.masked_array(e, mask=e == 0)  # 10000x6000
    target_0 = np.array(fractions)[:, :, np.newaxis]
    target_1 = np.array(1 - fractions)[:, :, np.newaxis]

    unmerged_energies = np.copy(merged_features)
    # Collapse the two energy channels into a single summed-energy channel.
    merged_features[:, :, 1] += merged_features[:, :, 0]
    merged_features = merged_features[:, :, 1:]

    assert np.array_equal(np.shape(np.array(target_0[0])), [6000, 1])
    data_output = np.concatenate((merged_features, target_0, target_1),
                                 axis=2).astype(np.float32)
    total_events = len(target_0)
    assert np.array_equal(np.shape(merged_features[0]), [6000, 7])
    assert len(target_0) == len(num_entries_result) and \
        len(target_0) == len(merged_features)

    return_dict = {}
    return_dict['num_entries_result'] = num_entries_result
    return_dict['data_output_merged'] = data_output
    return_dict['data_output_unmerged_energies'] = unmerged_energies
    return_dict['fractions_random'] = unmerged_energies
    print("Returning min loss dataset")
    return return_dict
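# `merge_two_arrays_separate` is provided elsewhere. A minimal sketch of the
# assumed behaviour, inferred from the shapes used above: stack the two padded
# events along the rechit axis (3000 + 3000 = 6000 rows) while keeping their
# energies in two separate leading channels. The real implementation may
# instead merge hits that share a detid; this version is illustration only.
def merge_two_arrays_separate(features_1, features_2, ids_1, ids_2,
                              sizes_1, sizes_2):
    n, v, f = features_1.shape
    zeros = np.zeros((n, v, 1), dtype=features_1.dtype)
    # [E1, 0, rest...] for shower 1, [0, E2, rest...] for shower 2.
    part_1 = np.concatenate(
        [features_1[:, :, 0:1], zeros, features_1[:, :, 1:]], axis=2)
    part_2 = np.concatenate(
        [zeros, features_2[:, :, 0:1], features_2[:, :, 1:]], axis=2)
    merged = np.concatenate([part_1, part_2], axis=1)  # [n, 2*v, f+1]
    merged_ids = np.concatenate([ids_1, ids_2], axis=1)
    num_entries = sizes_1 + sizes_2
    return merged, merged_ids, num_entries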
def load_data_target_center(input_file):
    file_path = input_file
    location = 'B4'
    branches = [
        'rechit_x', 'rechit_y', 'rechit_z', 'rechit_vxy', 'rechit_vz',
        'rechit_energy', 'rechit_layer', 'rechit_detid', 'isElectron',
        'isMuon', 'isPionCharged', 'isPionNeutral', 'isK0Long', 'isK0Short'
    ]
    types = ['float64'] * 7 + ['int32'] * 7
    # Maximum number of elements per branch: up to 3000 rechits per event,
    # while the per-event label branches hold a single scalar (hence 1).
    max_size = [3000] * 8 + [1] * 6
    data, sizes = sparse_hgcal.read_np_array(file_path, location, branches,
                                             types, max_size)

    ex_ids = data[7]
    ex_data = [np.expand_dims(group, axis=2) for group in data[0:7]]
    ex_data_labels = [np.expand_dims(group, axis=2) for group in data[7:13]]
    ex_sizes = sizes[0]

    other_features = np.concatenate((ex_data[5], ex_data[6]), axis=2)
    spatial = np.concatenate((ex_data[0], ex_data[1], ex_data[2]), axis=2)
    spatial_local = np.concatenate((ex_data[3], ex_data[4]), axis=2)
    num_entries = sizes[0]

    indices_1 = np.arange(len(data[0]))
    np.random.shuffle(indices_1)
    indices_2 = np.arange(len(data[0]))
    np.random.shuffle(indices_2)
    pairs = np.concatenate((np.expand_dims(indices_1, axis=1),
                            np.expand_dims(indices_2, axis=1)), axis=1)

    output_data_all = list()
    for i, j in pairs:
        if i == j:
            continue
        ids_1 = ex_ids[i].astype(np.int32)  # [3000]
        features_others_1 = other_features[i]
        energy_1 = features_others_1[:, 0]
        features_spatial_local_1 = spatial_local[i]
        features_spatial_1 = spatial[i]  # VxF
        features_combined_1 = np.concatenate(
            (features_others_1, features_spatial_1, features_spatial_local_1),
            axis=1).astype(np.float32)
        sizes_1 = ex_sizes[i].astype(np.int32)
        location_1 = np.sum(features_spatial_1 * energy_1[..., np.newaxis],
                            axis=0) / np.sum(energy_1)

        ids_2 = ex_ids[j].astype(np.int32)
        features_others_2 = other_features[j]
        energy_2 = (other_features[j])[:, 0]
        features_spatial_local_2 = spatial_local[j]
        features_spatial_2 = spatial[j]
        features_combined_2 = np.concatenate(
            (features_others_2, features_spatial_2, features_spatial_local_2),
            axis=1).astype(np.float32)
        sizes_2 = ex_sizes[j].astype(np.int32)
        location_2 = np.sum(features_spatial_2 * energy_2[..., np.newaxis],
                            axis=0) / np.sum(energy_2)

        output_data_all.append(
            (features_combined_1, features_combined_2, ids_1, ids_2,
             sizes_1, sizes_2, location_1, location_2))

    features_1 = np.concatenate([(x[0])[np.newaxis, ...]
                                 for x in output_data_all],
                                axis=0).astype(np.float32)
    features_2 = np.concatenate([(x[1])[np.newaxis, ...]
                                 for x in output_data_all],
                                axis=0).astype(np.float32)
    ids_1 = np.concatenate([(x[2])[np.newaxis, ...] for x in output_data_all],
                           axis=0).astype(np.int32)
    ids_2 = np.concatenate([(x[3])[np.newaxis, ...] for x in output_data_all],
                           axis=0).astype(np.int32)
    sizes_1 = np.concatenate([(x[4])[np.newaxis, ...]
                              for x in output_data_all],
                             axis=0).astype(np.int32)
    sizes_2 = np.concatenate([(x[5])[np.newaxis, ...]
                              for x in output_data_all],
                             axis=0).astype(np.int32)
    locations_1 = np.concatenate([(x[6])[np.newaxis, ...]
                                  for x in output_data_all],
                                 axis=0).astype(np.float32)
    locations_2 = np.concatenate([(x[7])[np.newaxis, ...]
                                  for x in output_data_all],
                                 axis=0).astype(np.float32)

    merged_features, _, num_entries_result = merge_two_arrays_separate(
        features_1, features_2, ids_1, ids_2, sizes_1, sizes_2)

    energies_1 = merged_features[:, :, 0]
    energies_2 = merged_features[:, :, 1]
    e = energies_1 + energies_2
    fractions = energies_1 / np.ma.masked_array(e, mask=e == 0)  # 10000x6000

    # Per-hit target: energy-fraction-weighted mix of the two shower centroids.
    target = (fractions[..., np.newaxis] * locations_1[:, np.newaxis, :] +
              (1 - fractions[..., np.newaxis]) * locations_2[:, np.newaxis, :])
    # target[e==0, :] = 0
    target = np.array(target)

    unmerged_energies = np.copy(merged_features)
    # Collapse the two energy channels into a single summed-energy channel.
    merged_features[:, :, 1] += merged_features[:, :, 0]
    merged_features = merged_features[:, :, 1:]
    data_output = np.concatenate((merged_features, target),
                                 axis=2).astype(np.float32)

    total_events = len(target)
    assert np.array_equal(np.shape(merged_features[0]), [6000, 7])
    assert np.array_equal(np.shape(target[0]), [6000, 3])
    assert len(target) == len(num_entries_result) and \
        len(target) == len(merged_features)

    return_dict = {}
    return_dict['num_entries_result'] = num_entries_result
    return_dict['data_output_merged'] = data_output
    return_dict['data_output_unmerged_energies'] = unmerged_energies
    return return_dict
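# Hedged usage sketch for the loader above; the input path is a placeholder,
# not a file from the original project.
if __name__ == '__main__':
    result = load_data_target_center('/path/to/input.root')
    merged = result['data_output_merged']   # [n_pairs, 6000, 7 features + 3 target]
    entries = result['num_entries_result']  # valid rechits per merged pair
    print(merged.shape, len(entries))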
args = parser.parse_args()
file = args.input
location = 'B4'
branches = [
    'isElectron', 'isMuon', 'isPionCharged', 'isPionNeutral', 'isK0Long',
    'isK0Short', 'rechit_energy', 'rechit_x', 'rechit_y', 'rechit_z'
]
types = ['int32'] * 6 + ['float64'] * 4
max_sizes = [1, 1, 1, 1, 1, 1, 3000, 3000, 3000, 3000]
print("Loading data")
data, sizes = hg.read_np_array(file, location, branches, types, max_sizes)
print("Data loaded")

events = np.size(data[0])
E = data[6]
X = data[7]
Y = data[8]
Z = data[9]
# One-hot particle-type labels, one row per event.
T = np.concatenate((np.expand_dims(data[0], 1), np.expand_dims(data[1], 1),
                    np.expand_dims(data[2], 1), np.expand_dims(data[3], 1),
                    np.expand_dims(data[4], 1), np.expand_dims(data[5], 1)),
                   axis=1)
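# Hedged example of using the arrays above: collapse the one-hot rows in T to
# integer class indices and compute the total deposited energy per event
# (the rechit arrays are zero-padded, so a plain sum over the hit axis is safe).
labels = np.argmax(T, axis=1).flatten()
total_energy = np.sum(E, axis=1)
print(labels[:5], total_energy[:5])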
def run_conversion_multi_threaded(input_file):
    global jobs_queue, max_gpu_events
    just_file_name = os.path.splitext(os.path.split(input_file)[1])[0] + '_'
    file_path = input_file
    location = 'B4'
    branches = [
        'rechit_x', 'rechit_y', 'rechit_z', 'rechit_vxy', 'rechit_vz',
        'rechit_energy', 'rechit_layer', 'isElectron', 'isMuon',
        'isPionCharged', 'isPionNeutral', 'isK0Long', 'isK0Short'
    ]
    types = ['float64'] * 7 + ['int32'] * 6
    max_size = [3000] * 7 + [1] * 6
    data, sizes = sparse_hgcal.read_np_array(file_path, location, branches,
                                             types, max_size)

    per_rechit_data = np.concatenate(
        [np.expand_dims(group, axis=2) for group in data[0:7]], axis=2)
    labels_one_hot = np.concatenate(
        [np.expand_dims(group, axis=2) for group in data[7:13]], axis=1)
    num_entries = sizes[0]

    if args.pion_vs_electron:
        labels_indexed = np.argmax(labels_one_hot, axis=1)
        # Keep only the two classes of interest and re-label them as 0/1.
        interesting_indices = np.where((labels_indexed == 0) +
                                       (labels_indexed == 3))
        labels_indexed = labels_indexed[interesting_indices]
        labels_indexed[labels_indexed == 3] = 1
        labels_one_hot = one_hot(labels_indexed, num_classes=2)
        per_rechit_data = per_rechit_data[interesting_indices]
        num_entries = num_entries[interesting_indices]

    total_events = len(labels_one_hot)
    assert int(np.mean(np.sum(labels_one_hot, axis=1))) == 1
    assert np.array_equal(np.shape(per_rechit_data), [total_events, 3000, 7])

    jobs = int(args.jobs)
    processes = []
    events_per_jobs = int(total_events / jobs)
    for i in range(jobs):
        start = i * events_per_jobs
        stop = (i + 1) * events_per_jobs
        per_rechit_data_job = per_rechit_data[start:stop]
        labels_jobs = labels_one_hot[start:stop]
        num_entries_jobs = num_entries[start:stop]
        output_file_prefix = os.path.join(args.output,
                                          just_file_name + "_" + str(i) + "_")
        data_packed = (per_rechit_data_job, labels_jobs, num_entries_jobs,
                       output_file_prefix)
        processes.append(Process(target=worker, args=(data_packed,)))

    for p in processes:
        p.start()
    for p in processes:
        p.join()
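# `one_hot` is not defined in this file; a minimal sketch of the assumed
# helper, mapping an integer label array to one-hot rows via an identity
# matrix lookup.
def one_hot(labels, num_classes):
    labels = np.asarray(labels).astype(np.int64).reshape(-1)
    return np.eye(num_classes, dtype=np.int32)[labels]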