def test_open_run(mock_spb_raw_run, mock_spb_proc_run, tmpdir):
    prop_dir = os.path.join(str(tmpdir), 'SPB', '201830', 'p002012')

    # Set up raw
    os.makedirs(os.path.join(prop_dir, 'raw'))
    os.symlink(mock_spb_raw_run, os.path.join(prop_dir, 'raw', 'r0238'))

    # Set up proc
    os.makedirs(os.path.join(prop_dir, 'proc'))
    os.symlink(mock_spb_proc_run, os.path.join(prop_dir, 'proc', 'r0238'))

    with mock.patch('karabo_data.read_machinery.DATA_ROOT_DIR', str(tmpdir)):
        # With integers
        run = open_run(proposal=2012, run=238)
        paths = {f.filename for f in run.files}

        assert paths
        for path in paths:
            assert '/raw/' in path

        # With strings
        run = open_run(proposal='2012', run='238')
        assert {f.filename for f in run.files} == paths

        # Proc folder
        run = open_run(proposal=2012, run=238, data='proc')

        proc_paths = {f.filename for f in run.files}
        assert proc_paths
        for path in proc_paths:
            assert '/raw/' not in path

        # Run that doesn't exist
        with pytest.raises(Exception):
            open_run(proposal=2012, run=999)
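# Hedged illustration, not karabo_data's actual implementation: the fixture
# paths above imply that open_run(proposal=2012, run=238, data='raw') resolves
# to a directory like <DATA_ROOT_DIR>/SPB/201830/p002012/raw/r0238. A minimal
# sketch of that mapping, with the instrument/cycle segment ('SPB/201830')
# assumed rather than looked up:
def _example_run_dir(data_root, proposal, run, data='raw'):
    import os.path
    return os.path.join(data_root, 'SPB', '201830',
                        'p{:06d}'.format(int(proposal)), data,
                        'r{:04d}'.format(int(run)))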
def find_run(self, target: TrainSet):
    # karabo_data requires the explicit creation of run objects. For
    # now, this code does not support TrainSets spanning multiple
    # runs, but this should be possible with only minor performance
    # impact through proper caching.

    # First, try to locate the run by checking the first train ID
    # against runs that are already cached.
    for run_id, train_ids in self.runs.items():
        if target.train_ids[0] in train_ids:
            return karabo_data.open_run(proposal=self.proposal, run=run_id)

    # If unsuccessful, load more runs into the cache.
    if self.path is None:
        run = self._open_run(1)

        if target.train_ids[0] in self.runs[1]:
            return run

    for entry in listdir(self.path):
        run_id = int(entry[1:])

        if run_id not in self.runs:
            run = self._open_run(run_id)

            if target.train_ids[0] in self.runs[run_id]:
                return run

    raise ValueError('unable to locate run ID for TrainSet')
def _open_run(self, run_id):
    run = karabo_data.open_run(proposal=self.proposal, run=run_id)
    self.runs[run_id] = frozenset(run.train_ids)

    # Remember the proposal directory (parent of this run's directory)
    # so later lookups can list its runs.
    base = dirname(run.files[0].filename)
    self.path = base[:base.rfind('/')]

    return run
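# Hedged sketch, not part of the original source: a hypothetical minimal
# holder class showing the attributes that find_run() and _open_run() above
# rely on - 'proposal', a 'runs' cache mapping run ID -> frozenset of train
# IDs, and 'path', the proposal directory discovered from the first opened run.
class _RunLocatorExample:
    def __init__(self, proposal):
        self.proposal = proposal
        self.runs = {}    # run_id -> frozenset(train_ids), filled by _open_run()
        self.path = None  # proposal directory, set by _open_run()

    find_run = find_run
    _open_run = _open_run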
def process_intra_train(job):
    '''Aggregate DSSC data (chunked, to fit into memory) for a single module.

    Averages over all trains, but keeps all pulses.

    Designed for the multiprocessing module - expects a job dictionary with
    the following keys:
      proposal  : (int) proposal number
      run_nr    : (int) run number
      module    : (int) DSSC module to process
      chunksize : (int) number of trains to process simultaneously
      fpt       : (int) frames per train
      maxframes : (int, optional) limit the number of frames per train
    '''
    proposal = job['proposal']
    run_nr = job['run_nr']
    module = job['module']
    chunksize = job['chunksize']
    fpt = job['fpt']
    maxframes = job.get('maxframes', None)  # optional

    sourcename = f'SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf'
    collection = kd.open_run(proposal, run_nr, include=f'*DSSC{module:02d}*')

    fpt = min(fpt, maxframes) if maxframes is not None else fpt
    dims = ['pulse', 'x', 'y']
    coords = {'pulse': np.arange(fpt, dtype=int)}
    shape = [fpt, 128, 512]
    module_data = xr.DataArray(np.zeros(shape, dtype=float), dims=dims, coords=coords)
    module_data = module_data.to_dataset(name='image')
    module_data['sum_count'] = xr.DataArray(np.zeros(fpt, dtype=int), dims=['pulse'])

    ntrains = len(collection.train_ids)
    chunks = np.arange(ntrains, step=chunksize)

    if module == 15:
        # quick and dirty progress bar (only shown for one module)
        pbar = tqdm(total=len(chunks))

    for start_index in chunks:
        sel = collection.select_trains(kd.by_index[start_index:start_index + chunksize])
        data = load_chunk_data(sel, sourcename, maxframes)
        data = data.to_dataset(name='image')
        data['sum_count'] = xr.full_like(data.image[..., 0, 0], fill_value=1)
        data = data.sum('trainId')

        for var in ['image', 'sum_count']:
            # concatenating and using the sum() method automatically takes
            # care of dtype casting if necessary
            module_data[var] = xr.concat([module_data[var], data[var]], dim='tmp').sum('tmp')

        if module == 15:
            pbar.update(1)

    module_data['image'] = module_data['image'] / module_data.sum_count

    return module_data
def save_nws_trace(run_number):
    '''Save the normalized, wrapped, summed digitizer trace for a run.'''
    proposal_number = 2318  # The proposal number for this beamtime

    try:
        r = open_run(proposal=proposal_number, run=run_number)
        print("Opened run: " + str(run_number) + "\tfor proposal: " + str(proposal_number))
    except Exception:
        print("Run: " + str(run_number) + " not found, exiting")
        return

    t_zero = 18000  # train time offset, unit: ns/2
    chunk_size = 1

    train_IDs = r.train_ids
    number_of_trains = len(train_IDs)

    # determine the pulses per train (the first 1.0 entry in the XGM
    # intensity array marks the end of the real pulses)
    intensities_0 = r.get_array('SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensitySa3TD')[0]
    for i in range(len(intensities_0)):
        if intensities_0[i] == 1.0:
            pulses_per_train = i
            print("Pulses per train: " + str(pulses_per_train))
            break

    pulse_time = 24 * 1760
    train_time = pulse_time * pulses_per_train

    sum_trace = 0
    for i in range(number_of_trains):
        i_shift = math.floor(i / chunk_size) * chunk_size
        sel = r.select_trains(by_index[i:i + chunk_size])
        trace_by_train = np.array(sel.get_array(
            'SQS_DIGITIZER_UTC1/ADC/1:network',
            'digitizers.channel_1_A.raw.samples', None,
            roi=by_index[t_zero:t_zero + train_time]), dtype=np.float64)[0]
        trace_by_train *= (-1)
        wrapped_trace = wrap_raw_trace(trace_by_train, pulses_per_train, pulse_time)
        sum_trace += wrapped_trace
        #plt.plot(sum_trace)
        #plt.show()
        #plt.pause(0.005)

    normalized_sum_trace = normalize_wrapped_trace(sum_trace)

    local_dir = os.path.dirname(__file__)
    target_file = os.path.join(local_dir, 'run_' + str(run_number) + '_proc2.h5')
    f = h5.File(target_file, 'a')
    f.create_dataset("normalized_sum_trace", data=normalized_sum_trace)
    f.close()
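# Hedged sketch of the two helpers save_nws_trace() assumes but which are not
# defined in this excerpt (wrap_raw_trace, normalize_wrapped_trace). One
# plausible reading of "wrapped" and "normalized": fold the train-long trace
# so that every pulse window lines up, sum the windows, and scale the summed
# trace to a unit maximum.
def wrap_raw_trace(trace, pulses_per_train, pulse_time):
    # keep only the samples that fill complete pulse windows, then stack
    # the windows on top of each other and sum them
    trace = trace[:pulses_per_train * pulse_time]
    return trace.reshape(pulses_per_train, pulse_time).sum(axis=0)

def normalize_wrapped_trace(trace):
    # scale so the largest absolute value becomes 1 (no-op for an all-zero trace)
    peak = np.abs(trace).max()
    return trace / peak if peak > 0 else trace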
def save_photon_energy(run_number):
    '''Plot the photon energy for a run (work in progress).'''
    proposal_number = 2318  # The proposal number for this beamtime

    try:
        r = open_run(proposal=proposal_number, run=run_number)
        print("Opened run: " + str(run_number) + "\tfor proposal: " + str(proposal_number))
    except Exception:
        print("Run: " + str(run_number) + " not found, exiting")
        return

    # get the photon energy of the run
    # units are eV, rounded to 3 decimal places
    # the original analysis wrapped this in a try/except; that could be added here as well
    #hv = round(float(r.get_array('SA3_XTD10_UND/DOOCS/PHOTON_ENERGY', 'actualPosition.value')[0]), 3)
    #print("Photon Energy for run " + str(run_number) + ": " + str(hv) + "eV")

    hv = r.get_array('SA3_XTD10_UND/DOOCS/PHOTON_ENERGY', 'actualPosition.value')
    plt.plot(hv)
    plt.show()
    #print(hv)
    return
def process_dssc_module(job):
    '''Aggregate DSSC data (chunked, to fit into memory) for a single module.

    Groups by "scan_variable" in the given scanfile - use a dummy scan_variable
    to average over all trains. This implies that only trains found in the
    scanfile are considered.

    Designed for the multiprocessing module - expects a job dictionary with
    the following keys:
      proposal     : (int) proposal number
      run_nr       : (int) run number
      module       : (int) DSSC module to process
      chunksize    : (int) number of trains to process simultaneously
      scanfile     : (str) name of hdf5 file with xarray.DataArray containing
                     the scan variable and trainIds
      framepattern : (list of str) names for the (possibly repeating)
                     intra-train pulses. See split_dssc_data
      maskfile     : (str, optional) name of hdf5 file with boolean
                     xarray.DataArray to select/reject trains and pulses
    '''
    proposal = job['proposal']
    run_nr = job['run_nr']
    module = job['module']
    chunksize = job['chunksize']
    scanfile = job['scanfile']
    framepattern = job.get('framepattern', ['image'])
    maskfile = job.get('maskfile', None)

    sourcename = f'SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf'
    collection = kd.open_run(proposal, run_nr, include=f'*DSSC{module:02d}*')
    ntrains = len(collection.train_ids)

    # read preprocessed scan variable from file - selection and (possibly) rounding already done.
    scan = xr.open_dataarray(scanfile, 'data', autoclose=True)

    # read binary pulse/train mask - e.g. from XGM thresholding
    if maskfile is not None:
        pulsemask = xr.open_dataarray(maskfile, 'data', autoclose=True)
    else:
        pulsemask = None

    module_data = prepare_module_empty(scan, framepattern)
    chunks = np.arange(ntrains, step=chunksize)

    if module == 15:
        # quick and dirty progress bar
        pbar = tqdm(total=len(chunks))

    for start_index in chunks:
        sel = collection.select_trains(kd.by_index[start_index:start_index + chunksize])
        nframes = sel.detector_info(sourcename)['total_frames']
        if nframes > 0:  # some chunks have no DSSC data at all
            data = load_chunk_data(sel, sourcename)
            sum_count = xr.full_like(data[..., 0, 0], fill_value=1)

            if pulsemask is not None:
                data = data.where(pulsemask)
                sum_count = sum_count.where(pulsemask)

            data = split_frames(data, framepattern)
            sum_count = split_frames(sum_count, framepattern, prefix='sum_count_')
            data = xr.merge([data, sum_count])

            data['scan_variable'] = scan  # aligns on trainId, drops non-matching trains
            data = data.groupby('scan_variable').sum('trainId')
            module_data = merge_chunk_data(module_data, data, framepattern)

        if module == 15:
            pbar.update(1)

    for name in framepattern:
        module_data[name] = module_data[name] / module_data['sum_count_' + name]

    return module_data
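# Hedged sketch with hypothetical values: writing a scan-variable file in the
# layout process_dssc_module() reads back via xr.open_dataarray(scanfile, 'data')
# - an xarray.DataArray indexed by trainId, stored under the 'data' group of
# the file. Here the scan variable is simply zeros, i.e. a dummy value that
# averages over all trains.
def write_dummy_scanfile(run, scanfile='scan_variable.h5'):
    scan = xr.DataArray(np.zeros(len(run.train_ids)),
                        dims=['trainId'],
                        coords={'trainId': run.train_ids},
                        name='scan_variable')
    scan.to_netcdf(scanfile, group='data', mode='w')
    return scanfile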
def load_dssc_info(proposal, run_nr):
    '''Loads the first data file for DSSC module 0 (this is hardcoded)
    and returns the detector_info dictionary'''
    dssc_module = kd.open_run(proposal, run_nr, include='*DSSC00*')
    dssc_info = dssc_module.detector_info('SCS_DET_DSSC1M-1/DET/0CH0:xtdf')
    return dssc_info
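# Hedged usage sketch, not from the original source: dispatching
# process_intra_train() over all 16 DSSC modules with multiprocessing, using
# load_dssc_info() for the frames-per-train value. The chunksize and the
# 'frames_per_train' key of the detector_info dictionary are assumptions
# made for illustration.
def run_all_modules_example(proposal, run_nr, chunksize=512):
    import multiprocessing

    fpt = load_dssc_info(proposal, run_nr)['frames_per_train']
    jobs = [dict(proposal=proposal, run_nr=run_nr, module=module,
                 chunksize=chunksize, fpt=fpt)
            for module in range(16)]
    with multiprocessing.Pool(16) as pool:
        module_data = pool.map(process_intra_train, jobs)
    return module_data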
def save_peaks_per_pulse(run_number, starting_index, trains):
    '''Find TOF peaks for each pulse in a slice of trains and save them to HDF5.'''
    proposal_number = 2318  # The proposal number for this beamtime

    # see: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html
    prominence = 100  # Prominence is an arbitrary threshold used to select peaks. Edwin used 100.
    wlen = 20  # 'wlen' or window length helps speed up the peak finding algorithm, see above link.

    # A run is composed of many(!) trains,
    # a train is composed of many pulses,
    # a pulse is composed of many photons.
    every_x_pulse_in_train = 24  # no units, an amount
    t_min_period = 1760  # t_min_period is when pulses are in consecutive bunches. unit: ns/2
    t_zero = 18000  # train time offset, unit: ns/2
    bin_size = 1  # bin size for the tof spectrum, unit: ns/2

    pulse_time = every_x_pulse_in_train * t_min_period  # time for one pulse, unit: ns/2

    try:
        r = open_run(proposal=proposal_number, run=run_number)
        print("Opened run: " + str(run_number) + "\tfor proposal: " + str(proposal_number))
    except Exception:
        print("Run: " + str(run_number) + " not found, exiting")
        return

    local_dir = os.path.dirname(__file__)
    target_file = os.path.join(local_dir, 'run_' + str(run_number) + '_proc2.h5')

    # get all IDs in the run
    train_IDs = r.train_ids
    number_of_trains = len(train_IDs)
    print(number_of_trains)

    # determine the number of pulses per train, break the loop once found
    intensities_0 = r.get_array('SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensitySa3TD')[0]
    for i in range(len(intensities_0)):
        if intensities_0[i] == 1.0:
            pulses_per_train = i
            print("Pulses per train: " + str(pulses_per_train))
            break

    # calculate constants for the tof spectrum
    train_time = pulse_time * pulses_per_train  # time for the train, unit: ns/2
    number_of_pulses = number_of_trains * pulses_per_train  # number of pulses in the run

    # create empty array
    tof_count_per_pulse = []

    # how many traces (trains) to load into data at once.
    # Right now this code only works with chunk_size = 1, which is really
    # slow, but is needed for ions per pulse! See the Todo above.
    chunk_size = 1

    f = h5.File(target_file, 'a')
    try:
        # create a group for all pulses
        f.create_group('/tof_peaks_per_pulse')
        f.close()
    except Exception:
        print("group already made!")
        f.close()

    # iterate over each train in the run to avoid overallocating memory

    # widest zero-padding string for the per-pulse dataset names
    max_zeros = (len(str(number_of_trains))) * "0"

    for i in range(starting_index, starting_index + trains):
        if i >= number_of_trains:
            # all done
            return
        else:
            i_shift = math.floor(i / chunk_size) * chunk_size
            sel = r.select_trains(by_index[i:i + chunk_size])
            trace_by_train = np.array(sel.get_array(
                'SQS_DIGITIZER_UTC1/ADC/1:network',
                'digitizers.channel_1_A.raw.samples', None,
                roi=by_index[t_zero:t_zero + train_time]), dtype=np.float64)[0]
            trace_by_train *= (-1)

            peaks = sci.signal.find_peaks(trace_by_train, prominence=prominence, wlen=wlen)
            # peaks[0]: indices within the train trace where peaks were found
            # peaks[1]: dictionary of peak properties

            # find out which pulse each peak belongs to: dividing the index
            # within the trace by the duration of one pulse gives an array
            # with the same shape as peaks[0], where each element is the pulse
            # that peak belongs to, e.g. [1, 1, 1, 2, 2, 3, 5, 6, ...]
            pulse_of_peaks = list(np.floor(peaks[0] / pulse_time).astype(int))

            # get the number of peaks per pulse; this many elements will be
            # removed from the front of the peak list and attributed to that pulse
            n_peaks_per_pulse = np.bincount(pulse_of_peaks, minlength=pulses_per_train)

            # wrap the time of flight of the peaks: the remainder of each peak
            # index modulo pulse_time is the peak's position within its pulse
            t_peaks = np.mod(peaks[0], pulse_time)

            # make the wrapped peak positions a list
            all_peaks = t_peaks.tolist()

            # next, split this up on a pulse by pulse basis
            f = h5.File(target_file, 'a')
            counter = 0
            for peak_count in n_peaks_per_pulse:
                abs_pulse_num = pulses_per_train * i + counter
                peaks_in_pulse = []
                for p in range(peak_count):
                    # pop the 0th element peak_count times
                    peaks_in_pulse.append(all_peaks.pop(0))
                peaks_in_pulse = np.array(peaks_in_pulse)
                group = f.get('tof_peaks_per_pulse')
                counter += 1

                # zero-pad the pulse id (the absolute pulse number) for the dataset name
                zeros = len(str(number_of_trains)) - len(str(abs_pulse_num))
                added_zeros = "0" * zeros
                dataset_name = "pulse_" + added_zeros + str(abs_pulse_num)
                try:
                    # save as unsigned int 16 to save space!
                    group.create_dataset(str(dataset_name), data=peaks_in_pulse, dtype='uint16')
                except Exception:
                    print("Error! that pulse is already saved, check parameters.")
                    print("i=", i)
                    pass
            f.close()
    return
def save_intensity(run_number):
    '''
    Inputs:
        run_number: the number of the run being reduced;
            see the comment on process_raw_run for more info
    Output:
        The proc file is created if it does not exist; if it exists,
        the intensity dataset is added to it.
    Note:
        This is part of splitting up process_raw_run to get around
        the time restrictions on the cluster.
    '''
    proposal_number = 2318  # The proposal number for this beamtime

    try:
        r = open_run(proposal=proposal_number, run=run_number)
        print("Opened run: " + str(run_number) + "\tfor proposal: " + str(proposal_number))
    except Exception:
        print("Run: " + str(run_number) + " not found, exiting")
        return

    train_IDs = r.train_ids
    number_of_trains = len(train_IDs)
    print(number_of_trains)

    # determine the pulses per train
    intensities_0 = r.get_array('SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensitySa3TD')[0]
    for i in range(len(intensities_0)):
        if intensities_0[i] == 1.0:
            pulses_per_train = i
            print("Pulses per train: " + str(pulses_per_train))
            break

    # 2D intensity array: the region of interest is each pulse x each train,
    # which is also the shape of the array. Get the intensity associated with
    # each pulse within each train.
    # The XGM (X-Ray Gas Monitor) has
    #   1. two beam intensity monitors measuring intensity along the x and y directions, and
    #   2. two beam position monitors determining the x and y beam positions.
    # The XGM intensity array has the shape (n trains, 1000).
    intensity_list = np.array(
        r.get_array('SA3_XTD10_XGM/XGM/DOOCS:output',
                    'data.intensitySa3TD',
                    roi=by_index[0:pulses_per_train])[0:number_of_trains])

    # flatten the intensity list into a 1D array so it has the same shape as
    # peaks_in_pulse, that is, the total number of pulses
    intensity_per_pulse = intensity_list.flatten()

    print(max(intensity_per_pulse))
    print(min(intensity_per_pulse))
    plt.hist(intensity_per_pulse, bins=500)
    plt.show()

    local_dir = os.path.dirname(__file__)
    target_file = os.path.join(local_dir, 'run_' + str(run_number) + '_proc2.h5')
    f = h5.File(target_file, 'a')
    try:
        f.create_dataset("intensity_per_pulse", data=intensity_per_pulse)
        f.close()
    except Exception:
        print("Int already saved!")
        f.close()
    return
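# Hedged sketch, a hypothetical helper not in the original: reading back the
# 'run_<N>_proc2.h5' file written by save_peaks_per_pulse() and
# save_intensity(), pairing each pulse's peak list with its XGM intensity.
# This relies on abs_pulse_num above being pulses_per_train * train_index +
# pulse_index, which matches the index into the flattened intensity_per_pulse
# array.
def read_proc2_example(target_file):
    results = []
    with h5.File(target_file, 'r') as f:
        intensity = f['intensity_per_pulse'][()]
        group = f['tof_peaks_per_pulse']
        for name in sorted(group):
            pulse_id = int(name.split('_')[-1])
            results.append((pulse_id, intensity[pulse_id], group[name][()]))
    return results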