Example #1
def test_open_run(mock_spb_raw_run, mock_spb_proc_run, tmpdir):
    prop_dir = os.path.join(str(tmpdir), 'SPB', '201830', 'p002012')
    # Set up raw
    os.makedirs(os.path.join(prop_dir, 'raw'))
    os.symlink(mock_spb_raw_run, os.path.join(prop_dir, 'raw', 'r0238'))

    # Set up proc
    os.makedirs(os.path.join(prop_dir, 'proc'))
    os.symlink(mock_spb_proc_run, os.path.join(prop_dir, 'proc', 'r0238'))

    with mock.patch('karabo_data.read_machinery.DATA_ROOT_DIR', str(tmpdir)):
        # With integers
        run = open_run(proposal=2012, run=238)
        paths = {f.filename for f in run.files}

        assert paths
        for path in paths:
            assert '/raw/' in path

        # With strings
        run = open_run(proposal='2012', run='238')
        assert {f.filename for f in run.files} == paths

        # Proc folder
        run = open_run(proposal=2012, run=238, data='proc')

        proc_paths = {f.filename for f in run.files}
        assert proc_paths
        for path in proc_paths:
            assert '/raw/' not in path

        # Run that doesn't exist
        with pytest.raises(Exception):
            open_run(proposal=2012, run=999)
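
A minimal usage sketch of the call this test exercises (same proposal/run numbers as in the test; raw data is the default, data='proc' selects the corrected data):

from karabo_data import open_run

run = open_run(proposal=2012, run=238)                    # raw data (default)
proc_run = open_run(proposal=2012, run=238, data='proc')  # corrected data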
Example #2
    def find_run(self, target: TrainSet):
        # karabo_data requires the explicit creation of run objects. For
        # now, this code does not support TrainSets spanning multiple
        # runs, but this should be possible with only minor performance
        # impact through proper caching.

        # First, we hope to locate the run by checking the first entry
        # against already loaded runs
        for run_id, train_ids in self.runs.items():
            if target.train_ids[0] in train_ids:
                return karabo_data.open_run(proposal=self.proposal, run=run_id)

        # If unsuccessful, load more runs into the cache.

        if self.path is None:
            run = self._open_run(1)

            if target.train_ids[0] in self.runs[1]:
                return run

        for entry in listdir(self.path):
            run_id = int(entry[1:])

            if run_id not in self.runs:
                run = self._open_run(run_id)

                if target.train_ids[0] in self.runs[run_id]:
                    return run

        raise ValueError('unable to locate run ID for TrainSet')
Example #3
    def _open_run(self, run_id):
        run = karabo_data.open_run(proposal=self.proposal, run=run_id)
        self.runs[run_id] = frozenset(run.train_ids)

        base = dirname(run.files[0].filename)
        self.path = base[:base.rfind('/')]

        return run
Example #4
def process_intra_train(job):
    '''Aggregate DSSC data (chunked, to fit into memory) for a single module.
    Averages over all trains, but keeps all pulses.
    Designed for the multiprocessing module - expects a job dictionary with the following keys:
      proposal : (int) proposal number
      run_nr : (int) run number
      module : (int) DSSC module to process
      chunksize : (int) number of trains to process simultaneously
      fpt : (int) frames per train
      maxframes : (int, optional) cap on the number of frames kept per train
    '''
    proposal = job['proposal']
    run_nr = job['run_nr']
    module = job['module']
    chunksize = job['chunksize']
    fpt = job['fpt']
    maxframes = job.get('maxframes', None)  # optional

    sourcename = f'SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf'
    collection = kd.open_run(proposal, run_nr, include=f'*DSSC{module:02d}*')

    fpt = min(fpt, maxframes) if maxframes is not None else fpt
    dims = ['pulse', 'x', 'y']
    coords = {'pulse': np.arange(fpt, dtype=int)}
    shape = [fpt, 128, 512]
    module_data = xr.DataArray(np.zeros(shape, dtype=float),
                               dims=dims,
                               coords=coords)
    module_data = module_data.to_dataset(name='image')
    module_data['sum_count'] = xr.DataArray(np.zeros(fpt, dtype=int),
                                            dims=['pulse'])

    ntrains = len(collection.train_ids)
    chunks = np.arange(ntrains, step=chunksize)
    if module == 15:
        pbar = tqdm(total=len(chunks))
    for start_index in chunks:
        sel = collection.select_trains(kd.by_index[start_index:start_index +
                                                   chunksize])
        data = load_chunk_data(sel, sourcename, maxframes)
        data = data.to_dataset(name='image')

        data['sum_count'] = xr.full_like(data.image[..., 0, 0], fill_value=1)
        data = data.sum('trainId')

        for var in ['image', 'sum_count']:
            # concatenating and using the sum() method automatically takes care of dtype casting if necessary
            module_data[var] = xr.concat([module_data[var], data[var]],
                                         dim='tmp').sum('tmp')
        if module == 15:
            pbar.update(1)

    module_data['image'] = module_data['image'] / module_data.sum_count
    return module_data
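
process_intra_train is written to be fanned out over modules with the standard multiprocessing module. A sketch of such a driver, with hypothetical proposal/run/frame numbers (only the key names come from the docstring above):

import multiprocessing

# Placeholder values for illustration only.
jobs = [dict(proposal=2212, run_nr=235, module=m, chunksize=512, fpt=75)
        for m in range(16)]

with multiprocessing.Pool(16) as pool:
    results = pool.map(process_intra_train, jobs)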
Example #5
File: proc2.py Project: sdunc/xfel
def save_nws_trace(run_number):
    '''
    Save the normalized, wrapped, summed digitizer trace for a run to an HDF5 file.
    '''
    proposal_number = 2318  # The proposal number for this beamtime
    try:
        r = open_run(proposal=proposal_number, run=run_number)
        print("Opened run: " + str(run_number) + "\tfor proposal: " +
              str(proposal_number))
    except Exception:
        print("Run: " + str(run_number) + " not found, exiting")
        return
    t_zero = 18000  # train time offset unit: ns/2
    chunk_size = 1

    train_IDs = r.train_ids
    number_of_trains = len(train_IDs)
    # determine the pulses per train
    intensities_0 = r.get_array('SA3_XTD10_XGM/XGM/DOOCS:output',
                                'data.intensitySa3TD')[0]
    for i in range(len(intensities_0)):
        if intensities_0[i] == 1.0:
            pulses_per_train = i
            print("Pulses per train: " + str(pulses_per_train))
            break
    pulse_time = 24 * 1760
    train_time = pulse_time * pulses_per_train
    sum_trace = 0
    for i in range(number_of_trains):
        sel = r.select_trains(by_index[i:i + chunk_size])
        trace_by_train = np.array(sel.get_array(
            'SQS_DIGITIZER_UTC1/ADC/1:network',
            'digitizers.channel_1_A.raw.samples',
            None,
            roi=by_index[t_zero:t_zero + train_time]),
                                  dtype=np.float64)[0]
        trace_by_train *= (-1)
        wrapped_trace = wrap_raw_trace(trace_by_train, pulses_per_train,
                                       pulse_time)
        sum_trace += wrapped_trace
        #plt.plot(sum_trace)
        #plt.show()
        #plt.pause(0.005)
    normalized_sum_trace = normalize_wrapped_trace(sum_trace)
    local_dir = os.path.dirname(__file__)  # __file__ itself, not the string '__file__'
    target_file = os.path.join(local_dir,
                               'run_' + str(run_number) + '_proc2.h5')
    f = h5.File(target_file, 'a')
    f.create_dataset("normalized_sum_trace", data=normalized_sum_trace)
    f.close()
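
The pulses-per-train scan above relies on unused slots of data.intensitySa3TD reading exactly 1.0. A vectorized sketch of the same check, under that same assumption:

import numpy as np

# intensities_0 is the first train's intensitySa3TD array, as in the function above.
fill_mask = np.asarray(intensities_0) == 1.0
pulses_per_train = int(np.argmax(fill_mask)) if fill_mask.any() else len(intensities_0)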
Example #6
File: proc2.py Project: sdunc/xfel
def save_photon_energy(run_number):
    '''
    WIP: plot the photon-energy readback for a run.
    '''
    proposal_number = 2318  # The proposal number for this beamtime
    try:
        r = open_run(proposal=proposal_number, run=run_number)
        print("Opened run: " + str(run_number) + "\tfor proposal: " +
              str(proposal_number))
    except Exception:
        print("Run: " + str(run_number) + " not found, exiting")
        return
    # get the photon energy of the run
    # units are eV, rounded to 3 decimal places
    # the Germans had a try except for this, I could add that...
    #hv = round(float(r.get_array('SA3_XTD10_UND/DOOCS/PHOTON_ENERGY', 'actualPosition.value')[0]), 3)
    #print("Photon Energy for run "+str(run_number)+": "+str(hv)+ "eV")

    hv = r.get_array('SA3_XTD10_UND/DOOCS/PHOTON_ENERGY',
                     'actualPosition.value')
    plt.plot(hv)
    plt.show()
    #print(hv)
    return
Example #7
def process_dssc_module(job):
    '''Aggregate DSSC data (chunked, to fit into memory) for a single module.
    Groups by "scan_variable" in the given scanfile - use a dummy scan_variable to average
    over all trains. This implies that only trains found in the scanfile are considered.
    Designed for the multiprocessing module - expects a job dictionary with the following keys:
      proposal : (int) proposal number
      run_nr : (int) run number
      module : (int) DSSC module to process
      chunksize : (int) number of trains to process simultaneously
      scanfile : (str) name of hdf5 file with xarray.DataArray containing the scan variable and trainIds
      framepattern : (list of str, optional) names for the (possibly repeating) intra-train pulses. See split_dssc_data
      maskfile : (str, optional) name of hdf5 file with boolean xarray.DataArray to select/reject trains and pulses
    '''
    proposal = job['proposal']
    run_nr = job['run_nr']
    module = job['module']
    chunksize = job['chunksize']
    scanfile = job['scanfile']
    framepattern = job.get('framepattern', ['image'])
    maskfile = job.get('maskfile', None)

    sourcename = f'SCS_DET_DSSC1M-1/DET/{module}CH0:xtdf'

    collection = kd.open_run(proposal, run_nr, include=f'*DSSC{module:02d}*')

    ntrains = len(collection.train_ids)

    # read preprocessed scan variable from file - selection and (possibly) rounding already done.
    scan = xr.open_dataarray(scanfile, 'data', autoclose=True)

    # read binary pulse/train mask - e.g. from XGM thresholding
    if maskfile is not None:
        pulsemask = xr.open_dataarray(maskfile, 'data', autoclose=True)
    else:
        pulsemask = None

    module_data = prepare_module_empty(scan, framepattern)
    chunks = np.arange(ntrains, step=chunksize)
    if module == 15:
        # quick and dirty progress bar
        pbar = tqdm(total=len(chunks))
    for start_index in chunks:
        sel = collection.select_trains(kd.by_index[start_index:start_index +
                                                   chunksize])
        nframes = sel.detector_info(sourcename)['total_frames']
        if nframes > 0:  # some chunks have no DSSC data at all
            data = load_chunk_data(sel, sourcename)
            sum_count = xr.full_like(data[..., 0, 0], fill_value=1)
            if pulsemask is not None:
                data = data.where(pulsemask)
                sum_count = sum_count.where(pulsemask)

            data = split_frames(data, framepattern)
            sum_count = split_frames(sum_count,
                                     framepattern,
                                     prefix='sum_count_')
            data = xr.merge([data, sum_count])

            # aligns on trainId, drops non-matching trains
            data['scan_variable'] = scan
            data = data.groupby('scan_variable').sum('trainId')
            module_data = merge_chunk_data(module_data, data, framepattern)
        if module == 15:
            pbar.update(1)

    for name in framepattern:
        module_data[name] = module_data[name] / module_data['sum_count_' + name]
    return module_data
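
The job dictionary for process_dssc_module is plain data as well. An illustrative sketch (file names, run number, and frame pattern are placeholders, not taken from the project):

# Placeholder values for illustration only.
job = dict(
    proposal=2212,
    run_nr=235,
    module=3,
    chunksize=512,
    scanfile='scan_variable_r0235.h5',
    framepattern=['pumped', 'unpumped'],
    maskfile=None,   # or an hdf5 file with a boolean pulse/train mask
)
module_data = process_dssc_module(job)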
Example #8
def load_dssc_info(proposal, run_nr):
    '''Loads the run data for DSSC module 0 (this is hardcoded) and
    returns its detector_info dictionary'''
    dssc_module = kd.open_run(proposal, run_nr, include='*DSSC00*')
    dssc_info = dssc_module.detector_info('SCS_DET_DSSC1M-1/DET/0CH0:xtdf')
    return dssc_info
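
A typical use of this helper is to read the frames-per-train value that the chunked jobs above expect. The key names below match how detector_info is used elsewhere in this code, but treat them as an assumption for other karabo_data versions:

# Hypothetical proposal/run numbers.
info = load_dssc_info(2212, 235)
fpt = info['frames_per_train']
print(info['total_frames'], fpt)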
Example #9
File: proc2.py Project: sdunc/xfel
def save_peaks_per_pulse(run_number, starting_index, trains):
    '''
    Find TOF peaks in the digitizer trace for `trains` trains starting at
    `starting_index`, group them by pulse, and save them to an HDF5 file.
    '''
    proposal_number = 2318  # The proposal number for this beamtime
    # see: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html
    prominence = 100  # Prominence is an arbitrary threshold used to select peaks. Edwin used 100.
    wlen = 20  # 'wlen' or window length helps speed up peak finding algorithm, see above link.
    # A run is composed of many(!) trains
    # a train is composed of many pulses
    # a pulse is composed of many photons
    every_x_pulse_in_train = 24  # no units, an amount
    t_min_period = 1760  # t_min_period is when pulses are in consecutive bunches. unit: ns/2
    t_zero = 18000  # train time offset unit: ns/2
    bin_size = 1  # unit for tof spectrum unit: ns/2
    pulse_time = every_x_pulse_in_train * t_min_period  # time for one pulse unit: ns/2

    try:
        r = open_run(proposal=proposal_number, run=run_number)
        print("Opened run: " + str(run_number) + "\tfor proposal: " +
              str(proposal_number))
    except Exception:
        print("Run: " + str(run_number) + " not found, exiting")
        return

    local_dir = os.path.dirname(__file__)  # __file__ itself, not the string '__file__'
    target_file = os.path.join(local_dir,
                               'run_' + str(run_number) + '_proc2.h5')

    # get all IDs in train
    train_IDs = r.train_ids
    number_of_trains = len(train_IDs)
    print(number_of_trains)

    # determine # of pulses per train, break loop once found
    intensities_0 = r.get_array('SA3_XTD10_XGM/XGM/DOOCS:output',
                                'data.intensitySa3TD')[0]
    for i in range(len(intensities_0)):
        if intensities_0[i] == 1.0:
            pulses_per_train = i
            print("Pulses per train: " + str(pulses_per_train))
            break

    # calculate constants for tof spectrum
    train_time = pulse_time * pulses_per_train  # time for the train unit: ns/2
    number_of_pulses = number_of_trains * pulses_per_train  # number of pulses in run

    # create empty array
    tof_count_per_pulse = []

    # how many traces (trains) to load into data at once
    # right now this code will only work with chunk_size = 1
    # which is really slow, but is needed for ions per pulse!
    # see Todo^
    chunk_size = 1
    f = h5.File(target_file, 'a')
    try:
        # create a group for all pulses
        f.create_group(
            '/tof_peaks_per_pulse'
        )  #, (number_of_pulses,range(0, pulse_time-1, bin_size)), dtype='int8', maxshape=(None,))
        f.close()
    except Exception:
        print("group already made!")
        f.close()
    # iterate over each train in the run to avoid overallocating memory

    for i in range(starting_index, starting_index + trains):
        if i >= number_of_trains:  # valid train indices run 0 .. number_of_trains - 1
            # all done
            return
        else:
            sel = r.select_trains(by_index[i:i + chunk_size])
            trace_by_train = np.array(sel.get_array(
                'SQS_DIGITIZER_UTC1/ADC/1:network',
                'digitizers.channel_1_A.raw.samples',
                None,
                roi=by_index[t_zero:t_zero + train_time]),
                                      dtype=np.float64)[0]
            trace_by_train *= (-1)

            peaks = sci.signal.find_peaks(trace_by_train,
                                          prominence=prominence,
                                          wlen=wlen)
            # peaks[0]: indices into trace_by_train where peaks were found
            # peaks[1]: dictionary of peak properties

            # Find out which pulse each peak belongs to by dividing the peak's
            # index within the trace by the duration of one pulse; the result
            # has the same shape as peaks[0], with each entry naming the pulse
            # that the corresponding peak falls in.
            pulse_of_peaks = list(np.floor(peaks[0] / pulse_time).astype(int))

            # get the number of peaks per pulse; this many elements will be
            # removed from the peak list and attributed to that pulse
            n_peaks_per_pulse = np.bincount(pulse_of_peaks,
                                            minlength=pulses_per_train)

            # Wrap each peak's time of flight into its own pulse window by
            # taking its index in the trace modulo the pulse duration.
            t_peaks = np.mod(peaks[0], pulse_time)
            # make all_peaks a list
            all_peaks = t_peaks.tolist()

            # next, split the peaks up on a pulse-by-pulse basis
            f = h5.File(target_file, 'a')
            counter = 0
            for peak_count in n_peaks_per_pulse:
                abs_pulse_num = pulses_per_train * i + counter
                peaks_in_pulse = []
                for p in range(peak_count):
                    # pop the next peak belonging to this pulse
                    peaks_in_pulse.append(all_peaks.pop(0))
                peaks_in_pulse = np.array(peaks_in_pulse)
                group = f.get('tof_peaks_per_pulse')
                counter += 1
                # zero-pad the pulse id so all dataset names have the same width
                zeros = len(str(number_of_trains)) - len(str(abs_pulse_num))
                added_zeros = "0" * zeros
                dataset_name = "pulse_" + str(added_zeros) + str(abs_pulse_num)
                try:
                    # save as unsigned int 16 to save space!
                    group.create_dataset(str(dataset_name),
                                         data=peaks_in_pulse,
                                         dtype='uint16')
                except Exception:
                    print(
                        "Error! that pulse is already saved, check parameters."
                    )
                    print("i=", i)
                    pass
            f.close()
    return
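
The peak-to-pulse bookkeeping above reduces to an integer division and a modulus. A small self-contained illustration with made-up numbers:

import numpy as np

peak_indices = np.array([100, 150, 4300, 9000])   # made-up peak positions in one trace
pulse_time = 4224                                  # illustrative, not the real 24 * 1760

pulse_of_peaks = peak_indices // pulse_time                   # -> [0, 0, 1, 2]
n_peaks_per_pulse = np.bincount(pulse_of_peaks, minlength=3)  # -> [2, 1, 1]
t_peaks = np.mod(peak_indices, pulse_time)                    # peak time within its pulse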
Example #10
File: proc2.py Project: sdunc/xfel
def save_intensity(run_number):
    '''
    Inputs:
        run_number: the number of the run being reduced
                    (see the comment on process_raw_run for more info)
    Output:
        the proc file is created if it does not exist; if it already exists,
        the intensity dataset is simply added to it.
    Note:
        This is part of splitting up process_raw_run to get around the
        time restrictions on the cluster.
    '''
    proposal_number = 2318  # The proposal number for this beamtime
    try:
        r = open_run(proposal=proposal_number, run=run_number)
        print("Opened run: " + str(run_number) + "\tfor proposal: " +
              str(proposal_number))
    except Exception:
        print("Run: " + str(run_number) + " not found, exiting")
        return

    train_IDs = r.train_ids
    number_of_trains = len(train_IDs)
    print(number_of_trains)

    # determine the pulses per train
    intensities_0 = r.get_array('SA3_XTD10_XGM/XGM/DOOCS:output',
                                'data.intensitySa3TD')[0]
    for i in range(len(intensities_0)):
        if intensities_0[i] == 1.0:
            pulses_per_train = i
            print("Pulses per train: " + str(pulses_per_train))
            break

    # 2D intensity array: the region of interest is each pulse x each train,
    # which is also the shape of the array.
    # Get the intensity associated with each pulse within each train.
    # XGM is the X-Ray Gas Monitor: it has two beam intensity monitors
    # (measuring intensity along x and y) and two beam position monitors
    # (determining the x and y beam positions).
    # The XGM data is a 2D array with shape (n trains, 1000).
    intensity_list = np.array(
        r.get_array('SA3_XTD10_XGM/XGM/DOOCS:output',
                    'data.intensitySa3TD',
                    roi=by_index[0:pulses_per_train])[0:number_of_trains])
    # and flatten the intensity list into a 1D array, so it has the same shape as peaks_in_pulse
    # that is, total number of pulses

    intensity_per_pulse = intensity_list.flatten()
    print(max(intensity_per_pulse))
    print(min(intensity_per_pulse))
    plt.hist(intensity_per_pulse, bins=500)
    plt.show()
    local_dir = os.path.dirname(__file__)  # __file__ itself, not the string '__file__'
    target_file = os.path.join(local_dir,
                               'run_' + str(run_number) + '_proc2.h5')
    f = h5.File(target_file, 'a')
    try:
        f.create_dataset("intensity_per_pulse", data=intensity_per_pulse)
        f.close()
    except Exception:
        print("Int already saved!")
        f.close()
        return
    return
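
The XGM array read here has shape (n trains, 1000); the roi keeps only the first pulses_per_train columns before flattening, so intensity_per_pulse lines up with the per-pulse datasets written by save_peaks_per_pulse. A toy shape check with placeholder sizes:

import numpy as np

n_trains, pulses_per_train = 5, 30          # placeholder sizes
xgm = np.random.rand(n_trains, 1000)        # stands in for data.intensitySa3TD
intensity_per_pulse = xgm[:, :pulses_per_train].flatten()
assert intensity_per_pulse.shape == (n_trains * pulses_per_train,)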